From 1c9419098e36de9006cd22a9b2d51b15012f1e34 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 25 Jan 2025 15:36:51 +0000
Subject: [PATCH] [CostModel][X86] getShuffleCost - shuffles with only one
 defined element are always cheap

If we're just moving a single element around inside a 128-bit lane (probably as an alternative to extracting it), we can assume this is cheap as a single PSRLDQ/PSHUFD/SHUFPS.

I've got the horrid feeling we're moving towards matching all SSE shuffle patterns inside the cost model, but I'm going to do my best to avoid this for now :|
---
 .../lib/Target/X86/X86TargetTransformInfo.cpp | 13 ++++-
 llvm/test/Analysis/CostModel/X86/reduction.ll |  4 +-
 .../test/Transforms/PhaseOrdering/X86/hadd.ll | 21 ++++----
 .../test/Transforms/PhaseOrdering/X86/hsub.ll | 21 ++++----
 .../X86/extract-binop-inseltpoison.ll         | 48 ++++++-------------
 .../VectorCombine/X86/extract-binop.ll        | 48 ++++++-------------
 .../VectorCombine/X86/load-inseltpoison.ll    | 47 ++++++------------
 7 files changed, 76 insertions(+), 126 deletions(-)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d3c923a76d074c1..8d49d013b1a716e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1565,19 +1565,25 @@ InstructionCost X86TTIImpl::getShuffleCost(
 
   // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
   // permutation.
+  // Attempt to detect a shuffle mask with a single defined element.
   bool IsInLaneShuffle = false;
+  bool IsSingleElementMask = false;
   if (BaseTp->getPrimitiveSizeInBits() > 0 &&
       (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
       BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
       Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
     unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
     unsigned NumEltsPerLane = Mask.size() / NumLanes;
-    if ((Mask.size() % NumLanes) == 0)
+    if ((Mask.size() % NumLanes) == 0) {
       IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
         return P.value() == PoisonMaskElem ||
                ((P.value() % Mask.size()) / NumEltsPerLane) ==
                    (P.index() / NumEltsPerLane);
       });
+      IsSingleElementMask = (Mask.size() - 1) == count_if(Mask, [](int M) {
+                              return M == PoisonMaskElem;
+                            });
+    }
   }
 
   // Treat <X x bfloat> shuffles as <X x half>.
@@ -1791,6 +1797,11 @@ InstructionCost X86TTIImpl::getShuffleCost(
     return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
   }
 
+  // If we're just moving a single element around (probably as an alternative to
+  // extracting it), we can assume this is cheap.
+  if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
+    return TTI::TCC_Basic;
+
   static const CostTblEntry AVX512VBMIShuffleTbl[] = {
       {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
       {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll
index d7cf8e6cb8905b8..5ff3920c638749d 100644
--- a/llvm/test/Analysis/CostModel/X86/reduction.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduction.ll
@@ -638,7 +638,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
@@ -1133,7 +1133,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 4b1234fda0e18bb..056d9d1fba1415b 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -59,18 +59,15 @@ define <8 x i16> @add_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
 
 define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @add_v8i16_u1234567(
-; SSE2-NEXT:    [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
-; SSE2-NEXT:    [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
-; SSE2-NEXT:    [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
-; SSE2-NEXT:    [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
-; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
-; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
-; SSE2-NEXT:    [[A23:%.*]] = add i16 [[A2]], [[A3]]
-; SSE2-NEXT:    [[A45:%.*]] = add i16 [[A4]], [[A5]]
-; SSE2-NEXT:    [[A67:%.*]] = add i16 [[A6]], [[A7]]
-; SSE2-NEXT:    [[HADD1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
-; SSE2-NEXT:    [[HADD2:%.*]] = insertelement <8 x i16> [[HADD1]], i16 [[A45]], i64 2
-; SSE2-NEXT:    [[HADD3:%.*]] = insertelement <8 x i16> [[HADD2]], i16 [[A67]], i64 3
+; SSE2-NEXT:    [[SHIFT2:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[A]], [[SHIFT2]]
+; SSE2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[A]], [[SHIFT3]]
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HADD1:%.*]] = add <8 x i16> [[TMP7]], [[TMP4]]
+; SSE2-NEXT:    [[HADD2:%.*]] = shufflevector <8 x i16> [[HADD1]], <8 x i16> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HADD3:%.*]] = shufflevector <8 x i16> [[HADD2]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index c9cba0a4cc0ff22..572ec9efafe1ae0 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -59,18 +59,15 @@ define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
 
 define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @sub_v8i16_u1234567(
-; SSE2-NEXT:    [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
-; SSE2-NEXT:    [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
-; SSE2-NEXT:    [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
-; SSE2-NEXT:    [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
-; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
-; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
-; SSE2-NEXT:    [[A23:%.*]] = sub i16 [[A2]], [[A3]]
-; SSE2-NEXT:    [[A45:%.*]] = sub i16 [[A4]], [[A5]]
-; SSE2-NEXT:    [[A67:%.*]] = sub i16 [[A6]], [[A7]]
-; SSE2-NEXT:    [[HSUB1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
-; SSE2-NEXT:    [[HSUB2:%.*]] = insertelement <8 x i16> [[HSUB1]], i16 [[A45]], i64 2
-; SSE2-NEXT:    [[HSUB3:%.*]] = insertelement <8 x i16> [[HSUB2]], i16 [[A67]], i64 3
+; SSE2-NEXT:    [[SHIFT2:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[A]], [[SHIFT2]]
+; SSE2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[A]], [[SHIFT3]]
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HSUB1:%.*]] = sub <8 x i16> [[TMP7]], [[TMP4]]
+; SSE2-NEXT:    [[HSUB2:%.*]] = shufflevector <8 x i16> [[HSUB1]], <8 x i16> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HSUB3:%.*]] = shufflevector <8 x i16> [[HSUB2]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index f3b7f7b72ee4252..d369279c15db44a 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -252,17 +252,11 @@ define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
 }
 
 define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext0_ext1_add(
-; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
-; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
-; SSE-NEXT:    [[R:%.*]] = add nuw i8 [[E0]], [[E1]]
-; SSE-NEXT:    ret i8 [[R]]
-;
-; AVX-LABEL: @ext0_ext1_add(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
-; AVX-NEXT:    ret i8 [[R]]
+; CHECK-LABEL: @ext0_ext1_add(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    ret i8 [[R]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 0
   %e1 = extractelement <16 x i8> %y, i32 1
@@ -271,17 +265,11 @@ define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
 }
 
 define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext5_ext0_add(
-; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
-; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; SSE-NEXT:    [[R:%.*]] = sub nsw i8 [[E0]], [[E1]]
-; SSE-NEXT:    ret i8 [[R]]
-;
-; AVX-LABEL: @ext5_ext0_add(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
-; AVX-NEXT:    ret i8 [[R]]
+; CHECK-LABEL: @ext5_ext0_add(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
+; CHECK-NEXT:    ret i8 [[R]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 5
   %e1 = extractelement <16 x i8> %y, i32 0
@@ -290,17 +278,11 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
 }
 
 define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext1_ext6_add(
-; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
-; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
-; SSE-NEXT:    [[R:%.*]] = and i8 [[E0]], [[E1]]
-; SSE-NEXT:    ret i8 [[R]]
-;
-; AVX-LABEL: @ext1_ext6_add(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
-; AVX-NEXT:    ret i8 [[R]]
+; CHECK-LABEL: @ext1_ext6_add(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    ret i8 [[R]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 1
   %e1 = extractelement <16 x i8> %y, i32 6
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index c125b73fccddf92..7cbe1c6cec90629 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -252,17 +252,11 @@ define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
 }
 
 define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext0_ext1_add(
-; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
-; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
-; SSE-NEXT:    [[R:%.*]] = add nuw i8 [[E0]], [[E1]]
-; SSE-NEXT:    ret i8 [[R]]
-;
-; AVX-LABEL: @ext0_ext1_add(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
-; AVX-NEXT:    ret i8 [[R]]
+; CHECK-LABEL: @ext0_ext1_add(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    ret i8 [[R]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 0
   %e1 = extractelement <16 x i8> %y, i32 1
@@ -271,17 +265,11 @@ define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
 }
 
 define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext5_ext0_add(
-; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
-; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; SSE-NEXT:    [[R:%.*]] = sub nsw i8 [[E0]], [[E1]]
-; SSE-NEXT:    ret i8 [[R]]
-;
-; AVX-LABEL: @ext5_ext0_add(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
-; AVX-NEXT:    ret i8 [[R]]
+; CHECK-LABEL: @ext5_ext0_add(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
+; CHECK-NEXT:    ret i8 [[R]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 5
   %e1 = extractelement <16 x i8> %y, i32 0
@@ -290,17 +278,11 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
 }
 
 define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext1_ext6_add(
-; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
-; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
-; SSE-NEXT:    [[R:%.*]] = and i8 [[E0]], [[E1]]
-; SSE-NEXT:    ret i8 [[R]]
-;
-; AVX-LABEL: @ext1_ext6_add(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
-; AVX-NEXT:    ret i8 [[R]]
+; CHECK-LABEL: @ext1_ext6_add(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    ret i8 [[R]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 1
   %e1 = extractelement <16 x i8> %y, i32 6
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 2db1e21b3e95a71..f57583a3f53a642 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -265,16 +265,10 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %
 ; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(17) %p) nofree nosync {
-; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
-; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; SSE2-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 2
-; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
-; SSE2-NEXT:    ret <8 x i16> [[R]]
-;
-; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 16
-; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    ret <8 x i16> [[R]]
+; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i16, ptr %gep, align 2
@@ -285,16 +279,10 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable
 ; Verify that alignment of the new load is not over-specified.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 dereferenceable(16) %p) nofree nosync {
-; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
-; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; SSE2-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 8
-; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
-; SSE2-NEXT:    ret <8 x i16> [[R]]
-;
-; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2
-; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    ret <8 x i16> [[R]]
+; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i16, ptr %gep, align 8
@@ -603,17 +591,10 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 derefer
 ; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(ptr align 1 dereferenceable(16) %p) nofree nosync {
-; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
-; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, ptr [[P:%.*]], i64 1
-; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, ptr [[GEP]], i32 0, i32 0
-; SSE2-NEXT:    [[S:%.*]] = load i16, ptr [[TMP1]], align 8
-; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
-; SSE2-NEXT:    ret <8 x i16> [[R]]
-;
-; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 4
-; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    ret <8 x i16> [[R]]
+; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <2 x i16>, ptr %p, i64 1
   %l = load <2 x i16>, ptr %gep, align 8