From 1c9419098e36de9006cd22a9b2d51b15012f1e34 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 25 Jan 2025 15:36:51 +0000 Subject: [PATCH] [CostModel][X86] getShuffleCost - shuffles with only one defined element are always cheap If we're just moving a single element around inside a 128-bit lane (probably as an alternative to extracting it), we can assume this is cheap as a single PSRLDQ/PSHUFD/SHUFPS. I've got the horrid feeling we're moving towards matching all SSE shuffle patterns inside the cost model, but I'm going to do my best to avoid this for now :| --- .../lib/Target/X86/X86TargetTransformInfo.cpp | 13 ++++- llvm/test/Analysis/CostModel/X86/reduction.ll | 4 +- .../test/Transforms/PhaseOrdering/X86/hadd.ll | 21 ++++---- .../test/Transforms/PhaseOrdering/X86/hsub.ll | 21 ++++---- .../X86/extract-binop-inseltpoison.ll | 48 ++++++------------- .../VectorCombine/X86/extract-binop.ll | 48 ++++++------------- .../VectorCombine/X86/load-inseltpoison.ll | 47 ++++++------------ 7 files changed, 76 insertions(+), 126 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index d3c923a76d074c1..8d49d013b1a716e 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1565,19 +1565,25 @@ InstructionCost X86TTIImpl::getShuffleCost( // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector // permutation. + // Attempt to detect a shuffle mask with a single defined element. bool IsInLaneShuffle = false; + bool IsSingleElementMask = false; if (BaseTp->getPrimitiveSizeInBits() > 0 && (BaseTp->getPrimitiveSizeInBits() % 128) == 0 && BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && Mask.size() == BaseTp->getElementCount().getKnownMinValue()) { unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128; unsigned NumEltsPerLane = Mask.size() / NumLanes; - if ((Mask.size() % NumLanes) == 0) + if ((Mask.size() % NumLanes) == 0) { IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) { return P.value() == PoisonMaskElem || ((P.value() % Mask.size()) / NumEltsPerLane) == (P.index() / NumEltsPerLane); }); + IsSingleElementMask = (Mask.size() - 1) == count_if(Mask, [](int M) { + return M == PoisonMaskElem; + }); + } } // Treat shuffles as . @@ -1791,6 +1797,11 @@ InstructionCost X86TTIImpl::getShuffleCost( return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); } + // If we're just moving a single element around (probably as an alternative to + // extracting it), we can assume this is cheap. + if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask) + return TTI::TCC_Basic; + static const CostTblEntry AVX512VBMIShuffleTbl[] = { {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll index d7cf8e6cb8905b8..5ff3920c638749d 100644 --- a/llvm/test/Analysis/CostModel/X86/reduction.ll +++ b/llvm/test/Analysis/CostModel/X86/reduction.ll @@ -638,7 +638,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r @@ -1133,7 +1133,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll index 4b1234fda0e18bb..056d9d1fba1415b 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll @@ -59,18 +59,15 @@ define <8 x i16> @add_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: @add_v8i16_u1234567( -; SSE2-NEXT: [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2 -; SSE2-NEXT: [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3 -; SSE2-NEXT: [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4 -; SSE2-NEXT: [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5 -; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6 -; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7 -; SSE2-NEXT: [[A23:%.*]] = add i16 [[A2]], [[A3]] -; SSE2-NEXT: [[A45:%.*]] = add i16 [[A4]], [[A5]] -; SSE2-NEXT: [[A67:%.*]] = add i16 [[A6]], [[A7]] -; SSE2-NEXT: [[HADD1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1 -; SSE2-NEXT: [[HADD2:%.*]] = insertelement <8 x i16> [[HADD1]], i16 [[A45]], i64 2 -; SSE2-NEXT: [[HADD3:%.*]] = insertelement <8 x i16> [[HADD2]], i16 [[A67]], i64 3 +; SSE2-NEXT: [[SHIFT2:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = add <8 x i16> [[A]], [[SHIFT2]] +; SSE2-NEXT: [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = add <8 x i16> [[A]], [[SHIFT3]] +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +; SSE2-NEXT: [[HADD1:%.*]] = add <8 x i16> [[TMP7]], [[TMP4]] +; SSE2-NEXT: [[HADD2:%.*]] = shufflevector <8 x i16> [[HADD1]], <8 x i16> [[TMP5]], <8 x i32> +; SSE2-NEXT: [[HADD3:%.*]] = shufflevector <8 x i16> [[HADD2]], <8 x i16> [[TMP6]], <8 x i32> ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> ; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> ; SSE2-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll index c9cba0a4cc0ff22..572ec9efafe1ae0 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll @@ -59,18 +59,15 @@ define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: @sub_v8i16_u1234567( -; SSE2-NEXT: [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2 -; SSE2-NEXT: [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3 -; SSE2-NEXT: [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4 -; SSE2-NEXT: [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5 -; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6 -; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7 -; SSE2-NEXT: [[A23:%.*]] = sub i16 [[A2]], [[A3]] -; SSE2-NEXT: [[A45:%.*]] = sub i16 [[A4]], [[A5]] -; SSE2-NEXT: [[A67:%.*]] = sub i16 [[A6]], [[A7]] -; SSE2-NEXT: [[HSUB1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1 -; SSE2-NEXT: [[HSUB2:%.*]] = insertelement <8 x i16> [[HSUB1]], i16 [[A45]], i64 2 -; SSE2-NEXT: [[HSUB3:%.*]] = insertelement <8 x i16> [[HSUB2]], i16 [[A67]], i64 3 +; SSE2-NEXT: [[SHIFT2:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[A]], [[SHIFT2]] +; SSE2-NEXT: [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[A]], [[SHIFT3]] +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +; SSE2-NEXT: [[HSUB1:%.*]] = sub <8 x i16> [[TMP7]], [[TMP4]] +; SSE2-NEXT: [[HSUB2:%.*]] = shufflevector <8 x i16> [[HSUB1]], <8 x i16> [[TMP5]], <8 x i32> +; SSE2-NEXT: [[HSUB3:%.*]] = shufflevector <8 x i16> [[HSUB2]], <8 x i16> [[TMP6]], <8 x i32> ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> ; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> ; SSE2-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll index f3b7f7b72ee4252..d369279c15db44a 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll @@ -252,17 +252,11 @@ define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) { } define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) { -; SSE-LABEL: @ext0_ext1_add( -; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0 -; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1 -; SSE-NEXT: [[R:%.*]] = add nuw i8 [[E0]], [[E1]] -; SSE-NEXT: ret i8 [[R]] -; -; AVX-LABEL: @ext0_ext1_add( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> -; AVX-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]] -; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0 -; AVX-NEXT: ret i8 [[R]] +; CHECK-LABEL: @ext0_ext1_add( +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]] +; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0 +; CHECK-NEXT: ret i8 [[R]] ; %e0 = extractelement <16 x i8> %x, i32 0 %e1 = extractelement <16 x i8> %y, i32 1 @@ -271,17 +265,11 @@ define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) { } define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) { -; SSE-LABEL: @ext5_ext0_add( -; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5 -; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0 -; SSE-NEXT: [[R:%.*]] = sub nsw i8 [[E0]], [[E1]] -; SSE-NEXT: ret i8 [[R]] -; -; AVX-LABEL: @ext5_ext0_add( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> -; AVX-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]] -; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0 -; AVX-NEXT: ret i8 [[R]] +; CHECK-LABEL: @ext5_ext0_add( +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0 +; CHECK-NEXT: ret i8 [[R]] ; %e0 = extractelement <16 x i8> %x, i32 5 %e1 = extractelement <16 x i8> %y, i32 0 @@ -290,17 +278,11 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) { } define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) { -; SSE-LABEL: @ext1_ext6_add( -; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1 -; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6 -; SSE-NEXT: [[R:%.*]] = and i8 [[E0]], [[E1]] -; SSE-NEXT: ret i8 [[R]] -; -; AVX-LABEL: @ext1_ext6_add( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> -; AVX-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]] -; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1 -; AVX-NEXT: ret i8 [[R]] +; CHECK-LABEL: @ext1_ext6_add( +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]] +; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1 +; CHECK-NEXT: ret i8 [[R]] ; %e0 = extractelement <16 x i8> %x, i32 1 %e1 = extractelement <16 x i8> %y, i32 6 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll index c125b73fccddf92..7cbe1c6cec90629 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll @@ -252,17 +252,11 @@ define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) { } define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) { -; SSE-LABEL: @ext0_ext1_add( -; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0 -; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1 -; SSE-NEXT: [[R:%.*]] = add nuw i8 [[E0]], [[E1]] -; SSE-NEXT: ret i8 [[R]] -; -; AVX-LABEL: @ext0_ext1_add( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> -; AVX-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]] -; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0 -; AVX-NEXT: ret i8 [[R]] +; CHECK-LABEL: @ext0_ext1_add( +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]] +; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0 +; CHECK-NEXT: ret i8 [[R]] ; %e0 = extractelement <16 x i8> %x, i32 0 %e1 = extractelement <16 x i8> %y, i32 1 @@ -271,17 +265,11 @@ define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) { } define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) { -; SSE-LABEL: @ext5_ext0_add( -; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5 -; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0 -; SSE-NEXT: [[R:%.*]] = sub nsw i8 [[E0]], [[E1]] -; SSE-NEXT: ret i8 [[R]] -; -; AVX-LABEL: @ext5_ext0_add( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> -; AVX-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]] -; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0 -; AVX-NEXT: ret i8 [[R]] +; CHECK-LABEL: @ext5_ext0_add( +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0 +; CHECK-NEXT: ret i8 [[R]] ; %e0 = extractelement <16 x i8> %x, i32 5 %e1 = extractelement <16 x i8> %y, i32 0 @@ -290,17 +278,11 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) { } define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) { -; SSE-LABEL: @ext1_ext6_add( -; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1 -; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6 -; SSE-NEXT: [[R:%.*]] = and i8 [[E0]], [[E1]] -; SSE-NEXT: ret i8 [[R]] -; -; AVX-LABEL: @ext1_ext6_add( -; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> -; AVX-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]] -; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1 -; AVX-NEXT: ret i8 [[R]] +; CHECK-LABEL: @ext1_ext6_add( +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]] +; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1 +; CHECK-NEXT: ret i8 [[R]] ; %e0 = extractelement <16 x i8> %x, i32 1 %e1 = extractelement <16 x i8> %y, i32 6 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll index 2db1e21b3e95a71..f57583a3f53a642 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 -; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -265,16 +265,10 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) % ; Can't safely load the offset vector, but can load+shuffle if it is profitable. define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(17) %p) nofree nosync { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 2 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 16 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1 %s = load i16, ptr %gep, align 2 @@ -285,16 +279,10 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable ; Verify that alignment of the new load is not over-specified. define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 dereferenceable(16) %p) nofree nosync { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 8 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2 +; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1 %s = load i16, ptr %gep, align 8 @@ -603,17 +591,10 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 derefer ; Can't safely load the offset vector, but can load+shuffle if it is profitable. define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(ptr align 1 dereferenceable(16) %p) nofree nosync { -; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, ptr [[P:%.*]], i64 1 -; SSE2-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, ptr [[GEP]], i32 0, i32 0 -; SSE2-NEXT: [[S:%.*]] = load i16, ptr [[TMP1]], align 8 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 4 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <2 x i16>, ptr %p, i64 1 %l = load <2 x i16>, ptr %gep, align 8