-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[CostModel][X86] getShuffleCost - shuffles with only one defined element are always cheap #124412
Conversation
…ent are always cheap

If we're just moving a single element around inside a 128-bit lane (probably as an alternative to extracting it), we can assume this is cheap as a single PSRLDQ/PSHUFD/SHUFPS. I've got the horrid feeling we're moving towards matching all SSE shuffle patterns inside the cost model, but I'm going to do my best to avoid this for now :|
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms
Author: Simon Pilgrim (RKSimon)
Changes: If we're just moving a single element around inside a 128-bit lane (probably as an alternative to extracting it), we can assume this is cheap as a single PSRLDQ/PSHUFD/SHUFPS. I've got the horrid feeling we're moving towards matching all SSE shuffle patterns inside the cost model, but I'm going to do my best to avoid this for now :|
Patch is 25.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124412.diff
7 Files Affected:
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d3c923a76d074c..8d49d013b1a716 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1565,19 +1565,25 @@ InstructionCost X86TTIImpl::getShuffleCost(
// Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
// permutation.
+ // Attempt to detect a shuffle mask with a single defined element.
bool IsInLaneShuffle = false;
+ bool IsSingleElementMask = false;
if (BaseTp->getPrimitiveSizeInBits() > 0 &&
(BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
unsigned NumEltsPerLane = Mask.size() / NumLanes;
- if ((Mask.size() % NumLanes) == 0)
+ if ((Mask.size() % NumLanes) == 0) {
IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
return P.value() == PoisonMaskElem ||
((P.value() % Mask.size()) / NumEltsPerLane) ==
(P.index() / NumEltsPerLane);
});
+ IsSingleElementMask = (Mask.size() - 1) == count_if(Mask, [](int M) {
+ return M == PoisonMaskElem;
+ });
+ }
}
// Treat <X x bfloat> shuffles as <X x half>.
@@ -1791,6 +1797,11 @@ InstructionCost X86TTIImpl::getShuffleCost(
return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
}
+ // If we're just moving a single element around (probably as an alternative to
+ // extracting it), we can assume this is cheap.
+ if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
+ return TTI::TCC_Basic;
+
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
{TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
{TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll
index d7cf8e6cb8905b..5ff3920c638749 100644
--- a/llvm/test/Analysis/CostModel/X86/reduction.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduction.ll
@@ -638,7 +638,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
@@ -1133,7 +1133,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 4b1234fda0e18b..056d9d1fba1415 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -59,18 +59,15 @@ define <8 x i16> @add_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @add_v8i16_u1234567(
-; SSE2-NEXT: [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
-; SSE2-NEXT: [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
-; SSE2-NEXT: [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
-; SSE2-NEXT: [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
-; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
-; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
-; SSE2-NEXT: [[A23:%.*]] = add i16 [[A2]], [[A3]]
-; SSE2-NEXT: [[A45:%.*]] = add i16 [[A4]], [[A5]]
-; SSE2-NEXT: [[A67:%.*]] = add i16 [[A6]], [[A7]]
-; SSE2-NEXT: [[HADD1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
-; SSE2-NEXT: [[HADD2:%.*]] = insertelement <8 x i16> [[HADD1]], i16 [[A45]], i64 2
-; SSE2-NEXT: [[HADD3:%.*]] = insertelement <8 x i16> [[HADD2]], i16 [[A67]], i64 3
+; SSE2-NEXT: [[SHIFT2:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = add <8 x i16> [[A]], [[SHIFT2]]
+; SSE2-NEXT: [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT: [[TMP6:%.*]] = add <8 x i16> [[A]], [[SHIFT3]]
+; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HADD1:%.*]] = add <8 x i16> [[TMP7]], [[TMP4]]
+; SSE2-NEXT: [[HADD2:%.*]] = shufflevector <8 x i16> [[HADD1]], <8 x i16> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HADD3:%.*]] = shufflevector <8 x i16> [[HADD2]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index c9cba0a4cc0ff2..572ec9efafe1ae 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -59,18 +59,15 @@ define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @sub_v8i16_u1234567(
-; SSE2-NEXT: [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
-; SSE2-NEXT: [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
-; SSE2-NEXT: [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
-; SSE2-NEXT: [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
-; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
-; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
-; SSE2-NEXT: [[A23:%.*]] = sub i16 [[A2]], [[A3]]
-; SSE2-NEXT: [[A45:%.*]] = sub i16 [[A4]], [[A5]]
-; SSE2-NEXT: [[A67:%.*]] = sub i16 [[A6]], [[A7]]
-; SSE2-NEXT: [[HSUB1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
-; SSE2-NEXT: [[HSUB2:%.*]] = insertelement <8 x i16> [[HSUB1]], i16 [[A45]], i64 2
-; SSE2-NEXT: [[HSUB3:%.*]] = insertelement <8 x i16> [[HSUB2]], i16 [[A67]], i64 3
+; SSE2-NEXT: [[SHIFT2:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[A]], [[SHIFT2]]
+; SSE2-NEXT: [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[A]], [[SHIFT3]]
+; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HSUB1:%.*]] = sub <8 x i16> [[TMP7]], [[TMP4]]
+; SSE2-NEXT: [[HSUB2:%.*]] = shufflevector <8 x i16> [[HSUB1]], <8 x i16> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HSUB3:%.*]] = shufflevector <8 x i16> [[HSUB2]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index f3b7f7b72ee425..d369279c15db44 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -252,17 +252,11 @@ define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext0_ext1_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
-; SSE-NEXT: [[R:%.*]] = add nuw i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext0_ext1_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext0_ext1_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %y, i32 1
@@ -271,17 +265,11 @@ define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext5_ext0_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; SSE-NEXT: [[R:%.*]] = sub nsw i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext5_ext0_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext5_ext0_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 5
%e1 = extractelement <16 x i8> %y, i32 0
@@ -290,17 +278,11 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext1_ext6_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
-; SSE-NEXT: [[R:%.*]] = and i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext1_ext6_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext1_ext6_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 1
%e1 = extractelement <16 x i8> %y, i32 6
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index c125b73fccddf9..7cbe1c6cec9062 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -252,17 +252,11 @@ define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext0_ext1_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
-; SSE-NEXT: [[R:%.*]] = add nuw i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext0_ext1_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext0_ext1_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %y, i32 1
@@ -271,17 +265,11 @@ define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext5_ext0_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; SSE-NEXT: [[R:%.*]] = sub nsw i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext5_ext0_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext5_ext0_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 5
%e1 = extractelement <16 x i8> %y, i32 0
@@ -290,17 +278,11 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext1_ext6_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
-; SSE-NEXT: [[R:%.*]] = and i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext1_ext6_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext1_ext6_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x ...
[truncated]
|
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes: If we're just moving a single element around inside a 128-bit lane (probably as an alternative to extracting it), we can assume this is cheap as a single PSRLDQ/PSHUFD/SHUFPS. I've got the horrid feeling we're moving towards matching all SSE shuffle patterns inside the cost model, but I'm going to do my best to avoid this for now :|
Patch is 25.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124412.diff
7 Files Affected:
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d3c923a76d074c..8d49d013b1a716 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1565,19 +1565,25 @@ InstructionCost X86TTIImpl::getShuffleCost(
// Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
// permutation.
+ // Attempt to detect a shuffle mask with a single defined element.
bool IsInLaneShuffle = false;
+ bool IsSingleElementMask = false;
if (BaseTp->getPrimitiveSizeInBits() > 0 &&
(BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
unsigned NumEltsPerLane = Mask.size() / NumLanes;
- if ((Mask.size() % NumLanes) == 0)
+ if ((Mask.size() % NumLanes) == 0) {
IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
return P.value() == PoisonMaskElem ||
((P.value() % Mask.size()) / NumEltsPerLane) ==
(P.index() / NumEltsPerLane);
});
+ IsSingleElementMask = (Mask.size() - 1) == count_if(Mask, [](int M) {
+ return M == PoisonMaskElem;
+ });
+ }
}
// Treat <X x bfloat> shuffles as <X x half>.
@@ -1791,6 +1797,11 @@ InstructionCost X86TTIImpl::getShuffleCost(
return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
}
+ // If we're just moving a single element around (probably as an alternative to
+ // extracting it), we can assume this is cheap.
+ if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
+ return TTI::TCC_Basic;
+
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
{TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
{TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll
index d7cf8e6cb8905b..5ff3920c638749 100644
--- a/llvm/test/Analysis/CostModel/X86/reduction.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduction.ll
@@ -638,7 +638,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
@@ -1133,7 +1133,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 4b1234fda0e18b..056d9d1fba1415 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -59,18 +59,15 @@ define <8 x i16> @add_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @add_v8i16_u1234567(
-; SSE2-NEXT: [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
-; SSE2-NEXT: [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
-; SSE2-NEXT: [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
-; SSE2-NEXT: [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
-; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
-; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
-; SSE2-NEXT: [[A23:%.*]] = add i16 [[A2]], [[A3]]
-; SSE2-NEXT: [[A45:%.*]] = add i16 [[A4]], [[A5]]
-; SSE2-NEXT: [[A67:%.*]] = add i16 [[A6]], [[A7]]
-; SSE2-NEXT: [[HADD1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
-; SSE2-NEXT: [[HADD2:%.*]] = insertelement <8 x i16> [[HADD1]], i16 [[A45]], i64 2
-; SSE2-NEXT: [[HADD3:%.*]] = insertelement <8 x i16> [[HADD2]], i16 [[A67]], i64 3
+; SSE2-NEXT: [[SHIFT2:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = add <8 x i16> [[A]], [[SHIFT2]]
+; SSE2-NEXT: [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT: [[TMP6:%.*]] = add <8 x i16> [[A]], [[SHIFT3]]
+; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HADD1:%.*]] = add <8 x i16> [[TMP7]], [[TMP4]]
+; SSE2-NEXT: [[HADD2:%.*]] = shufflevector <8 x i16> [[HADD1]], <8 x i16> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HADD3:%.*]] = shufflevector <8 x i16> [[HADD2]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index c9cba0a4cc0ff2..572ec9efafe1ae 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -59,18 +59,15 @@ define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: @sub_v8i16_u1234567(
-; SSE2-NEXT: [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
-; SSE2-NEXT: [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
-; SSE2-NEXT: [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
-; SSE2-NEXT: [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
-; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
-; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
-; SSE2-NEXT: [[A23:%.*]] = sub i16 [[A2]], [[A3]]
-; SSE2-NEXT: [[A45:%.*]] = sub i16 [[A4]], [[A5]]
-; SSE2-NEXT: [[A67:%.*]] = sub i16 [[A6]], [[A7]]
-; SSE2-NEXT: [[HSUB1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
-; SSE2-NEXT: [[HSUB2:%.*]] = insertelement <8 x i16> [[HSUB1]], i16 [[A45]], i64 2
-; SSE2-NEXT: [[HSUB3:%.*]] = insertelement <8 x i16> [[HSUB2]], i16 [[A67]], i64 3
+; SSE2-NEXT: [[SHIFT2:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[A]], [[SHIFT2]]
+; SSE2-NEXT: [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[A]], [[SHIFT3]]
+; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HSUB1:%.*]] = sub <8 x i16> [[TMP7]], [[TMP4]]
+; SSE2-NEXT: [[HSUB2:%.*]] = shufflevector <8 x i16> [[HSUB1]], <8 x i16> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HSUB3:%.*]] = shufflevector <8 x i16> [[HSUB2]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index f3b7f7b72ee425..d369279c15db44 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -252,17 +252,11 @@ define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext0_ext1_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
-; SSE-NEXT: [[R:%.*]] = add nuw i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext0_ext1_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext0_ext1_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %y, i32 1
@@ -271,17 +265,11 @@ define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext5_ext0_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; SSE-NEXT: [[R:%.*]] = sub nsw i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext5_ext0_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext5_ext0_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 5
%e1 = extractelement <16 x i8> %y, i32 0
@@ -290,17 +278,11 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext1_ext6_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
-; SSE-NEXT: [[R:%.*]] = and i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext1_ext6_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext1_ext6_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 1
%e1 = extractelement <16 x i8> %y, i32 6
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index c125b73fccddf9..7cbe1c6cec9062 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -252,17 +252,11 @@ define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext0_ext1_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
-; SSE-NEXT: [[R:%.*]] = add nuw i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext0_ext1_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext0_ext1_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %y, i32 1
@@ -271,17 +265,11 @@ define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext5_ext0_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; SSE-NEXT: [[R:%.*]] = sub nsw i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext5_ext0_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext5_ext0_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 5
%e1 = extractelement <16 x i8> %y, i32 0
@@ -290,17 +278,11 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
}
define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: @ext1_ext6_add(
-; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
-; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
-; SSE-NEXT: [[R:%.*]] = and i8 [[E0]], [[E1]]
-; SSE-NEXT: ret i8 [[R]]
-;
-; AVX-LABEL: @ext1_ext6_add(
-; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
-; AVX-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
-; AVX-NEXT: ret i8 [[R]]
+; CHECK-LABEL: @ext1_ext6_add(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x ...
[truncated]
|
ping - any objections? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
Seeing a warning on the comparison between
|
If we're just moving a single element around inside a 128-bit lane (probably as an alternative to extracting it), we can assume this is cheap, since it can be done with a single PSRLDQ/PSHUFD/SHUFPS instruction.
I've got the horrid feeling we're moving towards matching all SSE shuffle patterns inside the cost model, but I'm going to do my best to avoid this for now :|