From 840b2950632baf26b2276dbafc3448da444504a3 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 18 Dec 2024 00:19:54 -0800 Subject: [PATCH 1/7] [LV] Fix FindLastIV reduction for epilogue vectorization. --- .../include/llvm/Transforms/Utils/LoopUtils.h | 10 ++++--- llvm/lib/Transforms/Utils/LoopUtils.cpp | 10 +++---- .../Transforms/Vectorize/LoopVectorize.cpp | 9 ------ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 ++++----- .../LoopVectorize/epilog-iv-select-cmp.ll | 28 ++++++++----------- 5 files changed, 29 insertions(+), 41 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index b4cd52fef70fd2..4e5ec962e606d0 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -420,15 +420,17 @@ Value *createAnyOfReduction(IRBuilderBase &B, Value *Src, PHINode *OrigPhi); /// Create a reduction of the given vector \p Src for a reduction of the -/// kind RecurKind::IFindLastIV or RecurKind::FFindLastIV. The reduction -/// operation is described by \p Desc. -Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src, +/// kind RecurKind::IFindLastIV or RecurKind::FFindLastIV. The scalar \p +/// StartVal is the incoming value of reduction phi from outside the loop. The +/// reduction operation is described by \p Desc. +Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src, Value *StartVal, const RecurrenceDescriptor &Desc); /// Create a generic reduction using a recurrence descriptor \p Desc /// Fast-math-flags are propagated using the RecurrenceDescriptor. Value *createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, - Value *Src, PHINode *OrigPhi = nullptr); + Value *Src, Value *StartVal = nullptr, + PHINode *OrigPhi = nullptr); /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 45915c10107b2e..91b31c793b0f53 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1209,11 +1209,12 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src, } Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src, + Value *StartVal, const RecurrenceDescriptor &Desc) { assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind( Desc.getRecurrenceKind()) && "Unexpected reduction kind"); - Value *StartVal = Desc.getRecurrenceStartValue(); + assert(StartVal && "Null start value"); Value *Sentinel = Desc.getSentinelValue(); Value *MaxRdx = Src->getType()->isVectorTy() ? Builder.CreateIntMaxReduce(Src, true) @@ -1320,9 +1321,8 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src, return VBuilder.createSimpleReduction(Id, SrcTy, Ops); } -Value *llvm::createReduction(IRBuilderBase &B, - const RecurrenceDescriptor &Desc, Value *Src, - PHINode *OrigPhi) { +Value *llvm::createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, + Value *Src, Value *StartVal, PHINode *OrigPhi) { // TODO: Support in-order reductions based on the recurrence descriptor. // All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. 
@@ -1333,7 +1333,7 @@ Value *llvm::createReduction(IRBuilderBase &B, if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) return createAnyOfReduction(B, Src, Desc, OrigPhi); if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) - return createFindLastIVReduction(B, Src, Desc); + return createFindLastIVReduction(B, Src, StartVal, Desc); return createSimpleReduction(B, Src, RK); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d32a463a996c4f..d72a5df5693d83 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9794,15 +9794,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // Convert the reduction phi to operate on bools. PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( OrigLoop->getHeader()->getContext()))); - continue; - } - - if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( - RdxDesc.getRecurrenceKind())) { - // Adjust the start value for FindLastIV recurrences to use the sentinel - // value after generating the ResumePhi recipe, which uses the original - // start value. 
- PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue())); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e54df8bdeac55a..aff648d8eed782 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -566,8 +566,9 @@ Value *VPInstruction::generate(VPTransformState &State) { RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) && !PhiR->isInLoop()) { + Value *StartVal = PhiR->getStartValue()->getLiveInIRValue(); ReducedPartRdx = - createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); + createReduction(Builder, RdxDesc, ReducedPartRdx, StartVal, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) @@ -3394,15 +3395,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { } } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { // [I|F]FindLastIV will use a sentinel value to initialize the reduction - // phi or the resume value from the main vector loop when vectorizing the - // epilogue loop. In the exit block, ComputeReductionResult will generate - // checks to verify if the reduction result is the sentinel value. If the - // result is the sentinel value, it will be corrected back to the start - // value. + // phi. In the exit block, ComputeReductionResult will generate checks to + // verify if the reduction result is the sentinel value. If the result is + // the sentinel value, it will be corrected back to the start value. // TODO: The sentinel value is not always necessary. When the start value is // a constant, and smaller than the start value of the induction variable, // the start value can be directly used to initialize the reduction phi. 
- Iden = StartV; + StartV = Iden = RdxDesc.getSentinelValue(); if (!ScalarPHI) { IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index 052b4a10e9c8d5..23ab84e46e1ff6 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -46,13 +46,11 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], -; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0 -; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 @@ -66,16 +64,16 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; 
CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP11]]) ; CHECK-NEXT: [[RDX_SELECT_CMP10:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 -; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 3 +; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 [[BC_MERGE_RDX]] ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 3, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 3, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[L]], 3 @@ -150,13 +148,11 @@ define i64 
@select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], -; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0 -; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 @@ -170,16 +166,16 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP11]]) ; CHECK-NEXT: [[RDX_SELECT_CMP10:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 -; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 2 +; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 [[BC_MERGE_RDX]] ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], 
label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 2, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 2, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4 ; CHECK-NEXT: [[C:%.*]] = fcmp fast ueq float [[L]], 3.000000e+00 From 98ab17954c787df084d583d453bac5b9a1c099d0 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 20 Dec 2024 02:16:43 -0800 Subject: [PATCH 2/7] Revert "[LV] Fix FindLastIV reduction for epilogue vectorization." This reverts commit be9d783ba941a80896f861ff424d81b886680615. 
--- .../include/llvm/Transforms/Utils/LoopUtils.h | 10 +++---- llvm/lib/Transforms/Utils/LoopUtils.cpp | 10 +++---- .../Transforms/Vectorize/LoopVectorize.cpp | 9 ++++++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 +++++---- .../LoopVectorize/epilog-iv-select-cmp.ll | 28 +++++++++++-------- 5 files changed, 41 insertions(+), 29 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 4e5ec962e606d0..b4cd52fef70fd2 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -420,17 +420,15 @@ Value *createAnyOfReduction(IRBuilderBase &B, Value *Src, PHINode *OrigPhi); /// Create a reduction of the given vector \p Src for a reduction of the -/// kind RecurKind::IFindLastIV or RecurKind::FFindLastIV. The scalar \p -/// StartVal is the incoming value of reduction phi from outside the loop. The -/// reduction operation is described by \p Desc. -Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src, Value *StartVal, +/// kind RecurKind::IFindLastIV or RecurKind::FFindLastIV. The reduction +/// operation is described by \p Desc. +Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src, const RecurrenceDescriptor &Desc); /// Create a generic reduction using a recurrence descriptor \p Desc /// Fast-math-flags are propagated using the RecurrenceDescriptor. Value *createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, - Value *Src, Value *StartVal = nullptr, - PHINode *OrigPhi = nullptr); + Value *Src, PHINode *OrigPhi = nullptr); /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 91b31c793b0f53..45915c10107b2e 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1209,12 +1209,11 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src, } Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src, - Value *StartVal, const RecurrenceDescriptor &Desc) { assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind( Desc.getRecurrenceKind()) && "Unexpected reduction kind"); - assert(StartVal && "Null start value"); + Value *StartVal = Desc.getRecurrenceStartValue(); Value *Sentinel = Desc.getSentinelValue(); Value *MaxRdx = Src->getType()->isVectorTy() ? Builder.CreateIntMaxReduce(Src, true) @@ -1321,8 +1320,9 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src, return VBuilder.createSimpleReduction(Id, SrcTy, Ops); } -Value *llvm::createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, - Value *Src, Value *StartVal, PHINode *OrigPhi) { +Value *llvm::createReduction(IRBuilderBase &B, + const RecurrenceDescriptor &Desc, Value *Src, + PHINode *OrigPhi) { // TODO: Support in-order reductions based on the recurrence descriptor. // All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. 
@@ -1333,7 +1333,7 @@ Value *llvm::createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) return createAnyOfReduction(B, Src, Desc, OrigPhi); if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) - return createFindLastIVReduction(B, Src, StartVal, Desc); + return createFindLastIVReduction(B, Src, Desc); return createSimpleReduction(B, Src, RK); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d72a5df5693d83..d32a463a996c4f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9794,6 +9794,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // Convert the reduction phi to operate on bools. PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( OrigLoop->getHeader()->getContext()))); + continue; + } + + if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + // Adjust the start value for FindLastIV recurrences to use the sentinel + // value after generating the ResumePhi recipe, which uses the original + // start value. 
+ PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue())); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index aff648d8eed782..e54df8bdeac55a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -566,9 +566,8 @@ Value *VPInstruction::generate(VPTransformState &State) { RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) && !PhiR->isInLoop()) { - Value *StartVal = PhiR->getStartValue()->getLiveInIRValue(); ReducedPartRdx = - createReduction(Builder, RdxDesc, ReducedPartRdx, StartVal, OrigPhi); + createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) @@ -3395,13 +3394,15 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { } } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { // [I|F]FindLastIV will use a sentinel value to initialize the reduction - // phi. In the exit block, ComputeReductionResult will generate checks to - // verify if the reduction result is the sentinel value. If the result is - // the sentinel value, it will be corrected back to the start value. + // phi or the resume value from the main vector loop when vectorizing the + // epilogue loop. In the exit block, ComputeReductionResult will generate + // checks to verify if the reduction result is the sentinel value. If the + // result is the sentinel value, it will be corrected back to the start + // value. // TODO: The sentinel value is not always necessary. When the start value is // a constant, and smaller than the start value of the induction variable, // the start value can be directly used to initialize the reduction phi. 
- StartV = Iden = RdxDesc.getSentinelValue(); + Iden = StartV; if (!ScalarPHI) { IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index 23ab84e46e1ff6..052b4a10e9c8d5 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -46,11 +46,13 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 @@ -64,16 +66,16 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; 
CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP11]]) ; CHECK-NEXT: [[RDX_SELECT_CMP10:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 -; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 [[BC_MERGE_RDX]] +; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 3 ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 3, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 3, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[L]], 3 @@ -148,11 +150,13 @@ define i64 
@select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 @@ -166,16 +170,16 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP11]]) ; CHECK-NEXT: [[RDX_SELECT_CMP10:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 -; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 [[BC_MERGE_RDX]] +; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 2 ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], 
label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 2, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 2, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4 ; CHECK-NEXT: [[C:%.*]] = fcmp fast ueq float [[L]], 3.000000e+00 From b3b436b140302e0c1d23a3b10922927f25a5c1b4 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 9 Jan 2025 01:38:14 -0800 Subject: [PATCH 3/7] Revert "Revert "[LV] Fix FindLastIV reduction for epilogue vectorization."" This reverts commit 895a134e768ae2467016d9e3e3641b0e92b4a3fa. 
--- .../include/llvm/Transforms/Utils/LoopUtils.h | 10 ++++--- llvm/lib/Transforms/Utils/LoopUtils.cpp | 10 +++---- .../Transforms/Vectorize/LoopVectorize.cpp | 9 ------ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 ++++----- .../LoopVectorize/epilog-iv-select-cmp.ll | 28 ++++++++----------- 5 files changed, 29 insertions(+), 41 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index b4cd52fef70fd2..4e5ec962e606d0 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -420,15 +420,17 @@ Value *createAnyOfReduction(IRBuilderBase &B, Value *Src, PHINode *OrigPhi); /// Create a reduction of the given vector \p Src for a reduction of the -/// kind RecurKind::IFindLastIV or RecurKind::FFindLastIV. The reduction -/// operation is described by \p Desc. -Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src, +/// kind RecurKind::IFindLastIV or RecurKind::FFindLastIV. The scalar \p +/// StartVal is the incoming value of reduction phi from outside the loop. The +/// reduction operation is described by \p Desc. +Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src, Value *StartVal, const RecurrenceDescriptor &Desc); /// Create a generic reduction using a recurrence descriptor \p Desc /// Fast-math-flags are propagated using the RecurrenceDescriptor. Value *createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, - Value *Src, PHINode *OrigPhi = nullptr); + Value *Src, Value *StartVal = nullptr, + PHINode *OrigPhi = nullptr); /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 45915c10107b2e..91b31c793b0f53 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1209,11 +1209,12 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src, } Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src, + Value *StartVal, const RecurrenceDescriptor &Desc) { assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind( Desc.getRecurrenceKind()) && "Unexpected reduction kind"); - Value *StartVal = Desc.getRecurrenceStartValue(); + assert(StartVal && "Null start value"); Value *Sentinel = Desc.getSentinelValue(); Value *MaxRdx = Src->getType()->isVectorTy() ? Builder.CreateIntMaxReduce(Src, true) @@ -1320,9 +1321,8 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src, return VBuilder.createSimpleReduction(Id, SrcTy, Ops); } -Value *llvm::createReduction(IRBuilderBase &B, - const RecurrenceDescriptor &Desc, Value *Src, - PHINode *OrigPhi) { +Value *llvm::createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, + Value *Src, Value *StartVal, PHINode *OrigPhi) { // TODO: Support in-order reductions based on the recurrence descriptor. // All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. 
@@ -1333,7 +1333,7 @@ Value *llvm::createReduction(IRBuilderBase &B, if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) return createAnyOfReduction(B, Src, Desc, OrigPhi); if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) - return createFindLastIVReduction(B, Src, Desc); + return createFindLastIVReduction(B, Src, StartVal, Desc); return createSimpleReduction(B, Src, RK); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d32a463a996c4f..d72a5df5693d83 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9794,15 +9794,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // Convert the reduction phi to operate on bools. PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( OrigLoop->getHeader()->getContext()))); - continue; - } - - if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( - RdxDesc.getRecurrenceKind())) { - // Adjust the start value for FindLastIV recurrences to use the sentinel - // value after generating the ResumePhi recipe, which uses the original - // start value. 
- PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue())); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e54df8bdeac55a..aff648d8eed782 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -566,8 +566,9 @@ Value *VPInstruction::generate(VPTransformState &State) { RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) && !PhiR->isInLoop()) { + Value *StartVal = PhiR->getStartValue()->getLiveInIRValue(); ReducedPartRdx = - createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); + createReduction(Builder, RdxDesc, ReducedPartRdx, StartVal, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) @@ -3394,15 +3395,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { } } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { // [I|F]FindLastIV will use a sentinel value to initialize the reduction - // phi or the resume value from the main vector loop when vectorizing the - // epilogue loop. In the exit block, ComputeReductionResult will generate - // checks to verify if the reduction result is the sentinel value. If the - // result is the sentinel value, it will be corrected back to the start - // value. + // phi. In the exit block, ComputeReductionResult will generate checks to + // verify if the reduction result is the sentinel value. If the result is + // the sentinel value, it will be corrected back to the start value. // TODO: The sentinel value is not always necessary. When the start value is // a constant, and smaller than the start value of the induction variable, // the start value can be directly used to initialize the reduction phi. 
- Iden = StartV; + StartV = Iden = RdxDesc.getSentinelValue(); if (!ScalarPHI) { IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index 052b4a10e9c8d5..23ab84e46e1ff6 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -46,13 +46,11 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], -; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0 -; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 @@ -66,16 +64,16 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; 
CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP11]]) ; CHECK-NEXT: [[RDX_SELECT_CMP10:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 -; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 3 +; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 [[BC_MERGE_RDX]] ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 3, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 3, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[L]], 3 @@ -150,13 +148,11 @@ define i64 
@select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], -; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0 -; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 @@ -170,16 +166,16 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP11]]) ; CHECK-NEXT: [[RDX_SELECT_CMP10:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 -; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 2 +; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 [[BC_MERGE_RDX]] ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], 
label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 2, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 2, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4 ; CHECK-NEXT: [[C:%.*]] = fcmp fast ueq float [[L]], 3.000000e+00 From 1cf21f00ffdb43f6e97577739678691e358698bb Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 9 Jan 2025 01:38:54 -0800 Subject: [PATCH 4/7] Revert "[LV] Fix FindLastIV reduction for epilogue vectorization." This reverts commit 7e233ba712e8842ab65dbddeedb5fa82cf1dfc03. 
--- .../include/llvm/Transforms/Utils/LoopUtils.h | 10 +++---- llvm/lib/Transforms/Utils/LoopUtils.cpp | 10 +++---- .../Transforms/Vectorize/LoopVectorize.cpp | 9 ++++++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 +++++---- .../LoopVectorize/epilog-iv-select-cmp.ll | 28 +++++++++++-------- 5 files changed, 41 insertions(+), 29 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 4e5ec962e606d0..b4cd52fef70fd2 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -420,17 +420,15 @@ Value *createAnyOfReduction(IRBuilderBase &B, Value *Src, PHINode *OrigPhi); /// Create a reduction of the given vector \p Src for a reduction of the -/// kind RecurKind::IFindLastIV or RecurKind::FFindLastIV. The scalar \p -/// StartVal is the incoming value of reduction phi from outside the loop. The -/// reduction operation is described by \p Desc. -Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src, Value *StartVal, +/// kind RecurKind::IFindLastIV or RecurKind::FFindLastIV. The reduction +/// operation is described by \p Desc. +Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src, const RecurrenceDescriptor &Desc); /// Create a generic reduction using a recurrence descriptor \p Desc /// Fast-math-flags are propagated using the RecurrenceDescriptor. Value *createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, - Value *Src, Value *StartVal = nullptr, - PHINode *OrigPhi = nullptr); + Value *Src, PHINode *OrigPhi = nullptr); /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 91b31c793b0f53..45915c10107b2e 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1209,12 +1209,11 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src, } Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src, - Value *StartVal, const RecurrenceDescriptor &Desc) { assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind( Desc.getRecurrenceKind()) && "Unexpected reduction kind"); - assert(StartVal && "Null start value"); + Value *StartVal = Desc.getRecurrenceStartValue(); Value *Sentinel = Desc.getSentinelValue(); Value *MaxRdx = Src->getType()->isVectorTy() ? Builder.CreateIntMaxReduce(Src, true) @@ -1321,8 +1320,9 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src, return VBuilder.createSimpleReduction(Id, SrcTy, Ops); } -Value *llvm::createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, - Value *Src, Value *StartVal, PHINode *OrigPhi) { +Value *llvm::createReduction(IRBuilderBase &B, + const RecurrenceDescriptor &Desc, Value *Src, + PHINode *OrigPhi) { // TODO: Support in-order reductions based on the recurrence descriptor. // All ops in the reduction inherit fast-math-flags from the recurrence // descriptor. 
@@ -1333,7 +1333,7 @@ Value *llvm::createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) return createAnyOfReduction(B, Src, Desc, OrigPhi); if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) - return createFindLastIVReduction(B, Src, StartVal, Desc); + return createFindLastIVReduction(B, Src, Desc); return createSimpleReduction(B, Src, RK); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d72a5df5693d83..d32a463a996c4f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9794,6 +9794,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // Convert the reduction phi to operate on bools. PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( OrigLoop->getHeader()->getContext()))); + continue; + } + + if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + // Adjust the start value for FindLastIV recurrences to use the sentinel + // value after generating the ResumePhi recipe, which uses the original + // start value. 
+ PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue())); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index aff648d8eed782..e54df8bdeac55a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -566,9 +566,8 @@ Value *VPInstruction::generate(VPTransformState &State) { RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) && !PhiR->isInLoop()) { - Value *StartVal = PhiR->getStartValue()->getLiveInIRValue(); ReducedPartRdx = - createReduction(Builder, RdxDesc, ReducedPartRdx, StartVal, OrigPhi); + createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) @@ -3395,13 +3394,15 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { } } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { // [I|F]FindLastIV will use a sentinel value to initialize the reduction - // phi. In the exit block, ComputeReductionResult will generate checks to - // verify if the reduction result is the sentinel value. If the result is - // the sentinel value, it will be corrected back to the start value. + // phi or the resume value from the main vector loop when vectorizing the + // epilogue loop. In the exit block, ComputeReductionResult will generate + // checks to verify if the reduction result is the sentinel value. If the + // result is the sentinel value, it will be corrected back to the start + // value. // TODO: The sentinel value is not always necessary. When the start value is // a constant, and smaller than the start value of the induction variable, // the start value can be directly used to initialize the reduction phi. 
- StartV = Iden = RdxDesc.getSentinelValue(); + Iden = StartV; if (!ScalarPHI) { IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index 23ab84e46e1ff6..052b4a10e9c8d5 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -46,11 +46,13 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 @@ -64,16 +66,16 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; 
CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP11]]) ; CHECK-NEXT: [[RDX_SELECT_CMP10:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 -; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 [[BC_MERGE_RDX]] +; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 3 ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 3, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 3, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[L]], 3 @@ -148,11 +150,13 @@ define i64 
@select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 @@ -166,16 +170,16 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP11]]) ; CHECK-NEXT: [[RDX_SELECT_CMP10:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 -; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 [[BC_MERGE_RDX]] +; CHECK-NEXT: [[RDX_SELECT11:%.*]] = select i1 [[RDX_SELECT_CMP10]], i64 [[TMP13]], i64 2 ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], 
label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 2, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 2, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4 ; CHECK-NEXT: [[C:%.*]] = fcmp fast ueq float [[L]], 3.000000e+00 From c6cf63f1d9291b6376d86d318c93fdf495f2893e Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 9 Jan 2025 23:32:48 -0800 Subject: [PATCH 5/7] Adjust resume value of FindLastIV Use ResumeV = ResumeV == StartValue ? Sentinel : ResumeV to adjust the resume value. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 26 +++++++++++++++++++ .../LoopVectorize/epilog-iv-select-cmp.ll | 8 ++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d32a463a996c4f..781514750ad540 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7687,6 +7687,19 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( "AnyOf expected to start by comparing main resume value to original " "start value"); MainResumeValue = Cmp->getOperand(0); + } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + using namespace llvm::PatternMatch; + Value *Cmp, *OrigResumeV; + bool IsExpectedPattern = + match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), + m_Specific(RdxDesc.getSentinelValue()), + m_Value(OrigResumeV))) && + match(Cmp, + m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), + m_Specific(RdxDesc.getRecurrenceStartValue()))); + assert(IsExpectedPattern && "Unexpected reduction resume pattern"); + MainResumeValue = OrigResumeV; } PHINode *MainResumePhi = cast(MainResumeValue); @@ -10282,6 +10295,19 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, cast(ResumeV)->getParent()->getFirstNonPHI()); ResumeV = Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); + } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { + // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment + // to the resume value. The resume value is adjusted to the sentinel + // value when the final value from the main vector loop equals the start + // value. This ensures correctness when the start value might not be + // less than the minimum value of a monotonically increasing induction + // variable. 
+ IRBuilder<> Builder( + cast(ResumeV)->getParent()->getFirstNonPHI()); + Value *Cmp = + Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue()); + ResumeV = + Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV); } } else { // Retrieve the induction resume values for wide inductions from diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index 052b4a10e9c8d5..06f0f058891164 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -40,7 +40,9 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX1:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[BC_MERGE_RDX1]], 3 +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = select i1 [[TMP14]], i64 -9223372036854775808, i64 [[BC_MERGE_RDX1]] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 @@ -144,7 +146,9 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], 
%[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX1:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[BC_MERGE_RDX1]], 2 +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = select i1 [[TMP14]], i64 -9223372036854775808, i64 [[BC_MERGE_RDX1]] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 From 3e5d819e07df55519d7372747009ed85c1445487 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 13 Jan 2025 17:14:51 +0800 Subject: [PATCH 6/7] Update llvm/lib/Transforms/Vectorize/LoopVectorize.cpp Avoid warning on release build Co-authored-by: Florian Hahn --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 781514750ad540..d4db9b06d3d6c7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7699,6 +7699,7 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), m_Specific(RdxDesc.getRecurrenceStartValue()))); assert(IsExpectedPattern && "Unexpected reduction resume pattern"); + (void) IsExpectedPattern; MainResumeValue = OrigResumeV; } PHINode *MainResumePhi = cast(MainResumeValue); From ca464d0baa5016880004c9c2a83d5918bbc4e125 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 13 Jan 2025 01:16:07 -0800 Subject: [PATCH 7/7] Format --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d4db9b06d3d6c7..28357757682f68 100644 --- 
a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7699,7 +7699,7 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), m_Specific(RdxDesc.getRecurrenceStartValue()))); assert(IsExpectedPattern && "Unexpected reduction resume pattern"); - (void) IsExpectedPattern; + (void)IsExpectedPattern; MainResumeValue = OrigResumeV; } PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);