diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4fd07bf11bd8..9b08ec5bcb73 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58675,6 +58675,17 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, DAG.getTargetConstant(M, DL, MVT::i8)); } break; + case X86ISD::VPERMV: + if (IdxVal != 0) { + SDValue Mask = InVec.getOperand(0); + SDValue Src = InVec.getOperand(1); + Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits); + Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG, + DL, InSizeInBits); + SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src); + return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits); + } + break; case X86ISD::VPERMV3: if (IdxVal != 0) { SDValue Src0 = InVec.getOperand(0); diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 694f2bc53c51..955a7ffcec79 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -242,9 +242,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) @@ -308,9 +307,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) @@ -374,9 +372,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) @@ -440,9 +437,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 8d7f8d1db852..13410fb5cc4b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -226,9 +226,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 ; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -292,9 +291,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 ; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -358,9 +356,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 ; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -424,9 +421,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]