From 023a0294232fe5fb354380c4f41738439a16c8bd Mon Sep 17 00:00:00 2001 From: Zhijin Zeng Date: Fri, 13 Dec 2024 10:56:28 +0800 Subject: [PATCH] [TargetLibraryInfo] Add libmvec support for risc-v 1. Rename LIBMVEC_X86 into LIBMVEC to support libmvec in risc-v. 2. Add RVVM1/2/4/8 in VFISAKind to distinguish the LMUL value. 3. Declare some RVV vector math functions in VecFuncs.def. In VecFuncs.def, I add the TLI_DEFINE_VECFUNC entries of LIBMVEC_RVV as follows: ``` TLI_DEFINE_VECFUNC("sin", "_ZGV1Nxv_sin", SCALABLE(1), "_ZGVr1Nxv") TLI_DEFINE_VECFUNC("sin", "_ZGV2Nxv_sin", SCALABLE(2), "_ZGVr2Nxv") TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGV1Nxv_expf", SCALABLE(2), "_ZGVr1Nxv") TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGV2Nxv_expf", SCALABLE(4), "_ZGVr2Nxv") ``` The `VEC` of TLI_DEFINE_VECFUNC (e.g., `_ZGV2Nxv_sin`) follows the name mangling rules defined in https://github.com/riscv-non-isa/riscv-elf-psabi-doc/pull/455 which is still under review. The `VF` (e.g., `SCALABLE(2)`) should be `vscale x (LMUL * 64 / sizeof(Type))`, where `sizeof(Type)` is the element size in bits. In the `VABI_PREFIX` (e.g., `_ZGVr1Nxv`), `r` means the RISC-V vector extension and `1` is the LMUL value. 
``` _ZGVr1Nxv --> RISC-V Vector Extension with LMUL=1 _ZGVr2Nxv --> RISC-V Vector Extension with LMUL=2 _ZGVr4Nxv --> RISC-V Vector Extension with LMUL=4 _ZGVr8Nxv --> RISC-V Vector Extension with LMUL=8 ``` --- clang/lib/Driver/ToolChains/Clang.cpp | 6 +- clang/lib/Driver/ToolChains/CommonArgs.cpp | 2 +- clang/lib/Driver/ToolChains/Flang.cpp | 6 +- clang/test/Driver/fveclib.c | 10 +- .../include/llvm/Analysis/TargetLibraryInfo.h | 2 +- llvm/include/llvm/Analysis/VecFuncs.def | 73 +++++ llvm/include/llvm/IR/VFABIDemangler.h | 4 + llvm/lib/Analysis/TargetLibraryInfo.cpp | 23 +- llvm/lib/Frontend/Driver/CodeGenOptions.cpp | 2 +- llvm/lib/IR/VFABIDemangler.cpp | 69 ++++- .../Transforms/Utils/InjectTLIMappings.cpp | 4 +- .../Generic/replace-intrinsics-with-veclib.ll | 18 +- .../LoopVectorize/RISCV/libm-vector-calls.ll | 284 +++++++++++------- .../X86/libm-vector-calls-VF2-VF8.ll | 2 +- .../X86/libm-vector-calls-finite.ll | 2 +- .../LoopVectorize/X86/libm-vector-calls.ll | 2 +- llvm/test/Transforms/Util/add-TLI-mappings.ll | 24 +- 17 files changed, 379 insertions(+), 154 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a020e00cd17392..1fdc1c66e9750e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5805,9 +5805,11 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Triple.getArch() != llvm::Triple::x86_64) D.Diag(diag::err_drv_unsupported_opt_for_target) << Name << Triple.getArchName(); - } else if (Name == "LIBMVEC-X86") { + } else if (Name == "LIBMVEC") { if (Triple.getArch() != llvm::Triple::x86 && - Triple.getArch() != llvm::Triple::x86_64) + Triple.getArch() != llvm::Triple::x86_64 && + Triple.getArch() != llvm::Triple::riscv32 && + Triple.getArch() != llvm::Triple::riscv64) D.Diag(diag::err_drv_unsupported_opt_for_target) << Name << Triple.getArchName(); } else if (Name == "SLEEF" || Name == "ArmPL") { diff --git 
a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0d851314a89539..2a4c298cfdeafb 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -909,7 +909,7 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, std::optional OptVal = llvm::StringSwitch>(ArgVecLib->getValue()) .Case("Accelerate", "Accelerate") - .Case("LIBMVEC", "LIBMVEC-X86") + .Case("LIBMVEC", "LIBMVEC") .Case("MASSV", "MASSV") .Case("SVML", "SVML") .Case("SLEEF", "sleefgnuabi") diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index c98fdbd157bac8..5d630d6e7777df 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -446,9 +446,11 @@ void Flang::addTargetOptions(const ArgList &Args, Triple.getArch() != llvm::Triple::x86_64) D.Diag(diag::err_drv_unsupported_opt_for_target) << Name << Triple.getArchName(); - } else if (Name == "LIBMVEC-X86") { + } else if (Name == "LIBMVEC") { if (Triple.getArch() != llvm::Triple::x86 && - Triple.getArch() != llvm::Triple::x86_64) + Triple.getArch() != llvm::Triple::x86_64 && + Triple.getArch() != llvm::Triple::riscv32 && + Triple.getArch() != llvm::Triple::riscv64) D.Diag(diag::err_drv_unsupported_opt_for_target) << Name << Triple.getArchName(); } else if (Name == "SLEEF" || Name == "ArmPL") { diff --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c index 7d0985c4dd4f48..0b127cb540e176 100644 --- a/clang/test/Driver/fveclib.c +++ b/clang/test/Driver/fveclib.c @@ -5,6 +5,7 @@ // RUN: %clang -### -c -fveclib=Darwin_libsystem_m %s 2>&1 | FileCheck --check-prefix=CHECK-DARWIN_LIBSYSTEM_M %s // RUN: %clang -### -c --target=aarch64 -fveclib=SLEEF %s 2>&1 | FileCheck --check-prefix=CHECK-SLEEF %s // RUN: %clang -### -c --target=riscv64-unknown-linux-gnu -fveclib=SLEEF -march=rv64gcv %s 2>&1 | FileCheck -check-prefix CHECK-SLEEF-RISCV %s +// RUN: %clang -### -c 
--target=riscv64-unknown-linux-gnu -fveclib=libmvec -march=rv64gcv %s 2>&1 | FileCheck -check-prefix CHECK-libmvec %s // RUN: %clang -### -c --target=aarch64 -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-ARMPL %s // RUN: not %clang -c -fveclib=something %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s @@ -21,7 +22,7 @@ // RUN: not %clang --target=x86 -c -fveclib=SLEEF %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s // RUN: not %clang --target=x86 -c -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s -// RUN: not %clang --target=aarch64 -c -fveclib=LIBMVEC-X86 %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s +// RUN: not %clang --target=aarch64 -c -fveclib=LIBMVEC %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s // RUN: not %clang --target=aarch64 -c -fveclib=SVML %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s // CHECK-ERROR: unsupported option {{.*}} for target @@ -38,7 +39,7 @@ /* Verify that the correct vector library is passed to LTO flags. */ // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=LIBMVEC -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-LIBMVEC %s -// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC-X86" +// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC" // RUN: %clang -### --target=powerpc64-unknown-linux-gnu -fveclib=MASSV -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-MASSV %s // CHECK-LTO-MASSV: "-plugin-opt=-vector-library=MASSV" @@ -52,6 +53,9 @@ // RUN: %clang -### --target=riscv64-unknown-linux-gnu -fveclib=SLEEF -flto -march=rv64gcv %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SLEEF-RISCV %s // CHECK-LTO-SLEEF-RISCV: "-plugin-opt=-vector-library=sleefgnuabi" +// RUN: %clang -### --target=riscv64-unknown-linux-gnu -fveclib=LIBMVEC -flto -march=rv64gcv %s 2>&1 | FileCheck -check-prefix CHECK-LTO-LIBMVEC-RISCV %s +// CHECK-LTO-LIBMVEC-RISCV: "-plugin-opt=-vector-library=LIBMVEC" + // RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL -flto %s 2>&1 | FileCheck 
--check-prefix=CHECK-LTO-ARMPL %s // CHECK-LTO-ARMPL: "-plugin-opt=-vector-library=ArmPL" @@ -110,7 +114,7 @@ // CHECK-ENABLED-LAST: math errno enabled by '-ffp-model=strict' after it was implicitly disabled by '-fveclib=ArmPL', this may limit the utilization of the vector library [-Wmath-errno-enabled-with-veclib] /* Verify no warning when math-errno is re-enabled for a different veclib (that does not imply -fno-math-errno). */ -// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL -fmath-errno -fveclib=LIBMVEC %s 2>&1 | FileCheck --check-prefix=CHECK-REPEAT-VECLIB %s +// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL -fmath-errno -fveclib=LIBMVEC-X86 %s 2>&1 | FileCheck --check-prefix=CHECK-REPEAT-VECLIB %s // CHECK-REPEAT-VECLIB-NOT: math errno enabled /// Verify that vectorized routines library is being linked in. diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index f51d2bb9d50a21..b6bfcb865e20db 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -124,7 +124,7 @@ class TargetLibraryInfoImpl { NoLibrary, // Don't use any vector library. Accelerate, // Use Accelerate framework. DarwinLibSystemM, // Use Darwin's libsystem_m. - LIBMVEC_X86, // GLIBC Vector Math library. + LIBMVEC, // GLIBC Vector Math library. MASSV, // IBM MASS vector library. SVML, // Intel short vector math library. SLEEFGNUABI, // SLEEF - SIMD Library for Evaluating Elementary Functions. 
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def index 251331b9f860c8..5e5a37dc39f687 100644 --- a/llvm/include/llvm/Analysis/VecFuncs.def +++ b/llvm/include/llvm/Analysis/VecFuncs.def @@ -236,6 +236,79 @@ TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVdN4v_log", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVbN4v_logf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVdN8v_logf", FIXED(8), "_ZGV_LLVM_N8v") +#elif defined(TLI_DEFINE_LIBMVEC_RVV_VECFUNCS) +// GLIBC Vector math Functions for RISC-V + +TLI_DEFINE_VECFUNC("sin", "_ZGV1Nxv_sin", SCALABLE(1), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("sin", "_ZGV2Nxv_sin", SCALABLE(2), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("sin", "_ZGV4Nxv_sin", SCALABLE(4), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("sin", "_ZGV8Nxv_sin", SCALABLE(8), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGV1Nxv_sin", SCALABLE(1), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGV2Nxv_sin", SCALABLE(2), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGV4Nxv_sin", SCALABLE(4), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGV8Nxv_sin", SCALABLE(8), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("cos", "_ZGV1Nxv_cos", SCALABLE(1), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("cos", "_ZGV2Nxv_cos", SCALABLE(2), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("cos", "_ZGV4Nxv_cos", SCALABLE(4), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("cos", "_ZGV8Nxv_cos", SCALABLE(8), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGV1Nxv_cos", SCALABLE(1), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGV2Nxv_cos", SCALABLE(2), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGV4Nxv_cos", SCALABLE(4), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGV8Nxv_cos", SCALABLE(8), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("tan", "_ZGV1Nxv_tan", SCALABLE(1), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("tan", "_ZGV2Nxv_tan", SCALABLE(2), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("tan", "_ZGV4Nxv_tan", SCALABLE(4), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("tan", 
"_ZGV8Nxv_tan", SCALABLE(8), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGV1Nxv_tan", SCALABLE(1), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGV2Nxv_tan", SCALABLE(2), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGV4Nxv_tan", SCALABLE(4), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGV8Nxv_tan", SCALABLE(8), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("pow", "_ZGV1Nxvv_pow", SCALABLE(1), "_ZGVr1Nxvv") +TLI_DEFINE_VECFUNC("pow", "_ZGV2Nxvv_pow", SCALABLE(2), "_ZGVr2Nxvv") +TLI_DEFINE_VECFUNC("pow", "_ZGV4Nxvv_pow", SCALABLE(4), "_ZGVr4Nxvv") +TLI_DEFINE_VECFUNC("pow", "_ZGV8Nxvv_pow", SCALABLE(8), "_ZGVr8Nxvv") + +TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGV1Nxvv_pow", SCALABLE(1), "_ZGVr1Nxvv") +TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGV2Nxvv_pow", SCALABLE(2), "_ZGVr2Nxvv") +TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGV4Nxvv_pow", SCALABLE(4), "_ZGVr4Nxvv") +TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGV8Nxvv_pow", SCALABLE(8), "_ZGVr8Nxvv") + +TLI_DEFINE_VECFUNC("exp", "_ZGV1Nxv_exp", SCALABLE(1), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("exp", "_ZGV2Nxv_exp", SCALABLE(2), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("exp", "_ZGV4Nxv_exp", SCALABLE(4), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("exp", "_ZGV8Nxv_exp", SCALABLE(8), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("expf", "_ZGV1Nxv_expf", SCALABLE(2), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("expf", "_ZGV2Nxv_expf", SCALABLE(4), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("expf", "_ZGV4Nxv_expf", SCALABLE(8), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("expf", "_ZGV8Nxv_expf", SCALABLE(16), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGV1Nxv_exp", SCALABLE(1), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGV2Nxv_exp", SCALABLE(2), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGV4Nxv_exp", SCALABLE(4), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGV8Nxv_exp", SCALABLE(8), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGV1Nxv_expf", SCALABLE(2), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGV2Nxv_expf", SCALABLE(4), 
"_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGV4Nxv_expf", SCALABLE(8), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGV8Nxv_expf", SCALABLE(16), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("log", "_ZGV1Nxv_log", SCALABLE(1), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("log", "_ZGV2Nxv_log", SCALABLE(2), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("log", "_ZGV4Nxv_log", SCALABLE(4), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("log", "_ZGV8Nxv_log", SCALABLE(8), "_ZGVr8Nxv") + +TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGV1Nxv_log", SCALABLE(1), "_ZGVr1Nxv") +TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGV2Nxv_log", SCALABLE(2), "_ZGVr2Nxv") +TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGV4Nxv_log", SCALABLE(4), "_ZGVr4Nxv") +TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGV8Nxv_log", SCALABLE(8), "_ZGVr8Nxv") + #elif defined(TLI_DEFINE_MASSV_VECFUNCS) // IBM MASS library's vector Functions diff --git a/llvm/include/llvm/IR/VFABIDemangler.h b/llvm/include/llvm/IR/VFABIDemangler.h index de731cd7051c1b..283d42375fd4aa 100644 --- a/llvm/include/llvm/IR/VFABIDemangler.h +++ b/llvm/include/llvm/IR/VFABIDemangler.h @@ -49,6 +49,10 @@ enum class VFISAKind { AVX, // x86 AVX AVX2, // x86 AVX2 AVX512, // x86 AVX512 + RVVM1, // RISC-V Vector Extension LMUL=1 + RVVM2, // RISC-V Vector Extension LMUL=2 + RVVM4, // RISC-V Vector Extension LMUL=4 + RVVM8, // RISC-V Vector Extension LMUL=8 LLVM, // LLVM internal ISA for functions that are not // attached to an existing ABI via name mangling. 
Unknown // Unknown ISA diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 8557901192e406..0135c7a9b9ce7d 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -29,7 +29,7 @@ static cl::opt ClVectorLibrary( "Accelerate framework"), clEnumValN(TargetLibraryInfoImpl::DarwinLibSystemM, "Darwin_libsystem_m", "Darwin libsystem_m"), - clEnumValN(TargetLibraryInfoImpl::LIBMVEC_X86, "LIBMVEC-X86", + clEnumValN(TargetLibraryInfoImpl::LIBMVEC, "LIBMVEC", "GLIBC Vector Math library"), clEnumValN(TargetLibraryInfoImpl::MASSV, "MASSV", "IBM MASS vector library"), @@ -1291,6 +1291,12 @@ static const VecDesc VecFuncs_LIBMVEC_X86[] = { #undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS }; +static const VecDesc VecFuncs_LIBMVEC_RVV[] = { +#define TLI_DEFINE_LIBMVEC_RVV_VECFUNCS +#include "llvm/Analysis/VecFuncs.def" +#undef TLI_DEFINE_LIBMVEC_RVV_VECFUNCS +}; + static const VecDesc VecFuncs_MASSV[] = { #define TLI_DEFINE_MASSV_VECFUNCS #include "llvm/Analysis/VecFuncs.def" @@ -1360,8 +1366,19 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( addVectorizableFunctions(VecFuncs_DarwinLibSystemM); break; } - case LIBMVEC_X86: { - addVectorizableFunctions(VecFuncs_LIBMVEC_X86); + case LIBMVEC: { + switch (TargetTriple.getArch()) { + default: + break; + case llvm::Triple::x86: + case llvm::Triple::x86_64: + addVectorizableFunctions(VecFuncs_LIBMVEC_X86); + break; + case llvm::Triple::riscv64: + case llvm::Triple::riscv32: + addVectorizableFunctions(VecFuncs_LIBMVEC_RVV); + break; + } break; } case MASSV: { diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp index 2d74a91f62dc07..e368a15a09bf34 100644 --- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp +++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp @@ -23,7 +23,7 @@ TargetLibraryInfoImpl *createTLII(llvm::Triple &TargetTriple, TargetTriple); break; case VectorLibrary::LIBMVEC: - 
TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::LIBMVEC_X86, + TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::LIBMVEC, TargetTriple); break; case VectorLibrary::MASSV: diff --git a/llvm/lib/IR/VFABIDemangler.cpp b/llvm/lib/IR/VFABIDemangler.cpp index 897583084bf38c..07befb7b89b978 100644 --- a/llvm/lib/IR/VFABIDemangler.cpp +++ b/llvm/lib/IR/VFABIDemangler.cpp @@ -38,11 +38,19 @@ static ParseRet tryParseISA(StringRef &MangledName, VFISAKind &ISA) { if (MangledName.consume_front(VFABI::_LLVM_)) { ISA = VFISAKind::LLVM; + } else if (MangledName.consume_front("r")) { + ISA = StringSwitch(MangledName.take_front(1)) + .Case("1", VFISAKind::RVVM1) + .Case("2", VFISAKind::RVVM2) + .Case("4", VFISAKind::RVVM4) + .Case("8", VFISAKind::RVVM8) + .Default(VFISAKind::RVV); + if (ISA != VFISAKind::RVV) + MangledName = MangledName.drop_front(1); } else { ISA = StringSwitch(MangledName.take_front(1)) .Case("n", VFISAKind::AdvancedSIMD) .Case("s", VFISAKind::SVE) - .Case("r", VFISAKind::RVV) .Case("b", VFISAKind::SSE) .Case("c", VFISAKind::AVX) .Case("d", VFISAKind::AVX2) @@ -79,8 +87,10 @@ static ParseRet tryParseMask(StringRef &MangledName, bool &IsMasked) { static ParseRet tryParseVLEN(StringRef &ParseString, VFISAKind ISA, std::pair &ParsedVF) { if (ParseString.consume_front("x")) { - // SVE is the only scalable ISA currently supported. - if (ISA != VFISAKind::SVE && ISA != VFISAKind::RVV) { + // SVE/RVV is the only two scalable ISAs currently supported. + if (ISA != VFISAKind::SVE && ISA != VFISAKind::RVV && + ISA != VFISAKind::RVVM1 && ISA != VFISAKind::RVVM2 && + ISA != VFISAKind::RVVM4 && ISA != VFISAKind::RVVM8) { LLVM_DEBUG(dbgs() << "Vector function variant declared with scalable VF " << "but ISA supported for SVE and RVV only\n"); return ParseRet::Error; @@ -302,17 +312,52 @@ static ParseRet tryParseAlign(StringRef &ParseString, Align &Alignment) { // the number of elements of the given type which would fit in such a vector. 
static std::optional getElementCountForTy(const VFISAKind ISA, const Type *Ty) { - assert((ISA == VFISAKind::SVE || ISA == VFISAKind::RVV) && + // Only AArch64 SVE and RVV are supported at present. + assert((ISA == VFISAKind::SVE || ISA == VFISAKind::RVV || + ISA == VFISAKind::RVVM1 || ISA == VFISAKind::RVVM2 || + ISA == VFISAKind::RVVM4 || ISA == VFISAKind::RVVM8) && "Scalable VF decoding only implemented for SVE and RVV\n"); - if (Ty->isIntegerTy(64) || Ty->isDoubleTy() || Ty->isPointerTy()) - return ElementCount::getScalable(2); - if (Ty->isIntegerTy(32) || Ty->isFloatTy()) - return ElementCount::getScalable(4); - if (Ty->isIntegerTy(16) || Ty->is16bitFPTy()) - return ElementCount::getScalable(8); - if (Ty->isIntegerTy(8)) - return ElementCount::getScalable(16); + if (ISA == VFISAKind::SVE || ISA == VFISAKind::RVV) { + if (Ty->isIntegerTy(64) || Ty->isDoubleTy() || Ty->isPointerTy()) + return ElementCount::getScalable(2); + if (Ty->isIntegerTy(32) || Ty->isFloatTy()) + return ElementCount::getScalable(4); + if (Ty->isIntegerTy(16) || Ty->is16bitFPTy()) + return ElementCount::getScalable(8); + if (Ty->isIntegerTy(8)) + return ElementCount::getScalable(16); + } else if (ISA == VFISAKind::RVVM1 || ISA == VFISAKind::RVVM2 || + ISA == VFISAKind::RVVM4 || ISA == VFISAKind::RVVM8) { + // Because 'vscale = VLENB/8', so the ElementCount should be + // 'vscale x (LMUL * 64 / sizeof(Type))'. + unsigned Number = 1; + unsigned LMUL = 1; + unsigned ElemCount; + + // TODO: need to distingush rv32 and rv64. 
+ if (Ty->isPointerTy()) + return std::nullopt; + + if (Ty->isIntegerTy(64) || Ty->isDoubleTy()) + Number = 1; + if (Ty->isIntegerTy(32) || Ty->isFloatTy()) + Number = 2; + if (Ty->isIntegerTy(16) || Ty->is16bitFPTy()) + Number = 4; + if (Ty->isIntegerTy(8)) + Number = 8; + + if (ISA == VFISAKind::RVVM2) + LMUL = 2; + else if (ISA == VFISAKind::RVVM4) + LMUL = 4; + else if (ISA == VFISAKind::RVVM8) + LMUL = 8; + + ElemCount = LMUL * Number; + return ElementCount::getScalable(ElemCount); + } return std::nullopt; } diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp index 92b5d444aac340..8456e4294083bf 100644 --- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp +++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp @@ -109,11 +109,11 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) { TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF); for (bool Predicated : {false, true}) { - for (ElementCount VF = ElementCount::getFixed(2); + for (ElementCount VF = ElementCount::getFixed(1); ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2) AddVariantDecl(VF, Predicated); - for (ElementCount VF = ElementCount::getScalable(2); + for (ElementCount VF = ElementCount::getScalable(1); ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2) AddVariantDecl(VF, Predicated); } diff --git a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll index fde6cb788b46f9..1bd929f836f9ed 100644 --- a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll +++ b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes ; RUN: opt -vector-library=SVML -replace-with-veclib -S < %s | FileCheck %s --check-prefixes=COMMON,SVML ; RUN: opt -vector-library=AMDLIBM -replace-with-veclib -S < %s | 
FileCheck %s --check-prefixes=COMMON,AMDLIBM -; RUN: opt -vector-library=LIBMVEC-X86 -replace-with-veclib -S < %s | FileCheck %s --check-prefixes=COMMON,LIBMVEC-X86 +; RUN: opt -vector-library=LIBMVEC -replace-with-veclib -S < %s | FileCheck %s --check-prefixes=COMMON,LIBMVEC ; RUN: opt -vector-library=MASSV -replace-with-veclib -S < %s | FileCheck %s --check-prefixes=COMMON,MASSV ; RUN: opt -vector-library=Accelerate -replace-with-veclib -S < %s | FileCheck %s --check-prefixes=COMMON,ACCELERATE @@ -19,10 +19,10 @@ define <4 x double> @exp_v4(<4 x double> %in) { ; AMDLIBM-NEXT: [[TMP1:%.*]] = call <4 x double> @amd_vrd4_exp(<4 x double> [[IN]]) ; AMDLIBM-NEXT: ret <4 x double> [[TMP1]] ; -; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_v4 -; LIBMVEC-X86-SAME: (<4 x double> [[IN:%.*]]) { -; LIBMVEC-X86-NEXT: [[TMP1:%.*]] = call <4 x double> @_ZGVdN4v_exp(<4 x double> [[IN]]) -; LIBMVEC-X86-NEXT: ret <4 x double> [[TMP1]] +; LIBMVEC-LABEL: define {{[^@]+}}@exp_v4 +; LIBMVEC-SAME: (<4 x double> [[IN:%.*]]) { +; LIBMVEC-NEXT: [[TMP1:%.*]] = call <4 x double> @_ZGVdN4v_exp(<4 x double> [[IN]]) +; LIBMVEC-NEXT: ret <4 x double> [[TMP1]] ; ; MASSV-LABEL: define {{[^@]+}}@exp_v4 ; MASSV-SAME: (<4 x double> [[IN:%.*]]) { @@ -51,10 +51,10 @@ define <4 x float> @exp_f32(<4 x float> %in) { ; AMDLIBM-NEXT: [[TMP1:%.*]] = call <4 x float> @amd_vrs4_expf(<4 x float> [[IN]]) ; AMDLIBM-NEXT: ret <4 x float> [[TMP1]] ; -; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_f32 -; LIBMVEC-X86-SAME: (<4 x float> [[IN:%.*]]) { -; LIBMVEC-X86-NEXT: [[TMP1:%.*]] = call <4 x float> @_ZGVbN4v_expf(<4 x float> [[IN]]) -; LIBMVEC-X86-NEXT: ret <4 x float> [[TMP1]] +; LIBMVEC-LABEL: define {{[^@]+}}@exp_f32 +; LIBMVEC-SAME: (<4 x float> [[IN:%.*]]) { +; LIBMVEC-NEXT: [[TMP1:%.*]] = call <4 x float> @_ZGVbN4v_expf(<4 x float> [[IN]]) +; LIBMVEC-NEXT: ret <4 x float> [[TMP1]] ; ; MASSV-LABEL: define {{[^@]+}}@exp_f32 ; MASSV-SAME: (<4 x float> [[IN:%.*]]) { diff --git 
a/llvm/test/Transforms/LoopVectorize/RISCV/libm-vector-calls.ll b/llvm/test/Transforms/LoopVectorize/RISCV/libm-vector-calls.ll index 75fdd00e25f988..bbcd8b322cbbd7 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/libm-vector-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/libm-vector-calls.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -mtriple=riscv64 -mattr=+v -vector-library=LIBMVEC-X86 -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s +; RUN: opt -mtriple=riscv64 -mattr=+v -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" target triple = "riscv64-unknown-linux-gnu" @@ -8,37 +8,50 @@ define void @sin_f64(ptr nocapture %varray) { ; CHECK-LABEL: define void @sin_f64( ; CHECK-SAME: ptr nocapture [[VARRAY:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: 
[[TMP10:%.*]] = mul i32 1, [[TMP9]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double> -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x double> -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP1]]) -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP12:%.*]] = sitofp [[VEC_IND]] to +; CHECK-NEXT: [[TMP13:%.*]] = call @_ZGV2Nxv_sin( [[TMP12]]) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[VARRAY]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i32 4 -; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[TMP15]], align 4 -; CHECK-NEXT: store <4 x double> [[TMP4]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; 
CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double -; CHECK-NEXT: [[CALL:%.*]] = tail call double @sin(double [[CONV]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call double @sin(double [[CONV]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VARRAY]], i64 [[IV]] ; CHECK-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -73,37 +86,50 @@ define void @sin_f64_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: define void @sin_f64_intrinsic( ; CHECK-SAME: ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; 
CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = mul i32 1, [[TMP9]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double> -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x double> -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP1]]) -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP12:%.*]] = sitofp [[VEC_IND]] to +; CHECK-NEXT: [[TMP13:%.*]] = call @_ZGV2Nxv_sin( [[TMP12]]) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[VARRAY]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr 
[[TMP14]], i32 4 -; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[TMP15]], align 4 -; CHECK-NEXT: store <4 x double> [[TMP4]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double -; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV]]) #[[ATTR5:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VARRAY]], i64 [[IV]] ; CHECK-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -137,37 +163,50 @@ define void 
@cos_f64(ptr nocapture %varray) { ; CHECK-LABEL: define void @cos_f64( ; CHECK-SAME: ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = mul i32 1, [[TMP9]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double> -; CHECK-NEXT: 
[[TMP2:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x double> -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP1]]) -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP12:%.*]] = sitofp [[VEC_IND]] to +; CHECK-NEXT: [[TMP13:%.*]] = call @_ZGV2Nxv_cos( [[TMP12]]) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[VARRAY]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i32 4 -; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[TMP15]], align 4 -; CHECK-NEXT: store <4 x double> [[TMP4]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ 
[[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double -; CHECK-NEXT: [[CALL:%.*]] = tail call double @cos(double [[CONV]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call double @cos(double [[CONV]]) #[[ATTR6:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VARRAY]], i64 [[IV]] ; CHECK-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -201,37 +240,50 @@ define void @cos_f64_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: define void @cos_f64_intrinsic( ; CHECK-SAME: ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = mul i32 1, [[TMP9]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label 
%[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double> -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x double> -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP1]]) -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP12:%.*]] = sitofp [[VEC_IND]] to +; CHECK-NEXT: [[TMP13:%.*]] = call @_ZGV2Nxv_cos( [[TMP12]]) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[VARRAY]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i32 4 -; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[TMP15]], align 4 -; CHECK-NEXT: store <4 x double> [[TMP4]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; 
CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double -; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV]]) #[[ATTR7:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VARRAY]], i64 [[IV]] ; CHECK-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -265,37 +317,50 @@ define void @exp_f32(ptr nocapture %varray) { ; CHECK-LABEL: define void @exp_f32( ; CHECK-SAME: ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] 
= mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = mul i32 1, [[TMP9]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float> -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[STEP_ADD]] to <8 x float> -; CHECK-NEXT: [[TMP3:%.*]] = call fast <8 x float> @_ZGVdN8v_expf(<8 x float> [[TMP1]]) -; CHECK-NEXT: [[TMP4:%.*]] = call fast <8 x float> @_ZGVdN8v_expf(<8 x float> [[TMP2]]) +; CHECK-NEXT: [[TMP12:%.*]] = sitofp [[VEC_IND]] to +; CHECK-NEXT: [[TMP13:%.*]] = call fast @_ZGV2Nxv_expf( [[TMP12]]) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[VARRAY]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 8 -; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[TMP15]], align 4 -; CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add 
<8 x i32> [[STEP_ADD]], splat (i32 8) -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992 -; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 992, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float -; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @expf(float [[CONV]]) #[[ATTR7:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @expf(float [[CONV]]) #[[ATTR8:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VARRAY]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store float [[CALL]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -329,37 +394,50 @@ define void @exp_f32_intrin(ptr nocapture %varray) { ; CHECK-LABEL: define void @exp_f32_intrin( ; CHECK-SAME: ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 false, label 
%[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = mul i32 1, [[TMP9]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float> -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[STEP_ADD]] to <8 x float> -; CHECK-NEXT: [[TMP3:%.*]] = call fast <8 x float> @_ZGVdN8v_expf(<8 x float> [[TMP1]]) -; CHECK-NEXT: [[TMP4:%.*]] = call fast <8 
x float> @_ZGVdN8v_expf(<8 x float> [[TMP2]]) +; CHECK-NEXT: [[TMP12:%.*]] = sitofp [[VEC_IND]] to +; CHECK-NEXT: [[TMP13:%.*]] = call fast @_ZGV2Nxv_expf( [[TMP12]]) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[VARRAY]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 8 -; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[TMP15]], align 4 -; CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[STEP_ADD]], splat (i32 8) -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992 -; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 992, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float -; CHECK-NEXT: 
[[CALL:%.*]] = tail call fast float @llvm.exp.f32(float [[CONV]]) #[[ATTR8:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @llvm.exp.f32(float [[CONV]]) #[[ATTR9:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VARRAY]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store float [[CALL]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll index 67a2cf2b80e70c..91d5c52fa43c6b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll @@ -1,4 +1,4 @@ -; RUN: opt -vector-library=LIBMVEC-X86 -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s +; RUN: opt -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll index d0d0d78a0d27e2..bdb89fbbaa8473 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll @@ -1,4 +1,4 @@ -; RUN: opt -vector-library=LIBMVEC-X86 -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s +; RUN: opt -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll index 7a0e44c9e99161..e0661301fd90c2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll @@ -1,4 +1,4 @@ -; RUN: opt -vector-library=LIBMVEC-X86 -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s +; RUN: opt -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll index 39caf008f924f1..dd0f921cf315a0 100644 --- a/llvm/test/Transforms/Util/add-TLI-mappings.ll +++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll @@ -1,7 +1,7 @@ ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=SVML -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,SVML ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=AMDLIBM -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,AMDLIBM ; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -vector-library=MASSV -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,MASSV -; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=LIBMVEC-X86 -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,LIBMVEC-X86 +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,LIBMVEC ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=Accelerate -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,ACCELERATE ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=sleefgnuabi -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,SLEEFGNUABI ; RUN: opt -mtriple=riscv64-unknown-linux-gnu -vector-library=sleefgnuabi -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,SLEEFGNUABI_RISCV @@ -32,9 +32,9 @@ ; MASSV-SAME: ptr @__log10f4 ; ACCELERATE-SAME: [1 x ptr] [ ; 
ACCELERATE-SAME: ptr @vlog10f -; LIBMVEC-X86-SAME: [2 x ptr] [ -; LIBMVEC-X86-SAME: ptr @_ZGVbN2v_sin, -; LIBMVEC-X86-SAME: ptr @_ZGVdN4v_sin +; LIBMVEC-SAME: [2 x ptr] [ +; LIBMVEC-SAME: ptr @_ZGVbN2v_sin, +; LIBMVEC-SAME: ptr @_ZGVdN4v_sin ; SLEEFGNUABI-SAME: [16 x ptr] [ ; SLEEFGNUABI-SAME: ptr @_ZGVnN2vl8_modf, ; SLEEFGNUABI-SAME: ptr @_ZGVsNxvl8_modf, @@ -97,7 +97,7 @@ define double @sin_f64(double %in) { ; AMDLIBM: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] ; MASSV: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] ; ACCELERATE: call double @sin(double %{{.*}}) -; LIBMVEC-X86: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] +; LIBMVEC: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] ; SLEEFGNUABI: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] ; SLEEFGNUABI_RISCV: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] ; ARMPL: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] @@ -155,7 +155,7 @@ define float @call_llvm.log10.f32(float %in) { ; COMMON-LABEL: @call_llvm.log10.f32( ; SVML: call float @llvm.log10.f32(float %{{.*}}) ; AMDLIBM: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]] -; LIBMVEC-X86: call float @llvm.log10.f32(float %{{.*}}) +; LIBMVEC: call float @llvm.log10.f32(float %{{.*}}) ; MASSV: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]] ; ACCELERATE: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]] ; SLEEFGNUABI: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]] @@ -164,7 +164,7 @@ define float @call_llvm.log10.f32(float %in) { ; No mapping of "llvm.log10.f32" to a vector function for SVML. 
; SVML-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}}) ; AMDLIBM-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}}) -; LIBMVEC-X86-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}}) +; LIBMVEC-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}}) %call = tail call float @llvm.log10.f32(float %in) ret float %call } @@ -193,8 +193,8 @@ declare float @llvm.log10.f32(float) #0 ; MASSV: declare <2 x double> @__sind2(<2 x double>) ; MASSV: declare <4 x float> @__log10f4(<4 x float>) -; LIBMVEC-X86: declare <2 x double> @_ZGVbN2v_sin(<2 x double>) -; LIBMVEC-X86: declare <4 x double> @_ZGVdN4v_sin(<4 x double>) +; LIBMVEC: declare <2 x double> @_ZGVbN2v_sin(<2 x double>) +; LIBMVEC: declare <4 x double> @_ZGVdN4v_sin(<4 x double>) ; ACCELERATE: declare <4 x float> @vlog10f(<4 x float>) @@ -266,9 +266,9 @@ attributes #0 = { nounwind readnone } ; ACCELERATE: attributes #[[LOG10]] = { "vector-function-abi-variant"= ; ACCELERATE-SAME: "_ZGV_LLVM_N4v_llvm.log10.f32(vlog10f)" } -; LIBMVEC-X86: attributes #[[SIN]] = { "vector-function-abi-variant"= -; LIBMVEC-X86-SAME: "_ZGV_LLVM_N2v_sin(_ZGVbN2v_sin), -; LIBMVEC-X86-SAME: _ZGV_LLVM_N4v_sin(_ZGVdN4v_sin)" } +; LIBMVEC: attributes #[[SIN]] = { "vector-function-abi-variant"= +; LIBMVEC-SAME: "_ZGV_LLVM_N2v_sin(_ZGVbN2v_sin), +; LIBMVEC-SAME: _ZGV_LLVM_N4v_sin(_ZGVdN4v_sin)" } ; SLEEFGNUABI: attributes #[[MODF]] = { "vector-function-abi-variant"= ; SLEEFGNUABI-SAME: "_ZGV_LLVM_N2vl8_modf(_ZGVnN2vl8_modf),