diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh
index 6b8f851d772..d37c65aa8ec 100644
--- a/.ci/scripts/build_llama_android.sh
+++ b/.ci/scripts/build_llama_android.sh
@@ -10,6 +10,12 @@ set -exu
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+which "${PYTHON_EXECUTABLE}"
+CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
+
 install_executorch_and_backend_lib() {
   echo "Installing executorch and xnnpack backend"
   clean_executorch_install_folders
@@ -22,6 +28,7 @@ install_executorch_and_backend_lib() {
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -47,6 +54,7 @@ build_llama_runner() {
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -Bcmake-android-out/examples/models/llama examples/models/llama
 
   cmake --build cmake-android-out/examples/models/llama -j4 --config Release
diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index 9bb881ce8eb..9735e26798d 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -154,6 +154,7 @@ cmake_install_executorch_libraries() {
   rm -rf cmake-out
   retry cmake \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DCMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" \
     -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index b4fbc4486a2..ef4859135c6 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -50,10 +50,12 @@ prepare_artifacts_upload() {
 
 build_cmake_executor_runner() {
   echo "Building executor_runner"
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   rm -rf ${CMAKE_OUTPUT_DIR}
   cmake -DCMAKE_BUILD_TYPE=Debug \
       -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+      -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
       -B${CMAKE_OUTPUT_DIR} .
 
   cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
@@ -98,8 +100,7 @@ test_model() {
 
 build_cmake_xnn_executor_runner() {
   echo "Building xnn_executor_runner"
-  SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-  CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
 
   (rm -rf ${CMAKE_OUTPUT_DIR} \
     && mkdir ${CMAKE_OUTPUT_DIR} \
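
Note: the `import torch as _; print(_.__path__[0])` one-liner resolves the installed torch package directory (for example `.../site-packages/torch`), which ships `share/cmake/Torch/TorchConfig.cmake`; passing that directory as CMAKE_PREFIX_PATH is what lets the `find_package(Torch CONFIG)` calls introduced below succeed. One caveat: these scripts default and validate `${PYTHON_EXECUTABLE}`, but the command substitutions still hard-code `python3`, so the two can disagree when a caller points PYTHON_EXECUTABLE at a different interpreter.
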
diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh
index 40767013e23..64dd6b829d8 100644
--- a/.ci/scripts/test_phi_3_mini.sh
+++ b/.ci/scripts/test_phi_3_mini.sh
@@ -22,8 +22,10 @@ NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
 cmake_install_executorch_libraries() {
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   cmake -DPYTHON_EXECUTABLE=python \
         -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+        -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
         -DEXECUTORCH_ENABLE_LOGGING=1 \
         -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
@@ -39,8 +41,10 @@ cmake_install_executorch_libraries() {
 }
 
 cmake_build_phi_3_mini() {
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
         -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+        -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
         -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
         -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
         -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index be684b7bfa2..c21d0bb604e 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -136,6 +136,7 @@ cmake_install_executorch_lib() {
   clean_executorch_install_folders
   retry cmake -DBUCK2="$BUCK" \
           -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_PREFIX_PATH="$($PYTHON_EXECUTABLE -c 'import torch as _; print(_.__path__[0])')" \
          -DCMAKE_BUILD_TYPE=Release \
           -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
           -Bcmake-out .
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index c759ae420e4..4c7d5ef2336 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -147,6 +147,8 @@ jobs:
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
 
+        source .ci/scripts/utils.sh
+        install_executorch "use-pt-pinned-commit"
         BUILD_TOOL="cmake"
         PYTHON_EXECUTABLE=python \
         bash .ci/scripts/build_llama_android.sh "${BUILD_TOOL}"
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 325dc6ff933..0f3b8d7494a 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -394,6 +394,7 @@ jobs:
           rm -rf cmake-out
           cmake \
             -DCMAKE_INSTALL_PREFIX=cmake-out \
+            -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \
             -DCMAKE_BUILD_TYPE=Release \
             -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
             -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
@@ -411,6 +412,7 @@ jobs:
           cmake \
             -DCMAKE_INSTALL_PREFIX=cmake-out \
             -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \
             -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
             -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
             -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
diff --git a/build/Utils.cmake b/build/Utils.cmake
index 47bc46b18d3..a2f2b61225f 100644
--- a/build/Utils.cmake
+++ b/build/Utils.cmake
@@ -321,3 +321,20 @@ function(resolve_python_executable)
     )
   endif()
 endfunction()
+
+# find_package(Torch CONFIG REQUIRED) replacement for targets that
+# have a header-only Torch dependency. Because find_package sets
+# variables in the parent scope, we use a macro to preserve this
+# rather than maintaining our own list of those variables.
+macro(find_package_torch_headers)
+  # We cannot simply use CMAKE_FIND_ROOT_PATH_BOTH, because that does
+  # not propagate into TorchConfig.cmake.
+  foreach(mode_kind IN ITEMS PACKAGE LIBRARY INCLUDE)
+    set(OLD_CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} ${CMAKE_FIND_ROOT_PATH_MODE_${mode_kind}})
+    set(CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} BOTH)
+  endforeach()
+  find_package(Torch CONFIG REQUIRED)
+  foreach(mode_kind IN ITEMS PACKAGE LIBRARY INCLUDE)
+    set(CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} ${OLD_CMAKE_FIND_ROOT_PATH_MODE_${mode_kind}})
+  endforeach()
+endmacro()
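
Note: find_package_torch_headers must be a macro rather than a function because CMake functions introduce a new variable scope. find_package(Torch) defines variables such as TORCH_INCLUDE_DIRS and TORCH_LIBRARIES in the current scope, and a macro body is expanded directly in the caller, so those definitions stay visible to the CMakeLists.txt that invoked it. The save/restore of CMAKE_FIND_ROOT_PATH_MODE_{PACKAGE,LIBRARY,INCLUDE} matters for cross-compiled builds (e.g. with the Android NDK toolchain file), which set those modes to ONLY and would otherwise hide the host-installed torch package from find_package.
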
diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh
index f8ded210996..b72968037c1 100644
--- a/build/build_android_llm_demo.sh
+++ b/build/build_android_llm_demo.sh
@@ -7,6 +7,12 @@
 
 set -ex
 
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+which "${PYTHON_EXECUTABLE}"
+CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
+
 build_jar() {
   pushd extension/android
   ./gradlew build
@@ -36,6 +42,7 @@ build_android_native_library() {
   fi
 
   cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DANDROID_PLATFORM=android-26 \
@@ -69,6 +76,7 @@ build_android_native_library() {
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DANDROID_PLATFORM=android-26 \
     -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_LOG_LEVEL=Info \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
index 99e388095f6..3aecc0e1b33 100644
--- a/kernels/optimized/CMakeLists.txt
+++ b/kernels/optimized/CMakeLists.txt
@@ -61,6 +61,8 @@ message("Generated files ${gen_command_sources}")
 
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
+find_package_torch_headers()
+target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
 target_link_libraries(
   optimized_kernels PRIVATE executorch_core cpublas extension_threadpool
 )
diff --git a/kernels/optimized/cpu/op_gelu.cpp b/kernels/optimized/cpu/op_gelu.cpp
index 88591323397..dcb6bbc4279 100644
--- a/kernels/optimized/cpu/op_gelu.cpp
+++ b/kernels/optimized/cpu/op_gelu.cpp
@@ -13,6 +13,7 @@
 
 #include <cmath>
 
+#include <ATen/native/cpu/Gelu.h>
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
@@ -47,48 +48,26 @@ void gelu(
   CTYPE* out_data = output.mutable_data_ptr<CTYPE>();
   size_t lim = input.numel();
 
-  // TODO: Add fast path for tanh using sleef's tanh
   if (approximate == "tanh") {
-    // 0.5 * x * (1 + Tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))
-    for (size_t i = 0; i < lim; ++i) {
-      const CTYPE x = in_data[i];
-      const CTYPE kBeta = M_SQRT2 * M_2_SQRTPI * 0.5;
-      const CTYPE kKappa = 0.044715;
-      auto x_cube = x * x * x;
-      auto inner = kBeta * (x + kKappa * x_cube);
-      out_data[i] = CTYPE(0.5) * x * (CTYPE(1) + std::tanh(inner));
+    using Vec = at::vec::Vectorized<CTYPE>;
+    int i = 0;
+    for (; i < lim - (lim % Vec::size()); i += Vec::size()) {
+      Vec x = Vec::loadu(in_data + i);
+      at::native::vectorized_gelu_approximated_with_tanh(x).store(out_data + i);
     }
-  } else if (approximate == "none") { // dont appx
-    // GELU(x) = x * Φ(x) where Φ(x) is the is the Cumulative Distribution
-    // Function for Gaussian Distribution.
-
-#ifndef __aarch64__
-    for (size_t i = 0; i < lim; ++i) {
-      const CTYPE x = in_data[i];
-      out_data[i] = CTYPE(0.5) * x * (CTYPE(1) + std::erf(x * M_SQRT1_2));
+    for (; i < lim; ++i) {
+      out_data[i] = at::native::scalar_gelu_approximated_with_tanh(in_data[i]);
     }
-#else
-    size_t i = 0;
-    if constexpr (std::is_same_v<CTYPE, float>) {
-      for (; i + 4 < lim; i += 4) {
-        const float32x4_t in =
-            vld1q_f32(static_cast<const float*>(&in_data[i]));
-        const float32x4_t m_sqrt1_2x4 = {
-            M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2};
-        const float32x4_t ones = vmovq_n_f32(1.0);
-        const float32x4_t halves = vmovq_n_f32(0.5);
-        float32x4_t out = Sleef_erff4_u10(vmulq_f32(in, m_sqrt1_2x4));
-        vst1q_f32(
-            static_cast<float*>(&out_data[i]),
-            vmulq_f32(vmulq_f32(vaddq_f32(out, ones), in), halves));
-      }
+  } else if (approximate == "none") {
+    using Vec = at::vec::Vectorized<CTYPE>;
+    int i = 0;
+    for (; i < lim - (lim % Vec::size()); i += Vec::size()) {
+      Vec x = Vec::loadu(in_data + i);
+      at::native::vectorized_gelu(x).store(out_data + i);
     }
     for (; i < lim; ++i) {
-      const CTYPE x = in_data[i];
-      out_data[i] = CTYPE(0.5) * x * (CTYPE(1) + std::erf(x * M_SQRT1_2));
+      out_data[i] = at::native::scalar_gelu(in_data[i]);
     }
-#endif // __aarch64__
-
   } else {
     ET_KERNEL_CHECK_MSG(
         context,
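
The rewritten kernel defers the math to the GELU helpers shipped in ATen and keeps only the loop structure here: full Vectorized<CTYPE> lanes in the main loop, then a scalar loop for the lim % Vec::size() tail. A minimal standalone sketch of that pattern, assuming ATen's header-only vec library is on the include path; gelu_exact and its signature are illustrative, not part of this change:

    #include <ATen/cpu/vec/vec.h>

    #include <cmath>
    #include <cstddef>

    // Exact GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))) over a float buffer:
    // process full SIMD lanes first, then a scalar loop for the remainder.
    void gelu_exact(const float* in, float* out, std::size_t n) {
      using Vec = at::vec::Vectorized<float>;
      constexpr float kSqrt1_2 = 0.70710678118654752440f; // 1 / sqrt(2)
      std::size_t i = 0;
      for (; i + Vec::size() <= n; i += Vec::size()) {
        Vec x = Vec::loadu(in + i);
        Vec y = Vec(0.5f) * x * (Vec(1.0f) + (x * Vec(kSqrt1_2)).erf());
        y.store(out + i);
      }
      for (; i < n; ++i) {
        out[i] = 0.5f * in[i] * (1.0f + std::erf(in[i] * kSqrt1_2));
      }
    }

Vectorized<float> picks the lane width per build (e.g. 8 floats for AVX2, 4 for NEON), and erf/tanh come from at::vec's own implementations, which is what lets this change drop the hand-rolled aarch64 NEON path and the direct Sleef_erff4_u10 call.
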
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
index d97e1eb5122..d8d6cfdec71 100644
--- a/kernels/optimized/cpu/targets.bzl
+++ b/kernels/optimized/cpu/targets.bzl
@@ -28,13 +28,9 @@ _OPTIMIZED_ATEN_OPS = (
     op_target(name = "op_sigmoid"),
     op_target(
         name = "op_gelu",
-        deps = select({
-            "DEFAULT": [],
-            "ovr_config//cpu:arm64": [
-                "fbsource//third-party/sleef:sleef_arm",
-            ],
-        }) + [
+        deps = [
             "//executorch/kernels/portable/cpu/util:activation_ops_util",
+            "//executorch/runtime/core/portable_type/c10:aten_headers_for_executorch",
         ],
     ),
     op_target(
@@ -96,6 +92,13 @@ _OPTIMIZED_ATEN_OPS = (
     ),
 )
 
+
+def get_sleef_preprocessor_flags():
+    if runtime.is_oss:
+        return []
+    return ["-DAT_BUILD_ARM_VEC256_WITH_SLEEF"]
+
+
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
 
diff --git a/kernels/optimized/optimized-oss.yaml b/kernels/optimized/optimized-oss.yaml
index 52262e2dd53..28f1d595272 100644
--- a/kernels/optimized/optimized-oss.yaml
+++ b/kernels/optimized/optimized-oss.yaml
@@ -1,8 +1,8 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 # This yaml file contains operators that have optimized kernels available.
-# Note that this is a copy of optimized.yaml that does not include gelu and
-# log_softmax, due to the OSS build not currently including sleef.
+# Note that this is a copy of optimized.yaml that does not include log_softmax,
+# due to the OSS build not currently including sleef.
 # TODO (T183193812)
 
 - op: add.out
@@ -40,6 +40,11 @@
   - arg_meta: null
     kernel_name: torch::executor::opt_sigmoid_out
 
+- op: gelu.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_gelu_out
+
 - op: le.Scalar_out
   kernels:
     - arg_meta: null
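
Note: with the kernel expressed in terms of at::vec, op_gelu only needs ATen headers (hence the new aten_headers_for_executorch dep in place of the arm64-only sleef select), so it can now be registered for OSS builds above as torch::executor::opt_gelu_out. get_sleef_preprocessor_flags() keeps -DAT_BUILD_ARM_VEC256_WITH_SLEEF out of OSS builds; that define controls whether ATen's ARM Vectorized math functions dispatch to Sleef, and without it they fall back to per-lane scalar implementations — slower, but with no extra third-party dependency.
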
diff --git a/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl
index 37a68abaa07..c079b97f634 100644
--- a/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl
+++ b/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl
@@ -134,5 +134,5 @@ def define_op_target(name, deps):
 
 def is_op_disabled(name):
     # TODO (gjcomer) Enable ops with sleef dependency in OSS
-    disabled_ops = ["op_gelu", "op_log_softmax"]
+    disabled_ops = ["op_log_softmax"]
     return name in disabled_ops
diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh
index 2c8685ea5b7..f8d0a361733 100755
--- a/test/run_oss_cpp_tests.sh
+++ b/test/run_oss_cpp_tests.sh
@@ -22,13 +22,20 @@ elif [[ $(uname) == "Linux" ]]; then
   export LLVM_COV="${LLVM_COV:-llvm-cov}"
 fi
 
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+which "${PYTHON_EXECUTABLE}"
+
 build_executorch() {
   BUILD_VULKAN="OFF"
   if [ -x "$(command -v glslc)" ]; then
     BUILD_VULKAN="ON"
   fi
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   cmake . \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DEXECUTORCH_USE_CPP_CODE_COVERAGE=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \