From e83c35dce31ed9d8c7fc81ba0115d2cbf702631e Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Thu, 26 Sep 2024 10:53:38 -0700
Subject: [PATCH] Add executorch parallel

Differential Revision: D62711909

Pull Request resolved: https://github.com/pytorch/ao/pull/953
---
Reviewer notes (text in this section is not part of the commit message):

* Formatting: line breaks and indentation were reconstructed from a
  whitespace-collapsed copy of this patch. Hunk line counts agree with the
  @@ headers and the diffstat, but the indentation of context lines is a
  best guess. If a hunk is rejected, retry with
  `git apply --ignore-whitespace`.
* Restored tokens: the targets of the two added `#include` directives and
  the template parameter list of `parallel_1d` were missing from that copy.
  They are restored below as
  <executorch/extension/threadpool/threadpool.h>,
  <torchao/experimental/ops/parallel-executorch-impl.h> and
  `template <typename F>`. TODO: confirm against pytorch/ao#953.
* kernels/cpu/aarch64/CMakeLists.txt: the new install() rule is added after
  the endif() of `if (CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")`. If the
  torchao_kernels_aarch64 target is only created inside that if() (not
  visible in this hunk), configuring on other processors would fail.
  Verify, and move the rule inside the guard if so.
* torchao/experimental/CMakeLists.txt (unchanged context line): the else()
  branch calls `message(TORCHAO_PARALLEL_BACKEND "...")`. The first argument
  is not a message mode keyword, so this prints a notice and continues
  instead of stopping. FATAL_ERROR was presumably intended. Follow-up.
* parallel-executorch-impl.h: `end - begin` is passed as the iteration count
  and the index arrives as size_t. Presumably callers guarantee
  end >= begin. Confirm, or guard against an empty/negative range.

 torchao/experimental/CMakeLists.txt           |  6 +++-
 torchao/experimental/Utils.cmake              |  8 ++++++
 torchao/experimental/build_torchao_ops.sh     |  4 +--
 .../kernels/cpu/aarch64/CMakeLists.txt        |  5 ++++
 .../experimental/ops/linear/CMakeLists.txt    |  5 ++++
 ...bit_activation_groupwise_lowbit_weight.cpp |  2 +-
 .../linear/linear_a8wxdq_op/CMakeLists.txt    |  6 ++--
 .../ops/parallel-executorch-impl.h            | 28 +++++++++++++++++++
 torchao/experimental/ops/parallel.h           |  3 +-
 9 files changed, 58 insertions(+), 9 deletions(-)
 create mode 100644 torchao/experimental/ops/parallel-executorch-impl.h

diff --git a/torchao/experimental/CMakeLists.txt b/torchao/experimental/CMakeLists.txt
index 198e9ebd44..db2054c3a8 100644
--- a/torchao/experimental/CMakeLists.txt
+++ b/torchao/experimental/CMakeLists.txt
@@ -24,11 +24,15 @@ if(NOT TORCHAO_INCLUDE_DIRS)
     set(TORCHAO_INCLUDE_DIRS ${TORCHAO_ROOT}/../..)
 endif()
 
+if (NOT TORCHAO_OP_TARGET)
+    message(FATAL_ERROR "TORCHAO_OP_TARGET is not set. Set it to ATEN or EXECUTORCH.")
+endif()
+
 if (NOT TORCHAO_PARALLEL_BACKEND)
     if (TORCHAO_OP_TARGET STREQUAL "ATEN")
         set(TORCHAO_PARALLEL_BACKEND "ATEN_OPENMP")
     elseif(TORCHAO_OP_TARGET STREQUAL "EXECUTORCH")
-        set(TORCHAO_PARALLEL_BACKEND "PTHREADPOOL")
+        set(TORCHAO_PARALLEL_BACKEND "EXECUTORCH")
     else()
         message(TORCHAO_PARALLEL_BACKEND "TORCHAO_PARALLEL_BACKEND is not set. Please set it directly or set TORCHAO_OP_TARGET to get a default.")
     endif()
diff --git a/torchao/experimental/Utils.cmake b/torchao/experimental/Utils.cmake
index 592f9366fc..d6e6254de7 100644
--- a/torchao/experimental/Utils.cmake
+++ b/torchao/experimental/Utils.cmake
@@ -23,6 +23,14 @@ function(target_link_torchao_parallel_backend target_name torchao_parallel_backe
         target_compile_definitions(${target_name} PRIVATE TORCHAO_PARALLEL_ATEN=1 AT_PARALLEL_OPENMP=1 INTRA_OP_PARALLEL=1)
         target_link_libraries(${target_name} PRIVATE ${TORCH_INSTALL_PREFIX}/lib/libomp${CMAKE_SHARED_LIBRARY_SUFFIX})
 
+    elseif(TORCHAO_PARALLEL_BACKEND_TOUPPER STREQUAL "EXECUTORCH")
+        message(STATUS "Building with TORCHAO_PARALLEL_BACKEND=TORCHAO_PARALLEL_EXECUTORCH")
+        message(STATUS "EXECUTORCH_INCLUDE_DIRS: ${EXECUTORCH_INCLUDE_DIRS}")
+        message(STATUS "EXECUTORCH_LIBRARIES: ${EXECUTORCH_LIBRARIES}")
+        target_include_directories(${target_name} PRIVATE "${EXECUTORCH_INCLUDE_DIRS}")
+        target_link_libraries(${target_name} PRIVATE "${EXECUTORCH_LIBRARIES}")
+        target_compile_definitions(${target_name} PRIVATE TORCHAO_PARALLEL_EXECUTORCH=1)
+
     elseif(TORCHAO_PARALLEL_BACKEND_TOUPPER STREQUAL "OPENMP")
         message(STATUS "Building with TORCHAO_PARALLEL_BACKEND=OPENMP. You must set the CMake variable OpenMP_ROOT to the OMP library location before compiling. Do not use this option if Torch was built with OPENMP; use ATEN_OPENMP instead.")
         find_package(OpenMP REQUIRED)
diff --git a/torchao/experimental/build_torchao_ops.sh b/torchao/experimental/build_torchao_ops.sh
index de6d8e17d8..2cb7201588 100644
--- a/torchao/experimental/build_torchao_ops.sh
+++ b/torchao/experimental/build_torchao_ops.sh
@@ -11,8 +11,8 @@ export CMAKE_OUT=/tmp/cmake-out/torchao
 cmake -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} \
     -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT} \
     -DTORCHAO_OP_TARGET="$1" \
-    -DEXECUTORCH_LIBRARIES=${EXECUTORCH_LIBRARIES} \
-    -DEXECUTORCH_INCLUDE_DIRS=${EXECUTORCH_INCLUDE_DIRS} \
+    -DEXECUTORCH_LIBRARIES="${EXECUTORCH_LIBRARIES}" \
+    -DEXECUTORCH_INCLUDE_DIRS="${EXECUTORCH_INCLUDE_DIRS}" \
     -S . \
     -B ${CMAKE_OUT}
 cmake --build ${CMAKE_OUT} --target install --config Release
diff --git a/torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt b/torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt
index ec497a1871..4f36945f8a 100644
--- a/torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt
+++ b/torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt
@@ -13,3 +13,8 @@ if (CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
     ${TORCHAO_INCLUDE_DIRS}/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp
   )
 endif()
+
+install(
+    TARGETS torchao_kernels_aarch64
+    DESTINATION lib
+)
diff --git a/torchao/experimental/ops/linear/CMakeLists.txt b/torchao/experimental/ops/linear/CMakeLists.txt
index 2f7b91bbf9..087dfeb383 100644
--- a/torchao/experimental/ops/linear/CMakeLists.txt
+++ b/torchao/experimental/ops/linear/CMakeLists.txt
@@ -10,3 +10,8 @@ include(${TORCHAO_ROOT}/Utils.cmake)
 
 add_library(torchao_ops_linear_${TORCHAO_PARALLEL_BACKEND} STATIC channelwise_8bit_activation_groupwise_lowbit_weight.cpp)
 target_link_torchao_parallel_backend(torchao_ops_linear_${TORCHAO_PARALLEL_BACKEND} "${TORCHAO_PARALLEL_BACKEND}")
+
+install(
+    TARGETS torchao_ops_linear_${TORCHAO_PARALLEL_BACKEND}
+    DESTINATION lib
+)
diff --git a/torchao/experimental/ops/linear/channelwise_8bit_activation_groupwise_lowbit_weight.cpp b/torchao/experimental/ops/linear/channelwise_8bit_activation_groupwise_lowbit_weight.cpp
index ae611d3ccc..02557b61fa 100644
--- a/torchao/experimental/ops/linear/channelwise_8bit_activation_groupwise_lowbit_weight.cpp
+++ b/torchao/experimental/ops/linear/channelwise_8bit_activation_groupwise_lowbit_weight.cpp
@@ -93,7 +93,7 @@ LinearTilingParams get_default_linear_tiling_params(
 
   LinearTilingParams tiling_params;
   auto num_threads = torchao::get_num_threads();
-  assert(num_threads >= 1);
+  TORCHAO_CHECK(num_threads >= 1, "num_threads must be >= 1");
 
   tiling_params.mc_by_mr = 1;
   int mc = tiling_params.mc_by_mr * ukernel_config.mr;
diff --git a/torchao/experimental/ops/linear/linear_a8wxdq_op/CMakeLists.txt b/torchao/experimental/ops/linear/linear_a8wxdq_op/CMakeLists.txt
index f69d884cd8..31a8320108 100644
--- a/torchao/experimental/ops/linear/linear_a8wxdq_op/CMakeLists.txt
+++ b/torchao/experimental/ops/linear/linear_a8wxdq_op/CMakeLists.txt
@@ -19,7 +19,7 @@ if(TORCHAO_OP_TARGET STREQUAL "ATEN")
     target_compile_definitions(linear_a8wxdq_${TORCHAO_OP_TARGET} PRIVATE USE_ATEN=1)
 elseif(TORCHAO_OP_TARGET STREQUAL "EXECUTORCH")
     message(STATUS "Building with TORCHAO_OP_TARGET=EXECUTORCH")
-    add_library(linear_a8wxdq_${TORCHAO_OP_TARGET} SHARED
+    add_library(linear_a8wxdq_${TORCHAO_OP_TARGET} STATIC
         linear_a8wxdq_executorch/w2s.cpp
         linear_a8wxdq_executorch/w2sz.cpp
         linear_a8wxdq_executorch/w3s.cpp
@@ -29,9 +29,9 @@ elseif(TORCHAO_OP_TARGET STREQUAL "EXECUTORCH")
         linear_a8wxdq_executorch/w5s.cpp
         linear_a8wxdq_executorch/w5sz.cpp
     )
-    target_include_directories(linear_a8wxdq_${TORCHAO_OP_TARGET} PRIVATE ${EXECUTORCH_INCLUDE_DIRS})
+    target_include_directories(linear_a8wxdq_${TORCHAO_OP_TARGET} PRIVATE "${EXECUTORCH_INCLUDE_DIRS}")
     target_compile_definitions(linear_a8wxdq_${TORCHAO_OP_TARGET} PRIVATE USE_EXECUTORCH=1)
-    target_link_libraries(linear_a8wxdq_${TORCHAO_OP_TARGET} PRIVATE ${EXECUTORCH_LIBRARIES})
+    target_link_libraries(linear_a8wxdq_${TORCHAO_OP_TARGET} PRIVATE "${EXECUTORCH_LIBRARIES}")
     target_link_libraries(linear_a8wxdq_${TORCHAO_OP_TARGET} PRIVATE torchao_kernels_aarch64)
     target_link_libraries(linear_a8wxdq_${TORCHAO_OP_TARGET} PRIVATE torchao_ops_linear_${TORCHAO_PARALLEL_BACKEND})
 else()
diff --git a/torchao/experimental/ops/parallel-executorch-impl.h b/torchao/experimental/ops/parallel-executorch-impl.h
new file mode 100644
index 0000000000..233f7250d4
--- /dev/null
+++ b/torchao/experimental/ops/parallel-executorch-impl.h
@@ -0,0 +1,28 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <executorch/extension/threadpool/threadpool.h>
+
+template <typename F>
+void torchao::parallel_1d(const int64_t begin, const int64_t end, const F& f) {
+  torch::executorch::threadpool::get_threadpool()->run(
+      [&](size_t i) {
+        int64_t idx = begin + i;
+        f(idx);
+      },
+      end - begin);
+}
+
+inline void torchao::set_num_threads(int num_threads) {
+  torch::executorch::threadpool::get_threadpool()->_unsafe_reset_threadpool(
+      num_threads);
+}
+
+inline int torchao::get_num_threads() {
+  return torch::executorch::threadpool::get_threadpool()->get_thread_count();
+}
diff --git a/torchao/experimental/ops/parallel.h b/torchao/experimental/ops/parallel.h
index e3949b8551..5372c5a2dd 100644
--- a/torchao/experimental/ops/parallel.h
+++ b/torchao/experimental/ops/parallel.h
@@ -34,8 +34,7 @@ int get_num_threads();
 #ifdef TORCHAO_PARALLEL_EXECUTORCH
 #pragma message( \
     "TORCHAO_PARALLEL_EXECUTORCH is set. Using ExecuTorch parallel backend.")
-
-#error "TORCHAO_PARALLEL_EXECUTORCH is not implemented yet"
+#include <torchao/experimental/ops/parallel-executorch-impl.h>
 
 #else
 #ifdef TORCHAO_PARALLEL_PTHREADPOOL