From f4e1fa94f6b8385092eaddc721d6dfe9799f9a68 Mon Sep 17 00:00:00 2001 From: Sameer Sheorey <41028320+ssheorey@users.noreply.github.com> Date: Wed, 18 Sep 2024 10:27:45 -0700 Subject: [PATCH 1/4] Add remote visualizer example. Allow changing rpc address in draw. (#6973) --- .../python/visualization/remote_visualizer.py | 71 +++++++++++++++++++ .../visualization/_external_visualizer.py | 7 ++ python/open3d/visualization/draw.py | 11 +-- 3 files changed, 85 insertions(+), 4 deletions(-) create mode 100644 examples/python/visualization/remote_visualizer.py diff --git a/examples/python/visualization/remote_visualizer.py b/examples/python/visualization/remote_visualizer.py new file mode 100644 index 00000000000..e360e55272b --- /dev/null +++ b/examples/python/visualization/remote_visualizer.py @@ -0,0 +1,71 @@ +# ---------------------------------------------------------------------------- +# - Open3D: www.open3d.org - +# ---------------------------------------------------------------------------- +# Copyright (c) 2018-2023 www.open3d.org +# SPDX-License-Identifier: MIT +# ---------------------------------------------------------------------------- +"""This example shows Open3D's remote visualization feature using RPC +communication. To run this example, start the client first by running + +python remote_visualizer.py client + +and then run the server by running + +python remote_visualizer.py server + +Port 51454 is used by default for communication. For remote visualization (client +and server running on different machines), use ssh to forward the remote server +port to your local computer: + + ssh -N -R 51454:localhost:51454 user@remote_host + +See documentation for more details (e.g. to use a different port). +""" +import sys +import numpy as np +import open3d as o3d +import open3d.visualization as vis + + +def make_point_cloud(npts, center, radius, colorize): + pts = np.random.uniform(-radius, radius, size=[npts, 3]) + center + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(pts) + if colorize: + colors = np.random.uniform(0.0, 1.0, size=[npts, 3]) + cloud.colors = o3d.utility.Vector3dVector(colors) + return cloud + + +def server_time_animation(): + orig = make_point_cloud(200, (0, 0, 0), 1.0, True) + clouds = [{"name": "t=0", "geometry": orig, "time": 0}] + drift_dir = (1.0, 0.0, 0.0) + expand = 1.0 + n = 20 + ev = o3d.visualization.ExternalVisualizer() + for i in range(1, n): + amount = float(i) / float(n - 1) + cloud = o3d.geometry.PointCloud() + pts = np.asarray(orig.points) + pts = pts * (1.0 + amount * expand) + [amount * v for v in drift_dir] + cloud.points = o3d.utility.Vector3dVector(pts) + cloud.colors = orig.colors + ev.set(obj=cloud, time=i, path=f"points at t={i}") + print('.', end='', flush=True) + print() + + +def client_time_animation(): + o3d.visualization.draw(title="Open3D - Remote Visualizer Client", + show_ui=True, + rpc_interface=True) + + +if __name__ == "__main__": + assert len(sys.argv) == 2 and sys.argv[1] in ('client', 'server'), ( + "Usage: python remote_visualizer.py [client|server]") + if sys.argv[1] == "client": + client_time_animation() + elif sys.argv[1] == "server": + server_time_animation() diff --git a/python/open3d/visualization/_external_visualizer.py b/python/open3d/visualization/_external_visualizer.py index ac7209127f0..774ddbaebe1 100644 --- a/python/open3d/visualization/_external_visualizer.py +++ b/python/open3d/visualization/_external_visualizer.py @@ -37,17 +37,23 @@ def set(self, obj=None, path='', time=0, layer='', connection=None): Example: To quickly send a single object just write:: + ev.set(point_cloud) To place the object at a specific location in the scene tree do:: + ev.set(point_cloud, path='group/mypoints', time=42, layer='') + Note that depending on the visualizer some arguments like time or layer may not be supported and will be ignored. To set multiple objects use a list to pass multiple objects:: + ev.set([point_cloud, mesh, camera]) + Each entry in the list can be a tuple specifying all or some of the location parameters:: + ev.set(objs=[(point_cloud,'group/mypoints', 1, 'layer1'), (mesh, 'group/mymesh'), camera @@ -147,6 +153,7 @@ def draw(self, geometry=None, *args, **kwargs): Example: Here we use draw with the default external visualizer:: + import open3d as o3d torus = o3d.geometry.TriangleMesh.create_torus() diff --git a/python/open3d/visualization/draw.py b/python/open3d/visualization/draw.py index f7764b41400..aa6e29e2b2c 100644 --- a/python/open3d/visualization/draw.py +++ b/python/open3d/visualization/draw.py @@ -83,9 +83,10 @@ def draw(geometry=None, animation_time_step (float): Duration in seconds for each animation frame. animation_duration (float): Total animation duration in seconds. - rpc_interface (bool): Start an RPC interface at http://localhost:51454 and - listen for drawing requests. The requests can be made with - :class:`open3d.visualization.ExternalVisualizer`. + rpc_interface (bool or str): Start an RPC interface at this local + address and listen for drawing requests. If rpc_interface is True, the + default address "tcp://localhost:51454" is used. The requests can be + made with :class:`open3d.visualization.ExternalVisualizer`. on_init (Callable): Extra initialization procedure for the underlying GUI window. The procedure receives a single argument of type :class:`open3d.visualization.O3DVisualizer`. @@ -202,7 +203,9 @@ def add(g, n): w.show_skybox(show_skybox) if rpc_interface: - w.start_rpc_interface(address="tcp://127.0.0.1:51454", timeout=10000) + if not isinstance(rpc_interface, str): + rpc_interface = "tcp://127.0.0.1:51454" + w.start_rpc_interface(address=rpc_interface, timeout=10000) def stop_rpc(): w.stop_rpc_interface() From e88c7b13270d961393ab1e64c03d1f7ccd047cce Mon Sep 17 00:00:00 2001 From: Sameer Sheorey <41028320+ssheorey@users.noreply.github.com> Date: Sun, 29 Sep 2024 23:07:07 -0700 Subject: [PATCH 2/4] Replace conda with pyenv to fix incorrect libstdc++ use in jammy CI. (#6966) * Replace conda with pyenv to fix incorrect libstdc++ use in jammy CI. * Use RPATH instead of RUNPATH to load libc++abi.so directly in Python. No need to find and load explicitly. * Use libc++11 to build in Ubuntu 22.04. Warn if newer version is used. * TODO: Solution for Ubuntu 24.04 --- 3rdparty/find_dependencies.cmake | 9 ++++++ cpp/open3d/core/nns/NanoFlannImpl.h | 25 ++++------------ cpp/pybind/CMakeLists.txt | 3 ++ cpp/tests/t/geometry/TensorMap.cpp | 7 +++++ docker/Dockerfile.ci | 44 +++++++++++++++++++---------- docker/docker_test.sh | 2 +- python/open3d/__init__.py | 8 ------ util/install_deps_ubuntu.sh | 11 ++++++-- 8 files changed, 63 insertions(+), 46 deletions(-) diff --git a/3rdparty/find_dependencies.cmake b/3rdparty/find_dependencies.cmake index 77b7085df69..d91377a3138 100644 --- a/3rdparty/find_dependencies.cmake +++ b/3rdparty/find_dependencies.cmake @@ -1338,6 +1338,7 @@ if(BUILD_GUI) if (CPP_LIBRARY AND CPPABI_LIBRARY) set(CLANG_LIBDIR ${llvm_lib_dir}) message(STATUS "CLANG_LIBDIR found in ubuntu-default: ${CLANG_LIBDIR}") + set(LIBCPP_VERSION ${llvm_ver}) break() endif() endforeach() @@ -1362,7 +1363,10 @@ if(BUILD_GUI) llvm-8/lib llvm-7/lib ) + file(REAL_PATH ${CPPABI_LIBRARY} CPPABI_LIBRARY) get_filename_component(CLANG_LIBDIR ${CPPABI_LIBRARY} DIRECTORY) + string(REGEX MATCH "llvm-([0-9]+)/lib" _ ${CLANG_LIBDIR}) + set(LIBCPP_VERSION ${CMAKE_MATCH_1}) endif() # Find clang libraries at the exact path ${CLANG_LIBDIR}. @@ -1378,6 +1382,11 @@ if(BUILD_GUI) target_link_libraries(3rdparty_filament INTERFACE -lstdc++ ${CPP_LIBRARY} ${CPPABI_LIBRARY}) message(STATUS "Filament C++ libraries: ${CPP_LIBRARY} ${CPPABI_LIBRARY}") + if (LIBCPP_VERSION GREATER 11) + message(WARNING "libc++ (LLVM) version ${LIBCPP_VERSION} > 11 includes libunwind that " + "interferes with the system libunwind.so.8 and may crash Python code when exceptions " + "are used. Please consider using libc++ (LLVM) v11.") + endif() endif() if (APPLE) find_library(CORE_VIDEO CoreVideo) diff --git a/cpp/open3d/core/nns/NanoFlannImpl.h b/cpp/open3d/core/nns/NanoFlannImpl.h index 027d6d0c4ee..9a9ceb100c4 100644 --- a/cpp/open3d/core/nns/NanoFlannImpl.h +++ b/cpp/open3d/core/nns/NanoFlannImpl.h @@ -118,13 +118,6 @@ void _KnnSearchCPU(NanoFlannIndexHolderBase *holder, return; } - auto points_equal = [](const T *const p1, const T *const p2, - size_t dimension) { - std::vector p1_vec(p1, p1 + dimension); - std::vector p2_vec(p2, p2 + dimension); - return p1_vec == p2_vec; - }; - std::vector> neighbors_indices(num_queries); std::vector> neighbors_distances(num_queries); std::vector neighbors_count(num_queries, 0); @@ -147,8 +140,9 @@ void _KnnSearchCPU(NanoFlannIndexHolderBase *holder, for (size_t valid_i = 0; valid_i < num_valid; ++valid_i) { TIndex idx = result_indices[valid_i]; if (ignore_query_point && - points_equal(&queries[i * dimension], - &points[idx * dimension], dimension)) { + std::equal(&queries[i * dimension], + &queries[i * dimension] + dimension, + &points[idx * dimension])) { continue; } neighbors_indices[i].push_back(idx); @@ -222,13 +216,6 @@ void _RadiusSearchCPU(NanoFlannIndexHolderBase *holder, return; } - auto points_equal = [](const T *const p1, const T *const p2, - size_t dimension) { - std::vector p1_vec(p1, p1 + dimension); - std::vector p2_vec(p2, p2 + dimension); - return p1_vec == p2_vec; - }; - std::vector> neighbors_indices(num_queries); std::vector> neighbors_distances(num_queries); std::vector neighbors_count(num_queries, 0); @@ -255,9 +242,9 @@ void _RadiusSearchCPU(NanoFlannIndexHolderBase *holder, int num_neighbors = 0; for (const auto &idx_dist : search_result) { if (ignore_query_point && - points_equal(&queries[i * dimension], - &points[idx_dist.first * dimension], - dimension)) { + std::equal(&queries[i * dimension], + &queries[i * dimension] + dimension, + &points[idx_dist.first * dimension])) { continue; } neighbors_indices[i].push_back(idx_dist.first); diff --git a/cpp/pybind/CMakeLists.txt b/cpp/pybind/CMakeLists.txt index e7a534a3eb3..c79bbd96719 100644 --- a/cpp/pybind/CMakeLists.txt +++ b/cpp/pybind/CMakeLists.txt @@ -80,6 +80,9 @@ set(PYTHON_COMPILED_MODULE_DIR if (APPLE) set_target_properties(pybind PROPERTIES BUILD_RPATH "@loader_path;@loader_path/..") elseif (UNIX) + # Use RPATH instead of RUNPATH in pybind so that needed libc++.so can find child dependant libc++abi.so in RPATH + # https://stackoverflow.com/questions/69662319/managing-secondary-dependencies-of-shared-libraries + target_link_options(pybind PRIVATE "LINKER:--disable-new-dtags") set_target_properties(pybind PROPERTIES BUILD_RPATH "$ORIGIN;$ORIGIN/..") endif() set_target_properties(pybind PROPERTIES diff --git a/cpp/tests/t/geometry/TensorMap.cpp b/cpp/tests/t/geometry/TensorMap.cpp index 372c6b82270..8512ce68ccb 100644 --- a/cpp/tests/t/geometry/TensorMap.cpp +++ b/cpp/tests/t/geometry/TensorMap.cpp @@ -32,6 +32,13 @@ TEST_P(TensorMapPermuteDevices, Constructor) { // Primary key is required. EXPECT_ANY_THROW(t::geometry::TensorMap()); + // Delete primary key. + EXPECT_ANY_THROW(tm0.Erase("positions")); + + // Reserved keys. + EXPECT_ANY_THROW(tm0.insert( + {"primary_key", core::Tensor::Zeros({2, 3}, dtype, device)})); + // Iterators. std::map tensor_map( {{"positions", core::Tensor::Zeros({10, 3}, dtype, device)}, diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index cd52a838882..73feb7bffd6 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -67,34 +67,48 @@ RUN if [ "${BUILD_SYCL_MODULE}" = "ON" ]; then \ rm -rf /etc/apt/sources.list.d/oneAPI.list; \ fi -# Dependencies: basic +# Dependencies: basic and python-build RUN apt-get update && apt-get install -y \ git \ wget \ curl \ build-essential \ pkg-config \ + zlib1g \ + zlib1g-dev \ + libssl-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + libxml2-dev \ + libxmlsec1-dev \ + libffi-dev \ + liblzma-dev \ && rm -rf /var/lib/apt/lists/* -# Miniconda or Intel conda -# The **/open3d/bin paths are used during docker run, in this way docker run +# pyenv or Intel Python +# The pyenv python paths are used during docker run, in this way docker run # does not need to activate the environment again. -ENV PATH="/root/miniconda3/bin:${PATH}" -ENV PATH="/root/miniconda3/envs/open3d/bin:${PATH}" +# The soft link from the python patch level version to the python mino version +# ensures python wheel commands (i.e. open3d) are in PATH, since we don't know +# which patch level pyenv will install (latest). +ENV PYENV_ROOT=/root/.pyenv +ENV PATH="$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PYENV_ROOT/versions/$PYTHON_VERSION/bin:$PATH" ENV PATH="/opt/intel/oneapi/intelpython/latest/bin:${PATH}" -ENV PATH="/opt/intel/oneapi/intelpython/latest/envs/open3d/bin:${PATH}" RUN if [ "${BUILD_SYCL_MODULE}" = "OFF" ]; then \ - wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh; \ - bash Miniconda3-latest-Linux-x86_64.sh -b; \ - rm Miniconda3-latest-Linux-x86_64.sh; \ + curl https://pyenv.run | bash \ + && pyenv update \ + && pyenv install $PYTHON_VERSION \ + && pyenv global $PYTHON_VERSION \ + && pyenv rehash \ + && ln -s $PYENV_ROOT/versions/${PYTHON_VERSION}* $PYENV_ROOT/versions/${PYTHON_VERSION}; \ fi -RUN conda --version \ - && conda create -y -n open3d python=${PYTHON_VERSION} +RUN python --version && pip --version -# Activate open3d virtualenv -# This works during docker build. It becomes the prefix of all RUN commands. -# Ref: https://stackoverflow.com/a/60148365/1255535 -SHELL ["conda", "run", "-n", "open3d", "/bin/bash", "-o", "pipefail", "-c"] +SHELL ["/bin/bash", "-o", "pipefail", "-c"] # Dependencies: cmake ENV PATH=${HOME}/${CMAKE_VERSION}/bin:${PATH} diff --git a/docker/docker_test.sh b/docker/docker_test.sh index 441b17ed4e3..d328d14535d 100755 --- a/docker/docker_test.sh +++ b/docker/docker_test.sh @@ -141,7 +141,7 @@ cpp_python_linking_uninstall_test() { # Python test echo "pytest is randomized, add --randomly-seed=SEED to repeat the test sequence." ${docker_run} -i --rm "${DOCKER_TAG}" /bin/bash -c " \ - python -m pytest python/test ${pytest_args} -s" + python -W default -m pytest python/test ${pytest_args} -s" restart_docker_daemon_if_on_gcloud # Command-line tools test diff --git a/python/open3d/__init__.py b/python/open3d/__init__.py index 141bf426c50..f7354b2a90c 100644 --- a/python/open3d/__init__.py +++ b/python/open3d/__init__.py @@ -45,14 +45,6 @@ def load_cdll(path): if sys.platform == "win32": # Unix: Use rpath to find libraries _win32_dll_dir = os.add_dll_directory(str(Path(__file__).parent)) -if _build_config["BUILD_GUI"] and not (find_library("c++abi") or - find_library("c++")): - try: # Preload libc++.so and libc++abi.so (required by filament) - load_cdll(str(next((Path(__file__).parent).glob("*c++abi.*")))) - load_cdll(str(next((Path(__file__).parent).glob("*c++.*")))) - except StopIteration: # Not found: check system paths while loading - pass - __DEVICE_API__ = "cpu" if _build_config["BUILD_CUDA_MODULE"]: # Load CPU pybind dll gracefully without introducing new python variable. diff --git a/util/install_deps_ubuntu.sh b/util/install_deps_ubuntu.sh index 3e359a1a6e7..2786b19c23b 100755 --- a/util/install_deps_ubuntu.sh +++ b/util/install_deps_ubuntu.sh @@ -39,17 +39,22 @@ eval $( echo DISTRIB_ID="$DISTRIB_ID"; echo DISTRIB_RELEASE="$DISTRIB_RELEASE" ) +# To avoid dependence on libunwind, we don't want to use clang / libc++ versions later than 11. +# Ubuntu 20.04's has versions 8, 10 or 12 while Ubuntu 22.04 has versions 11 and later. if [ "$DISTRIB_ID" == "Ubuntu" -a "$DISTRIB_RELEASE" == "20.04" ]; then - # Ubuntu 20.04's clang/libc++-dev/libc++abi-dev are version 8, 10 or 12. - # To avoid dependence on libunwind, we don't want to use versions later than 10. deps=("${deps[@]/clang/clang-10}") deps=("${deps[@]/libc++-dev/libc++-10-dev}") deps=("${deps[@]/libc++abi-dev/libc++abi-10-dev}") fi +if [ "$DISTRIB_ID" == "Ubuntu" -a "$DISTRIB_RELEASE" == "22.04" ]; then + deps=("${deps[@]/clang/clang-11}") + deps=("${deps[@]/libc++-dev/libc++-11-dev}") + deps=("${deps[@]/libc++abi-dev/libc++abi-11-dev}") +fi # Special case for ARM64 if [ "$(uname -m)" == "aarch64" ]; then - # For compling LAPACK in OpenBLAS + # For compiling LAPACK in OpenBLAS deps+=("gfortran") fi From dd0d35956e1f9744563c5e0631474a243c9d861d Mon Sep 17 00:00:00 2001 From: Sameer Sheorey <41028320+ssheorey@users.noreply.github.com> Date: Tue, 1 Oct 2024 13:31:47 -0700 Subject: [PATCH 3/4] jinja2 CVE fix (#6992) https://github.com/isl-org/Open3D/security/dependabot/6 --- cpp/pybind/t/geometry/boundingvolume.cpp | 48 ++++-------------------- docs/make_docs.py | 6 +-- docs/requirements.txt | 5 ++- 3 files changed, 14 insertions(+), 45 deletions(-) diff --git a/cpp/pybind/t/geometry/boundingvolume.cpp b/cpp/pybind/t/geometry/boundingvolume.cpp index 981b8f3dd46..8e4494d1b79 100644 --- a/cpp/pybind/t/geometry/boundingvolume.cpp +++ b/cpp/pybind/t/geometry/boundingvolume.cpp @@ -22,51 +22,19 @@ void pybind_boundingvolume_declarations(py::module& m) { std::shared_ptr, Geometry, DrawableGeometry> aabb(m, "AxisAlignedBoundingBox", - R"(A bounding box that is aligned along the coordinate axes -and defined by the min_bound and max_bound." -- (min_bound, max_bound): Lower and upper bounds of the bounding box for all -axes. - - Usage - - AxisAlignedBoundingBox::GetMinBound() - - AxisAlignedBoundingBox::SetMinBound(const core::Tensor &min_bound) - - AxisAlignedBoundingBox::GetMaxBound() - - AxisAlignedBoundingBox::SetMaxBound(const core::Tensor &max_bound) - - Value tensor must have shape {3,}. - - Value tensor must have the same data type and device. - - Value tensor can only be float32 (default) or float64. - - The device of the tensor determines the device of the box. + R"(A bounding box that is aligned along the coordinate axes and +has the properties: -- color: Color of the bounding box. - - Usage - - AxisAlignedBoundingBox::GetColor() - - AxisAlignedBoundingBox::SetColor(const core::Tensor &color) - - Value tensor must have shape {3,}. - - Value tensor can only be float32 (default) or float64. - - Value tensor can only be range [0.0, 1.0].)"); +- (``min_bound``, ``max_bound``): Lower and upper bounds of the bounding box for all axes. These are tensors with shape (3,) and a common data type and device. The data type can only be ``open3d.core.float32`` (default) or ``open3d.core.float64``. The device of the tensor determines the device of the box. +- ``color``: Color of the bounding box is a tensor with shape (3,) and a data type ``open3d.core.float32`` (default) or ``open3d.core.float64``. Values can only be in the range [0.0, 1.0].)"); py::class_, std::shared_ptr, Geometry, DrawableGeometry> obb(m, "OrientedBoundingBox", - R"(A bounding box oriented along an arbitrary frame of reference. -- (center, rotation, extent): The oriented bounding box is defined by its -center position, rotation maxtrix and extent. - - Usage - - OrientedBoundingBox::GetCenter() - - OrientedBoundingBox::SetCenter(const core::Tensor ¢er) - - OrientedBoundingBox::GetRotation() - - OrientedBoundingBox::SetRotation(const core::Tensor &rotation) - - Value tensor of center and extent must have shape {3,}. - - Value tensor of rotation must have shape {3, 3}. - - Value tensor must have the same data type and device. - - Value tensor can only be float32 (default) or float64. - - The device of the tensor determines the device of the box. + R"(A bounding box oriented along an arbitrary frame of reference +with the properties: -- color: Color of the bounding box. - - Usage - - OrientedBoundingBox::GetColor() - - OrientedBoundingBox::SetColor(const core::Tensor &color) - - Value tensor must have shape {3,}. - - Value tensor can only be float32 (default) or float64. - - Value tensor can only be range [0.0, 1.0].)"); +- (``center``, ``rotation``, ``extent``): The oriented bounding box is defined by its center position (shape (3,)), rotation maxtrix (shape (3,3)) and extent (shape (3,)). Each of these tensors must have the same data type and device. The data type can only be ``open3d.core.float32`` (default) or ``open3d.core.float64``. The device of the tensor determines the device of the box. +- ``color``: Color of the bounding box is a tensor with shape (3,) and a data type ``open3d.core.float32`` (default) or ``open3d.core.float64``. Values can only be in the range [0.0, 1.0].)"); } void pybind_boundingvolume_definitions(py::module& m) { auto aabb = static_cast Date: Thu, 3 Oct 2024 21:02:19 +0200 Subject: [PATCH 4/4] Faster CPU (Arg-)Reductions (#6989) * Avoid multithread access to shared array in a loop (CPU Reductions) * Add two pass CPU ArgReduction Engine to speed up argreductions with single outputs --- cpp/open3d/core/kernel/ReductionCPU.cpp | 66 +++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/cpp/open3d/core/kernel/ReductionCPU.cpp b/cpp/open3d/core/kernel/ReductionCPU.cpp index 7caa60b34f7..710f1aaea24 100644 --- a/cpp/open3d/core/kernel/ReductionCPU.cpp +++ b/cpp/open3d/core/kernel/ReductionCPU.cpp @@ -122,13 +122,14 @@ class CPUReductionEngine { for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { int64_t start = thread_idx * workload_per_thread; int64_t end = std::min(start + workload_per_thread, num_workloads); + scalar_t local_result = identity; for (int64_t workload_idx = start; workload_idx < end; ++workload_idx) { scalar_t* src = reinterpret_cast( indexer.GetInputPtr(0, workload_idx)); - thread_results[thread_idx] = - element_kernel(*src, thread_results[thread_idx]); + local_result = element_kernel(*src, local_result); } + thread_results[thread_idx] = local_result; } scalar_t* dst = reinterpret_cast(indexer.GetOutputPtr(0)); for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { @@ -190,14 +191,25 @@ class CPUArgReductionEngine { // elements. We need to keep track of the indices within each // sub-iteration. int64_t num_output_elements = indexer_.NumOutputElements(); + if (num_output_elements <= 1) { + LaunchArgReductionKernelTwoPass(indexer_, reduce_func, identity); + } else { + LaunchArgReductionParallelDim(indexer_, reduce_func, identity); + } + } + template + static void LaunchArgReductionParallelDim(const Indexer& indexer, + func_t reduce_func, + scalar_t identity) { + int64_t num_output_elements = indexer.NumOutputElements(); #pragma omp parallel for schedule(static) \ num_threads(utility::EstimateMaxThreads()) for (int64_t output_idx = 0; output_idx < num_output_elements; output_idx++) { // sub_indexer.NumWorkloads() == ipo. - // sub_indexer's workload_idx is indexer_'s ipo_idx. - Indexer sub_indexer = indexer_.GetPerOutputIndexer(output_idx); + // sub_indexer's workload_idx is indexer's ipo_idx. + Indexer sub_indexer = indexer.GetPerOutputIndexer(output_idx); scalar_t dst_val = identity; for (int64_t workload_idx = 0; workload_idx < sub_indexer.NumWorkloads(); workload_idx++) { @@ -212,6 +224,52 @@ class CPUArgReductionEngine { } } + /// Create num_threads workers to compute partial arg reductions + /// and then reduce to the final results. + /// This only applies to arg reduction op with one output. + template + static void LaunchArgReductionKernelTwoPass(const Indexer& indexer, + func_t reduce_func, + scalar_t identity) { + if (indexer.NumOutputElements() > 1) { + utility::LogError( + "Internal error: two-pass arg reduction only works for " + "single-output arg reduction ops."); + } + int64_t num_workloads = indexer.NumWorkloads(); + int64_t num_threads = utility::EstimateMaxThreads(); + int64_t workload_per_thread = + (num_workloads + num_threads - 1) / num_threads; + std::vector thread_results_idx(num_threads, 0); + std::vector thread_results_val(num_threads, identity); + +#pragma omp parallel for schedule(static) \ + num_threads(utility::EstimateMaxThreads()) + for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { + int64_t start = thread_idx * workload_per_thread; + int64_t end = std::min(start + workload_per_thread, num_workloads); + scalar_t local_result_val = identity; + int64_t local_result_idx = 0; + for (int64_t workload_idx = start; workload_idx < end; + ++workload_idx) { + int64_t src_idx = workload_idx; + scalar_t* src_val = reinterpret_cast( + indexer.GetInputPtr(0, workload_idx)); + std::tie(local_result_idx, local_result_val) = reduce_func( + src_idx, *src_val, local_result_idx, local_result_val); + } + thread_results_val[thread_idx] = local_result_val; + thread_results_idx[thread_idx] = local_result_idx; + } + scalar_t dst_val = identity; + int64_t* dst_idx = reinterpret_cast(indexer.GetOutputPtr(0)); + for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { + std::tie(*dst_idx, dst_val) = reduce_func( + thread_results_idx[thread_idx], + thread_results_val[thread_idx], *dst_idx, dst_val); + } + } + private: Indexer indexer_; };