From 1c07320bc6e0d90b3585ab89c5db807042c47961 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 15 Jan 2025 16:39:51 +0100 Subject: [PATCH] Extend CUB reduce benchmarks * Rename max.cu to custom.cu, since it uses a custom operator * Extend types covered my min.cu to all fundamental types * Add some notes on how to collect tuning parameters Fixes: #3283 --- cub/benchmarks/bench/reduce/base.cuh | 2 +- .../bench/reduce/{max.cu => custom.cu} | 9 ++++++++- cub/benchmarks/bench/reduce/min.cu | 17 +++++++++++++---- cub/benchmarks/bench/reduce/sum.cu | 9 ++++++++- .../nvbench_helper/nvbench_helper.cuh | 12 ++++++------ 5 files changed, 36 insertions(+), 13 deletions(-) rename cub/benchmarks/bench/reduce/{max.cu => custom.cu} (81%) diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh index 9de575d0686..579d3757d3c 100644 --- a/cub/benchmarks/bench/reduce/base.cuh +++ b/cub/benchmarks/bench/reduce/base.cuh @@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list) }); } -NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types)) +NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); diff --git a/cub/benchmarks/bench/reduce/max.cu b/cub/benchmarks/bench/reduce/custom.cu similarity index 81% rename from cub/benchmarks/bench/reduce/max.cu rename to cub/benchmarks/bench/reduce/custom.cu index 791d5bfe167..0203ef60b8c 100644 --- a/cub/benchmarks/bench/reduce/max.cu +++ b/cub/benchmarks/bench/reduce/custom.cu @@ -25,11 +25,18 @@ * ******************************************************************************/ +// This benchmark uses a custom reduction operation, max_t, which is not known to CUB, so no operator specific +// optimizations (e.g. using redux or DPX instructions) are performed. This benchmark covers the unoptimized code path. + +// Because CUB cannot detect this operator, we cannot add any tunings based on the results of this benchmark. Its main +// use is to detect regressions. + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = max_t; +using value_types = all_types; +using op_t = max_t; #include "base.cuh" diff --git a/cub/benchmarks/bench/reduce/min.cu b/cub/benchmarks/bench/reduce/min.cu index 177d7628f6f..50b175f4ca8 100644 --- a/cub/benchmarks/bench/reduce/min.cu +++ b/cub/benchmarks/bench/reduce/min.cu @@ -24,14 +24,23 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ -// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures. -// It specifically uses cuda::minimum<> instead of a user-defined operator. -#define TUNE_T int16_t + +// This benchmark is intended to cover DPX instructions on Hopper+ architectures. It specifically uses cuda::minimum<> +// instead of a user-defined operator, which CUB recognizes to select an optimized code path. + +// Tuning parameters found for ::cuda::minimum<> apply equally for ::cuda::maximum<> +// Tuning parameters found for signed integer types apply equally for unsigned integer types +// TODO(bgruber): do tuning parameters found for int16_t apply equally for __half or __nv_bfloat16 on SM90+? + +#include + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = ::cuda::minimum<>; +// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them. +using value_types = fundamental_types; +using op_t = ::cuda::minimum<>; #include "base.cuh" diff --git a/cub/benchmarks/bench/reduce/sum.cu b/cub/benchmarks/bench/reduce/sum.cu index 4433724f090..ab65d7fe847 100644 --- a/cub/benchmarks/bench/reduce/sum.cu +++ b/cub/benchmarks/bench/reduce/sum.cu @@ -25,11 +25,18 @@ * ******************************************************************************/ +// This benchmark is intended to cover redux instructions on Ampere+ architectures. It specifically uses +// cuda::std::plus<> instead of a user-defined operator, which CUB recognizes to select an optimized code path. + +// Tuning parameters found for signed integer types apply equally for unsigned integer types + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = ::cuda::std::plus<>; +// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them. +using value_types = all_types; +using op_t = ::cuda::std::plus<>; #include "base.cuh" diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index 451d289f6c2..8324650d044 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -52,20 +52,20 @@ struct nvbench::type_strings<::cuda::std::integral_constant> namespace detail { -template +template struct push_back {}; -template -struct push_back> +template +struct push_back, Ts...> { - using type = nvbench::type_list; + using type = nvbench::type_list; }; } // namespace detail -template -using push_back_t = typename detail::push_back::type; +template +using push_back_t = typename detail::push_back::type; #ifdef TUNE_OffsetT using offset_types = nvbench::type_list;