Extend CUB reduce benchmarks #3401

Merged · 1 commit · Jan 16, 2025
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/base.cuh
@@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
});
}

-NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types))
+NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
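
The change above makes base.cuh consume a value_types list defined by the including benchmark instead of the global all_types list, so each benchmark can pick the types relevant to its operator. A minimal sketch of the resulting pattern (a hypothetical benchmark file; the real ones follow below):

// Hypothetical benchmark TU: define the type axis and operator, then include base.cuh.
#include <nvbench_helper.cuh>

using value_types = fundamental_types; // consumed by NVBENCH_TYPE_AXES in base.cuh
using op_t        = max_t;             // reduction operator instantiated by base.cuh
#include "base.cuh"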
cub/benchmarks/bench/reduce/max.cu
@@ -25,11 +25,18 @@
*
******************************************************************************/

+// This benchmark uses a custom reduction operation, max_t, which is not known to CUB, so no operator-specific
+// optimizations (e.g. using redux or DPX instructions) are performed. This benchmark covers the unoptimized code path.
+
+// Because CUB cannot detect this operator, we cannot add any tunings based on the results of this benchmark. Its main
+// use is to detect regressions.

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

-using op_t = max_t;
+using value_types = all_types;
+using op_t = max_t;
#include "base.cuh"
17 changes: 13 additions & 4 deletions cub/benchmarks/bench/reduce/min.cu
@@ -24,14 +24,23 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
-// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures.
-// It specifically uses cuda::minimum<> instead of a user-defined operator.
-#define TUNE_T int16_t

+// This benchmark is intended to cover DPX instructions on Hopper+ architectures. It specifically uses cuda::minimum<>
+// instead of a user-defined operator, which CUB recognizes to select an optimized code path.
+
+// Tuning parameters found for ::cuda::minimum<> apply equally to ::cuda::maximum<>.
+// Tuning parameters found for signed integer types apply equally to unsigned integer types.
+// TODO(bgruber): do tuning parameters found for int16_t apply equally to __half or __nv_bfloat16 on SM90+?

#include <cuda/functional>

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

-using op_t = ::cuda::minimum<>;
+// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them.
+using value_types = fundamental_types;
+using op_t = ::cuda::minimum<>;
#include "base.cuh"
9 changes: 8 additions & 1 deletion cub/benchmarks/bench/reduce/sum.cu
@@ -25,11 +25,18 @@
*
******************************************************************************/

+// This benchmark is intended to cover redux instructions on Ampere+ architectures. It specifically uses
+// cuda::std::plus<> instead of a user-defined operator, which CUB recognizes to select an optimized code path.
+
+// Tuning parameters found for signed integer types apply equally to unsigned integer types.

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

-using op_t = ::cuda::std::plus<>;
+// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them.
+using value_types = all_types;
+using op_t = ::cuda::std::plus<>;
#include "base.cuh"
nvbench_helper.cuh
@@ -52,20 +52,20 @@ struct nvbench::type_strings<::cuda::std::integral_constant<T, I>>
namespace detail
{

-template <class T, class List>
+template <class List, class... Ts>
struct push_back
{};

-template <class T, class... As>
-struct push_back<T, nvbench::type_list<As...>>
+template <class... As, class... Ts>
+struct push_back<nvbench::type_list<As...>, Ts...>
{
-using type = nvbench::type_list<As..., T>;
+using type = nvbench::type_list<As..., Ts...>;
};

} // namespace detail

-template <class T, class List>
-using push_back_t = typename detail::push_back<T, List>::type;
+template <class List, class... Ts>
+using push_back_t = typename detail::push_back<List, Ts...>::type;

#ifdef TUNE_OffsetT
using offset_types = nvbench::type_list<TUNE_OffsetT>;
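
The refactored push_back now takes the list first and accepts a parameter pack, so several types can be appended in one instantiation. A quick usage sketch (type names chosen for illustration):

#include <cstdint>

#include <cuda/std/type_traits>

#include <nvbench_helper.cuh> // provides nvbench::type_list and push_back_t

using base  = nvbench::type_list<int8_t, int16_t>;
using wider = push_back_t<base, int32_t, int64_t>; // append two types at once
static_assert(::cuda::std::is_same_v<wider, nvbench::type_list<int8_t, int16_t, int32_t, int64_t>>, "");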