Extend CUB reduce benchmarks #3401

Merged · 1 commit · Jan 16, 2025
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/base.cuh
@@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
});
}

-NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types))
+NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
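
The change above makes base.cuh consume a value_types list defined by the including benchmark instead of the global all_types list, so each benchmark can pick the types relevant to its operator. A minimal sketch of the resulting pattern (a hypothetical benchmark file; the real ones follow below):

// Hypothetical benchmark TU: define the type axis and operator, then include base.cuh.
#include <nvbench_helper.cuh>

using value_types = fundamental_types; // consumed by NVBENCH_TYPE_AXES in base.cuh
using op_t        = max_t;             // reduction operator instantiated by base.cuh
#include "base.cuh"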
cub/benchmarks/bench/reduce/max.cu
@@ -25,11 +25,18 @@
*
******************************************************************************/

+// This benchmark uses a custom reduction operation, max_t, which is not known to CUB, so no operator-specific
+// optimizations (e.g. using redux or DPX instructions) are performed. This benchmark covers the unoptimized code path.
+
+// Because CUB cannot detect this operator, we cannot add any tunings based on the results of this benchmark. Its main
+// use is to detect regressions.

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

-using op_t = max_t;
+using value_types = all_types;
+using op_t = max_t;
#include "base.cuh"
17 changes: 13 additions & 4 deletions cub/benchmarks/bench/reduce/min.cu
@@ -24,14 +24,23 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
-// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures.
-// It specifically uses cuda::minimum<> instead of a user-defined operator.
-#define TUNE_T int16_t

+// This benchmark is intended to cover DPX instructions on Hopper+ architectures. It specifically uses cuda::minimum<>
+// instead of a user-defined operator, which CUB recognizes to select an optimized code path.
+
+// Tuning parameters found for ::cuda::minimum<> apply equally to ::cuda::maximum<>.
+// Tuning parameters found for signed integer types apply equally to unsigned integer types.
+// TODO(bgruber): do tuning parameters found for int16_t apply equally to __half or __nv_bfloat16 on SM90+?

#include <cuda/functional>

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

-using op_t = ::cuda::minimum<>;
+// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them.
+using value_types = fundamental_types;
+using op_t = ::cuda::minimum<>;
#include "base.cuh"
9 changes: 8 additions & 1 deletion cub/benchmarks/bench/reduce/sum.cu
@@ -25,11 +25,18 @@
*
******************************************************************************/

+// This benchmark is intended to cover redux instructions on Ampere+ architectures. It specifically uses
+// cuda::std::plus<> instead of a user-defined operator, which CUB recognizes to select an optimized code path.
+
+// Tuning parameters found for signed integer types apply equally to unsigned integer types.

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

-using op_t = ::cuda::std::plus<>;
+// TODO(bgruber): let's add __half and __nv_bfloat16 eventually when they compile, since we have fast paths for them.
+using value_types = all_types;
+using op_t = ::cuda::std::plus<>;
#include "base.cuh"
nvbench_helper.cuh
@@ -52,20 +52,20 @@ struct nvbench::type_strings<::cuda::std::integral_constant<T, I>>
namespace detail
{

-template <class T, class List>
+template <class List, class... Ts>
struct push_back
{};

-template <class T, class... As>
-struct push_back<T, nvbench::type_list<As...>>
+template <class... As, class... Ts>
+struct push_back<nvbench::type_list<As...>, Ts...>
{
-using type = nvbench::type_list<As..., T>;
+using type = nvbench::type_list<As..., Ts...>;
};

} // namespace detail

-template <class T, class List>
-using push_back_t = typename detail::push_back<T, List>::type;
+template <class List, class... Ts>
+using push_back_t = typename detail::push_back<List, Ts...>::type;

#ifdef TUNE_OffsetT
using offset_types = nvbench::type_list<TUNE_OffsetT>;
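
The refactored push_back now takes the list first and accepts a parameter pack, so several types can be appended in one instantiation. A quick usage sketch (type names chosen for illustration):

#include <cstdint>

#include <cuda/std/type_traits>

#include <nvbench_helper.cuh> // provides nvbench::type_list and push_back_t

using base  = nvbench::type_list<int8_t, int16_t>;
using wider = push_back_t<base, int32_t, int64_t>; // append two types at once
static_assert(::cuda::std::is_same_v<wider, nvbench::type_list<int8_t, int16_t, int32_t, int64_t>>, "");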