From cf7af81e0cb0cedeb4b6ac0a1f6102f1f9b345dc Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 9 Apr 2024 19:59:06 -0400 Subject: [PATCH 1/4] Update strings contains benchmarks to nvbench --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/string/find.cpp | 104 +++++++++++++-------------- cpp/src/strings/search/find.cu | 128 ++++++++++++++++++++++++++++++--- 3 files changed, 168 insertions(+), 66 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 798e4e76141..bfb6f601aca 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -299,7 +299,6 @@ ConfigureBench( string/copy.cu string/factory.cu string/filter.cpp - string/find.cpp string/repeat_strings.cpp string/replace.cpp string/slice.cpp @@ -316,6 +315,7 @@ ConfigureNVBench( string/copy_range.cpp string/count.cpp string/extract.cpp + string/find.cpp string/gather.cpp string/join_strings.cpp string/lengths.cpp diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index e866092f3a3..188c3a7d8e9 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -16,78 +16,70 @@ #include #include -#include #include +#include #include +#include #include #include #include #include -#include +#include -enum FindAPI { find, find_multi, contains, starts_with, ends_with }; +std::unique_ptr build_input_column(cudf::size_type n_rows, + cudf::size_type row_width, + int32_t hit_rate); -class StringFindScalar : public cudf::benchmark {}; - -static void BM_find_scalar(benchmark::State& state, FindAPI find_api) +static void bench_find_string(nvbench::state& state) { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); - cudf::strings_column_view input(column->view()); - cudf::string_scalar target("+"); - cudf::test::strings_column_wrapper targets({"+", "-"}); + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + auto const api = state.get_string("api"); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (find_api) { - case find: cudf::strings::find(input, target); break; - case find_multi: - cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); - break; - case contains: cudf::strings::contains(input, target); break; - case starts_with: cudf::strings::starts_with(input, target); break; - case ends_with: cudf::strings::ends_with(input, target); break; - } + if (static_cast(n_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); } - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} + auto stream = cudf::get_default_stream(); + auto col = build_input_column(n_rows, row_width, hit_rate); + auto input = cudf::strings_column_view(col->view()); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 2; - for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } + std::vector h_targets({"5W", "5W43", "0987 5W43"}); + cudf::string_scalar target(h_targets[2]); + cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = input.chars_size(stream); + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + if (api.substr(0, 4) == "find") + state.add_global_memory_writes(input.size()); + else + state.add_global_memory_writes(input.size()); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if (api == "find") { + cudf::strings::find(input, target); + } else if (api == "find_multi") { + cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); + } else if (api == "contains") { + cudf::strings::contains(input, target); + } else if (api == "starts_with") { + cudf::strings::starts_with(input, target); + } else if (api == "ends_with") { + cudf::strings::ends_with(input, target); } - } + }); } -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringFindScalar, name) \ - (::benchmark::State & st) { BM_find_scalar(st, name); } \ - BENCHMARK_REGISTER_F(StringFindScalar, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(find) -STRINGS_BENCHMARK_DEFINE(find_multi) -STRINGS_BENCHMARK_DEFINE(contains) -STRINGS_BENCHMARK_DEFINE(starts_with) -STRINGS_BENCHMARK_DEFINE(ends_with) +NVBENCH_BENCH(bench_find_string) + .set_name("find_string") + .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"}) + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) + .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 598d48157d9..a0588aae9d5 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -335,6 +335,110 @@ std::unique_ptr find(strings_column_view const& input, namespace detail { namespace { +#if 0 +// neater than compare_vector1() but not faster +struct vloader_unaligned { + uint32_t const* ptr; + int const offset{}; + uint32_t value{}; + + __device__ inline vloader_unaligned(unsigned char const* in) + : offset(static_cast(reinterpret_cast(in) & 3) * 8) + { + ptr = reinterpret_cast(in - offset / 8); + value = *ptr++; + } + + //__device__ inline uint32_t next() + //{ + // uint32_t const block = *ptr++; + // uint32_t const rtn = __funnelshift_r(value, block, offset); + // value = block; + // return rtn; + //} + + // way cool, but not faster + __device__ inline uint64_t next2() + { + uint32_t const block1 = *ptr++; + uint32_t const block2 = *ptr++; + uint32_t const rtn1 = __funnelshift_r(value, block1, offset); + uint32_t const rtn2 = __funnelshift_r(block1, block2, offset); + value = block2; + return static_cast(rtn1) << 32 | static_cast(rtn2); + } +}; + +__device__ inline int compare_vload(const char* data1, int len1, const char* data2, int len2) +{ + unsigned char const* ptr1 = reinterpret_cast(data1); + unsigned char const* ptr2 = reinterpret_cast(data2); + + int const len = min(len1, len2); + int idx = 0; + + if (len >= 8) { + vloader_unaligned loader1{ptr1}; + vloader_unaligned loader2{ptr2}; + do { + auto const a = loader1.next2(); + auto const b = loader2.next2(); + // if (a != b) { return __byte_perm(a, 0, 0x0123) < __byte_perm(b, 0, 0x0123) ? -1 : 1; } + if (a != b) { return 1; } + idx += sizeof(a); + } while (idx + 8 <= len); // 4 + } + + while (idx < len) { + auto const a = ptr1[idx]; + auto const b = ptr2[idx]; + if (a != b) { return static_cast(a) - static_cast(b); } + ++idx; + } + if (len1 < len2) return -1; + if (len2 < len1) return 1; + return 0; +} + + +// vector loading is not showing up faster even for long strings +__device__ int compare_vector1(const char* data1, int len1, const char* data2, int len2) +{ + unsigned char const* ptr1 = reinterpret_cast(data1); + unsigned char const* ptr2 = reinterpret_cast(data2); + + int const len = min(len1, len2); + int idx = 0; + + if (len >= 8) { + uint32_t const align_a = (3 & reinterpret_cast(ptr1)); + uint32_t const align_b = (3 & reinterpret_cast(ptr2)); + auto s32_a = reinterpret_cast(ptr1 - align_a) + 1; + auto s32_b = reinterpret_cast(ptr2 - align_b) + 1; + uint32_t const offset_a = align_a * 8; + uint32_t const offset_b = align_b * 8; + do { + uint32_t const a = __funnelshift_r(*(s32_a - 1), *s32_a, offset_a); + uint32_t const b = __funnelshift_r(*(s32_b - 1), *s32_b, offset_b); + if (a != b) { return __byte_perm(a, 0, 0x0123) < __byte_perm(b, 0, 0x0123) ? -1 : 1; } + idx += 4; + ++s32_a; // value_a = *s32_a++; // block_a; + ++s32_b; // value_b = *s32_b++; // block_b; + } while (idx + 4 <= len); + } + while (idx < len) { + auto const a = ptr1[idx]; + auto const b = ptr2[idx]; + if (a != b) { return static_cast(a) - static_cast(b); } + ++idx; + } + + if (len1 < len2) return -1; + if (len2 < len1) return 1; + return 0; +} +#endif + /** * @brief Check if `d_target` appears in a row in `d_strings`. * @@ -357,19 +461,27 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, auto const str_idx = idx / cudf::detail::warp_size; auto const lane_idx = idx % cudf::detail::warp_size; + if (lane_idx) { d_results[str_idx] = false; } // not faster if (d_strings.is_null(str_idx)) { return; } // get the string for this warp auto const d_str = d_strings.element(str_idx); // each thread of the warp will check just part of the string auto found = false; - for (auto i = static_cast(idx % cudf::detail::warp_size); - !found && ((i + d_target.size_bytes()) <= d_str.size_bytes()); + for (auto i = lane_idx; !found && ((i + d_target.size_bytes()) <= d_str.size_bytes()); i += cudf::detail::warp_size) { // check the target matches this part of the d_str data - if (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0) { found = true; } + found = (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0); + // not faster + // found = compare_vload( + // d_target.data(), d_target.size_bytes(), d_str.data() + i, d_target.size_bytes()) == + // 0; + // not faster + // auto result = warp_reduce(temp_storage).Reduce(found, cub::Max()); + // found = result; } auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max()); if (lane_idx == 0) { d_results[str_idx] = result; } + // if (lane_idx == 0) { d_results[str_idx] = found; } } std::unique_ptr contains_warp_parallel(strings_column_view const& input, @@ -390,12 +502,10 @@ std::unique_ptr contains_warp_parallel(strings_column_view const& input, // fill the output with `false` unless the `d_target` is empty auto results_view = results->mutable_view(); - thrust::fill(rmm::exec_policy(stream), - results_view.begin(), - results_view.end(), - d_target.empty()); - - if (!d_target.empty()) { + if (d_target.empty()) { + thrust::fill( + rmm::exec_policy_nosync(stream), results_view.begin(), results_view.end(), true); + } else { // launch warp per string auto const d_strings = column_device_view::create(input.parent(), stream); constexpr int block_size = 256; From 532a0cb88a6d8ca2a46127012852b2984d809365 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 13 May 2024 12:41:28 -0400 Subject: [PATCH 2/4] remove commented out code --- cpp/src/strings/search/find.cu | 134 ++++----------------------------- 1 file changed, 14 insertions(+), 120 deletions(-) diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 036786ab39d..af9049b8228 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -336,110 +336,6 @@ std::unique_ptr find(strings_column_view const& input, namespace detail { namespace { -#if 0 -// neater than compare_vector1() but not faster -struct vloader_unaligned { - uint32_t const* ptr; - int const offset{}; - uint32_t value{}; - - __device__ inline vloader_unaligned(unsigned char const* in) - : offset(static_cast(reinterpret_cast(in) & 3) * 8) - { - ptr = reinterpret_cast(in - offset / 8); - value = *ptr++; - } - - //__device__ inline uint32_t next() - //{ - // uint32_t const block = *ptr++; - // uint32_t const rtn = __funnelshift_r(value, block, offset); - // value = block; - // return rtn; - //} - - // way cool, but not faster - __device__ inline uint64_t next2() - { - uint32_t const block1 = *ptr++; - uint32_t const block2 = *ptr++; - uint32_t const rtn1 = __funnelshift_r(value, block1, offset); - uint32_t const rtn2 = __funnelshift_r(block1, block2, offset); - value = block2; - return static_cast(rtn1) << 32 | static_cast(rtn2); - } -}; - -__device__ inline int compare_vload(const char* data1, int len1, const char* data2, int len2) -{ - unsigned char const* ptr1 = reinterpret_cast(data1); - unsigned char const* ptr2 = reinterpret_cast(data2); - - int const len = min(len1, len2); - int idx = 0; - - if (len >= 8) { - vloader_unaligned loader1{ptr1}; - vloader_unaligned loader2{ptr2}; - do { - auto const a = loader1.next2(); - auto const b = loader2.next2(); - // if (a != b) { return __byte_perm(a, 0, 0x0123) < __byte_perm(b, 0, 0x0123) ? -1 : 1; } - if (a != b) { return 1; } - idx += sizeof(a); - } while (idx + 8 <= len); // 4 - } - - while (idx < len) { - auto const a = ptr1[idx]; - auto const b = ptr2[idx]; - if (a != b) { return static_cast(a) - static_cast(b); } - ++idx; - } - if (len1 < len2) return -1; - if (len2 < len1) return 1; - return 0; -} - - -// vector loading is not showing up faster even for long strings -__device__ int compare_vector1(const char* data1, int len1, const char* data2, int len2) -{ - unsigned char const* ptr1 = reinterpret_cast(data1); - unsigned char const* ptr2 = reinterpret_cast(data2); - - int const len = min(len1, len2); - int idx = 0; - - if (len >= 8) { - uint32_t const align_a = (3 & reinterpret_cast(ptr1)); - uint32_t const align_b = (3 & reinterpret_cast(ptr2)); - auto s32_a = reinterpret_cast(ptr1 - align_a) + 1; - auto s32_b = reinterpret_cast(ptr2 - align_b) + 1; - uint32_t const offset_a = align_a * 8; - uint32_t const offset_b = align_b * 8; - do { - uint32_t const a = __funnelshift_r(*(s32_a - 1), *s32_a, offset_a); - uint32_t const b = __funnelshift_r(*(s32_b - 1), *s32_b, offset_b); - if (a != b) { return __byte_perm(a, 0, 0x0123) < __byte_perm(b, 0, 0x0123) ? -1 : 1; } - idx += 4; - ++s32_a; // value_a = *s32_a++; // block_a; - ++s32_b; // value_b = *s32_b++; // block_b; - } while (idx + 4 <= len); - } - while (idx < len) { - auto const a = ptr1[idx]; - auto const b = ptr2[idx]; - if (a != b) { return static_cast(a) - static_cast(b); } - ++idx; - } - - if (len1 < len2) return -1; - if (len2 < len1) return 1; - return 0; -} -#endif - /** * @brief Check if `d_target` appears in a row in `d_strings`. * @@ -462,27 +358,26 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, auto const str_idx = idx / cudf::detail::warp_size; auto const lane_idx = idx % cudf::detail::warp_size; - if (lane_idx) { d_results[str_idx] = false; } // not faster + if (lane_idx) { d_results[str_idx] = false; } if (d_strings.is_null(str_idx)) { return; } // get the string for this warp auto const d_str = d_strings.element(str_idx); - // each thread of the warp will check just part of the string + // each warp processes 4 starting bytes auto found = false; - for (auto i = lane_idx; !found && ((i + d_target.size_bytes()) <= d_str.size_bytes()); - i += cudf::detail::warp_size) { + for (auto i = lane_idx * 4; !found && ((i + d_target.size_bytes()) <= d_str.size_bytes()); + i += cudf::detail::warp_size * 4) { // check the target matches this part of the d_str data - found = (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0); - // not faster - // found = compare_vload( - // d_target.data(), d_target.size_bytes(), d_str.data() + i, d_target.size_bytes()) == - // 0; - // not faster - // auto result = warp_reduce(temp_storage).Reduce(found, cub::Max()); - // found = result; + // this is definitely faster for very long strings > 128B + for (auto j = 0; j < 4; j++) { + if (((i + j + d_target.size_bytes()) <= d_str.size_bytes()) && + d_target.compare(d_str.data() + i + j, d_target.size_bytes()) == 0) { + found = true; + } + } } + auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max()); if (lane_idx == 0) { d_results[str_idx] = result; } - // if (lane_idx == 0) { d_results[str_idx] = found; } } std::unique_ptr contains_warp_parallel(strings_column_view const& input, @@ -571,9 +466,8 @@ std::unique_ptr contains_fn(strings_column_view const& strings, thrust::make_counting_iterator(strings_count), d_results, [d_strings, pfn, d_target] __device__(size_type idx) { - if (!d_strings.is_null(idx)) - return bool{pfn(d_strings.element(idx), d_target)}; - return false; + return !d_strings.is_null(idx) && + bool{pfn(d_strings.element(idx), d_target)}; }); results->set_null_count(strings.null_count()); return results; From 656a0c82a483b1fa3d53bc3b54e64606b40c93f7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 14 May 2024 11:10:16 -0400 Subject: [PATCH 3/4] remove unneeded output init --- cpp/benchmarks/string/find.cpp | 13 +++++++------ cpp/src/strings/search/find.cu | 11 ++++++----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 188c3a7d8e9..c037290b550 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -45,22 +45,23 @@ static void bench_find_string(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - auto stream = cudf::get_default_stream(); - auto col = build_input_column(n_rows, row_width, hit_rate); - auto input = cudf::strings_column_view(col->view()); + auto const stream = cudf::get_default_stream(); + auto const col = build_input_column(n_rows, row_width, hit_rate); + auto const input = cudf::strings_column_view(col->view()); std::vector h_targets({"5W", "5W43", "0987 5W43"}); cudf::string_scalar target(h_targets[2]); cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - auto chars_size = input.chars_size(stream); + auto const chars_size = input.chars_size(stream); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); - if (api.substr(0, 4) == "find") + if (api.substr(0, 4) == "find") { state.add_global_memory_writes(input.size()); - else + } else { state.add_global_memory_writes(input.size()); + } state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { if (api == "find") { diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index af9049b8228..45eba39f413 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -358,17 +358,18 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, auto const str_idx = idx / cudf::detail::warp_size; auto const lane_idx = idx % cudf::detail::warp_size; - if (lane_idx) { d_results[str_idx] = false; } if (d_strings.is_null(str_idx)) { return; } // get the string for this warp auto const d_str = d_strings.element(str_idx); // each warp processes 4 starting bytes - auto found = false; - for (auto i = lane_idx * 4; !found && ((i + d_target.size_bytes()) <= d_str.size_bytes()); - i += cudf::detail::warp_size * 4) { + auto constexpr bytes_per_warp = 4; + auto found = false; + for (auto i = lane_idx * bytes_per_warp; + !found && ((i + d_target.size_bytes()) <= d_str.size_bytes()); + i += cudf::detail::warp_size * bytes_per_warp) { // check the target matches this part of the d_str data // this is definitely faster for very long strings > 128B - for (auto j = 0; j < 4; j++) { + for (auto j = 0; j < bytes_per_warp; j++) { if (((i + j + d_target.size_bytes()) <= d_str.size_bytes()) && d_target.compare(d_str.data() + i + j, d_target.size_bytes()) == 0) { found = true; From f89151aa529ec811ba5cfd617270fb2c991649d3 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 May 2024 10:38:17 -0400 Subject: [PATCH 4/4] place state.exec inside each if-stmt --- cpp/benchmarks/string/find.cpp | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index c037290b550..a9c620e4bf0 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -63,19 +63,23 @@ static void bench_find_string(nvbench::state& state) state.add_global_memory_writes(input.size()); } - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - if (api == "find") { - cudf::strings::find(input, target); - } else if (api == "find_multi") { + if (api == "find") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); + } else if (api == "find_multi") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); - } else if (api == "contains") { - cudf::strings::contains(input, target); - } else if (api == "starts_with") { - cudf::strings::starts_with(input, target); - } else if (api == "ends_with") { - cudf::strings::ends_with(input, target); - } - }); + }); + } else if (api == "contains") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + } else if (api == "starts_with") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); + } else if (api == "ends_with") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); }); + } } NVBENCH_BENCH(bench_find_string)