Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update strings contains benchmarks to nvbench #15495

Merged
merged 27 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
cf7af81
Update strings contains benchmarks to nvbench
davidwendt Apr 9, 2024
bd437e5
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 9, 2024
485ab21
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 10, 2024
69ae69c
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 10, 2024
ebcc0a4
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 12, 2024
72a4cc5
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 16, 2024
283bc7b
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 17, 2024
09b4d22
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 22, 2024
3e07623
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 23, 2024
6772f18
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 24, 2024
aa3aa5a
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 30, 2024
68cef20
Merge branch 'branch-24.06' into perf-contains
davidwendt Apr 30, 2024
2a7db73
Merge branch 'branch-24.06' into perf-contains
davidwendt May 2, 2024
ab7af12
Merge branch 'branch-24.06' into perf-contains
davidwendt May 6, 2024
c6620e4
Merge branch 'branch-24.06' into perf-contains
davidwendt May 8, 2024
00275f5
Merge branch 'branch-24.06' into perf-contains
davidwendt May 13, 2024
532a0cb
remove commented out code
davidwendt May 13, 2024
0608a62
Merge branch 'perf-contains' of github.com:davidwendt/cudf into perf-…
davidwendt May 13, 2024
3a912aa
Merge branch 'branch-24.06' into perf-contains
davidwendt May 13, 2024
5c9b87b
Merge branch 'branch-24.06' into perf-contains
davidwendt May 14, 2024
656a0c8
remove unneeded output init
davidwendt May 14, 2024
429e9ad
Merge branch 'branch-24.06' into perf-contains
davidwendt May 14, 2024
b70edcf
Merge branch 'branch-24.06' into perf-contains
davidwendt May 15, 2024
24f3a4a
Merge branch 'branch-24.06' into perf-contains
davidwendt May 15, 2024
755b7ee
Merge branch 'branch-24.06' into perf-contains
davidwendt May 16, 2024
b416cdb
Merge branch 'branch-24.06' into perf-contains
davidwendt May 16, 2024
f89151a
place state.exec inside each if-stmt
davidwendt May 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,6 @@ ConfigureBench(
string/copy.cu
string/factory.cu
string/filter.cpp
string/find.cpp
string/repeat_strings.cpp
string/replace.cpp
string/slice.cpp
Expand All @@ -318,6 +317,7 @@ ConfigureNVBench(
string/copy_range.cpp
string/count.cpp
string/extract.cpp
string/find.cpp
string/gather.cpp
string/join_strings.cpp
string/lengths.cpp
Expand Down
105 changes: 49 additions & 56 deletions cpp/benchmarks/string/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,78 +16,71 @@

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/filling.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/find.hpp>
#include <cudf/strings/find_multiple.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <limits>
#include <nvbench/nvbench.cuh>

enum FindAPI { find, find_multi, contains, starts_with, ends_with };
std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
cudf::size_type row_width,
int32_t hit_rate);

class StringFindScalar : public cudf::benchmark {};

static void BM_find_scalar(benchmark::State& state, FindAPI find_api)
static void bench_find_string(nvbench::state& state)
{
cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
cudf::strings_column_view input(column->view());
cudf::string_scalar target("+");
cudf::test::strings_column_wrapper targets({"+", "-"});
auto const n_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const hit_rate = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
auto const api = state.get_string("api");

for (auto _ : state) {
cuda_event_timer raii(state, true, cudf::get_default_stream());
switch (find_api) {
case find: cudf::strings::find(input, target); break;
case find_multi:
cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
break;
case contains: cudf::strings::contains(input, target); break;
case starts_with: cudf::strings::starts_with(input, target); break;
case ends_with: cudf::strings::ends_with(input, target); break;
}
if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
}
auto const stream = cudf::get_default_stream();
auto const col = build_input_column(n_rows, row_width, hit_rate);
auto const input = cudf::strings_column_view(col->view());

static void generate_bench_args(benchmark::internal::Benchmark* b)
{
int const min_rows = 1 << 12;
int const max_rows = 1 << 24;
int const row_mult = 8;
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
int const len_mult = 2;
for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
// avoid generating combinations that exceed the cudf column limit
size_t total_chars = static_cast<size_t>(row_count) * rowlen;
if (total_chars < static_cast<size_t>(std::numeric_limits<cudf::size_type>::max())) {
b->Args({row_count, rowlen});
}
}
std::vector<std::string> h_targets({"5W", "5W43", "0987 5W43"});
cudf::string_scalar target(h_targets[2]);
cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
auto const chars_size = input.chars_size(stream);
state.add_element_count(chars_size, "chars_size");
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
if (api.substr(0, 4) == "find") {
state.add_global_memory_writes<nvbench::int32_t>(input.size());
} else {
state.add_global_memory_writes<nvbench::int8_t>(input.size());
}
}

#define STRINGS_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringFindScalar, name) \
(::benchmark::State & st) { BM_find_scalar(st, name); } \
BENCHMARK_REGISTER_F(StringFindScalar, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
if (api == "find") {
cudf::strings::find(input, target);
} else if (api == "find_multi") {
cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
} else if (api == "contains") {
cudf::strings::contains(input, target);
} else if (api == "starts_with") {
cudf::strings::starts_with(input, target);
} else if (api == "ends_with") {
cudf::strings::ends_with(input, target);
}
Copy link
Contributor

@ttnghia ttnghia May 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will having multiple if/else branches inside a single state.exec lambda affect the benchmark measurements (since the lambda will be executed over thousands of iterations)?
How about placing a separate state.exec call in each if/else branch?

});
}

STRINGS_BENCHMARK_DEFINE(find)
STRINGS_BENCHMARK_DEFINE(find_multi)
STRINGS_BENCHMARK_DEFINE(contains)
STRINGS_BENCHMARK_DEFINE(starts_with)
STRINGS_BENCHMARK_DEFINE(ends_with)
NVBENCH_BENCH(bench_find_string)
.set_name("find_string")
.add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"})
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
.add_int64_axis("hit_rate", {20, 80}); // percentage
33 changes: 19 additions & 14 deletions cpp/src/strings/search/find.cu
Original file line number Diff line number Diff line change
Expand Up @@ -361,14 +361,22 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
if (d_strings.is_null(str_idx)) { return; }
// get the string for this warp
auto const d_str = d_strings.element<string_view>(str_idx);
// each thread of the warp will check just part of the string
auto found = false;
for (auto i = static_cast<size_type>(idx % cudf::detail::warp_size);
// each thread of the warp checks 4 consecutive starting byte positions per iteration
auto constexpr bytes_per_warp = 4;
auto found = false;
for (auto i = lane_idx * bytes_per_warp;
!found && ((i + d_target.size_bytes()) <= d_str.size_bytes());
i += cudf::detail::warp_size) {
i += cudf::detail::warp_size * bytes_per_warp) {
// check the target matches this part of the d_str data
if (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0) { found = true; }
// this is definitely faster for very long strings > 128B
for (auto j = 0; j < bytes_per_warp; j++) {
if (((i + j + d_target.size_bytes()) <= d_str.size_bytes()) &&
d_target.compare(d_str.data() + i + j, d_target.size_bytes()) == 0) {
found = true;
}
Comment on lines +373 to +376
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will any early termination be helpful here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I benchmarked that and it did not help -- it was technically a bit slower.

}
}

auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max());
if (lane_idx == 0) { d_results[str_idx] = result; }
}
Expand All @@ -391,12 +399,10 @@ std::unique_ptr<column> contains_warp_parallel(strings_column_view const& input,

// fill the output with `false` unless the `d_target` is empty
auto results_view = results->mutable_view();
thrust::fill(rmm::exec_policy(stream),
results_view.begin<bool>(),
results_view.end<bool>(),
d_target.empty());

if (!d_target.empty()) {
if (d_target.empty()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 This didn't occur to me, and I've been staring at this code for a bit now.

thrust::fill(
rmm::exec_policy_nosync(stream), results_view.begin<bool>(), results_view.end<bool>(), true);
} else {
// launch warp per string
auto const d_strings = column_device_view::create(input.parent(), stream);
constexpr int block_size = 256;
Expand Down Expand Up @@ -461,9 +467,8 @@ std::unique_ptr<column> contains_fn(strings_column_view const& strings,
thrust::make_counting_iterator<size_type>(strings_count),
d_results,
[d_strings, pfn, d_target] __device__(size_type idx) {
if (!d_strings.is_null(idx))
return bool{pfn(d_strings.element<string_view>(idx), d_target)};
return false;
return !d_strings.is_null(idx) &&
bool{pfn(d_strings.element<string_view>(idx), d_target)};
});
results->set_null_count(strings.null_count());
return results;
Expand Down
Loading