-
Notifications
You must be signed in to change notification settings - Fork 928
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update strings contains benchmarks to nvbench #15495
Changes from 25 commits
cf7af81
bd437e5
485ab21
69ae69c
ebcc0a4
72a4cc5
283bc7b
09b4d22
3e07623
6772f18
aa3aa5a
68cef20
2a7db73
ab7af12
c6620e4
00275f5
532a0cb
0608a62
3a912aa
5c9b87b
656a0c8
429e9ad
b70edcf
24f3a4a
755b7ee
b416cdb
f89151a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -361,14 +361,22 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, | |
if (d_strings.is_null(str_idx)) { return; } | ||
// get the string for this warp | ||
auto const d_str = d_strings.element<string_view>(str_idx); | ||
// each thread of the warp will check just part of the string | ||
auto found = false; | ||
for (auto i = static_cast<size_type>(idx % cudf::detail::warp_size); | ||
// each warp processes 4 starting bytes | ||
auto constexpr bytes_per_warp = 4; | ||
auto found = false; | ||
for (auto i = lane_idx * bytes_per_warp; | ||
!found && ((i + d_target.size_bytes()) <= d_str.size_bytes()); | ||
i += cudf::detail::warp_size) { | ||
i += cudf::detail::warp_size * bytes_per_warp) { | ||
// check the target matches this part of the d_str data | ||
if (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0) { found = true; } | ||
// this is definitely faster for very long strings > 128B | ||
for (auto j = 0; j < bytes_per_warp; j++) { | ||
if (((i + j + d_target.size_bytes()) <= d_str.size_bytes()) && | ||
d_target.compare(d_str.data() + i + j, d_target.size_bytes()) == 0) { | ||
found = true; | ||
} | ||
Comment on lines +373 to +376
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Will any early termination be helpful here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I benchmarked that and it did not help -- technically a bit slower. |
||
} | ||
} | ||
|
||
auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max()); | ||
if (lane_idx == 0) { d_results[str_idx] = result; } | ||
} | ||
|
@@ -391,12 +399,10 @@ std::unique_ptr<column> contains_warp_parallel(strings_column_view const& input, | |
|
||
// fill the output with `false` unless the `d_target` is empty | ||
auto results_view = results->mutable_view(); | ||
thrust::fill(rmm::exec_policy(stream), | ||
results_view.begin<bool>(), | ||
results_view.end<bool>(), | ||
d_target.empty()); | ||
|
||
if (!d_target.empty()) { | ||
if (d_target.empty()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 👍 This didn't occur to me, and I've been staring at this code for a bit now. |
||
thrust::fill( | ||
rmm::exec_policy_nosync(stream), results_view.begin<bool>(), results_view.end<bool>(), true); | ||
} else { | ||
// launch warp per string | ||
auto const d_strings = column_device_view::create(input.parent(), stream); | ||
constexpr int block_size = 256; | ||
|
@@ -461,9 +467,8 @@ std::unique_ptr<column> contains_fn(strings_column_view const& strings, | |
thrust::make_counting_iterator<size_type>(strings_count), | ||
d_results, | ||
[d_strings, pfn, d_target] __device__(size_type idx) { | ||
if (!d_strings.is_null(idx)) | ||
return bool{pfn(d_strings.element<string_view>(idx), d_target)}; | ||
return false; | ||
return !d_strings.is_null(idx) && | ||
bool{pfn(d_strings.element<string_view>(idx), d_target)}; | ||
}); | ||
results->set_null_count(strings.null_count()); | ||
return results; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will multiple if/else branches inside one `state.exec` lambda affect the benchmark run (as this will be executed in thousands of iterations)? How about placing a separate `state.exec` in each if/else branch?