From cf7af81e0cb0cedeb4b6ac0a1f6102f1f9b345dc Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 9 Apr 2024 19:59:06 -0400
Subject: [PATCH 1/4] Update strings contains benchmarks to nvbench

---
 cpp/benchmarks/CMakeLists.txt  |   2 +-
 cpp/benchmarks/string/find.cpp | 104 +++++++++++++--------------
 cpp/src/strings/search/find.cu | 128 ++++++++++++++++++++++++++++++---
 3 files changed, 168 insertions(+), 66 deletions(-)
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 798e4e76141..bfb6f601aca 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -299,7 +299,6 @@ ConfigureBench(
   string/copy.cu
   string/factory.cu
   string/filter.cpp
-  string/find.cpp
   string/repeat_strings.cpp
   string/replace.cpp
   string/slice.cpp
@@ -316,6 +315,7 @@ ConfigureNVBench(
   string/copy_range.cpp
   string/count.cpp
   string/extract.cpp
+  string/find.cpp
   string/gather.cpp
   string/join_strings.cpp
   string/lengths.cpp
diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp
index e866092f3a3..188c3a7d8e9 100644
--- a/cpp/benchmarks/string/find.cpp
+++ b/cpp/benchmarks/string/find.cpp
@@ -16,78 +16,70 @@
 
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf_test/column_wrapper.hpp>
 
+#include <cudf/filling.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/combine.hpp>
 #include <cudf/strings/find.hpp>
 #include <cudf/strings/find_multiple.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-#include <limits>
+#include <nvbench/nvbench.cuh>
 
-enum FindAPI { find, find_multi, contains, starts_with, ends_with };
+std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
+                                                 cudf::size_type row_width,
+                                                 int32_t hit_rate);
 
-class StringFindScalar : public cudf::benchmark {};
-
-static void BM_find_scalar(benchmark::State& state, FindAPI find_api)
+static void bench_find_string(nvbench::state& state)
 {
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
-  data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
-  cudf::strings_column_view input(column->view());
-  cudf::string_scalar target("+");
-  cudf::test::strings_column_wrapper targets({"+", "-"});
+  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const hit_rate  = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+  auto const api       = state.get_string("api");
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    switch (find_api) {
-      case find: cudf::strings::find(input, target); break;
-      case find_multi:
-        cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
-        break;
-      case contains: cudf::strings::contains(input, target); break;
-      case starts_with: cudf::strings::starts_with(input, target); break;
-      case ends_with: cudf::strings::ends_with(input, target); break;
-    }
+  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
-}
+  auto stream = cudf::get_default_stream();
+  auto col    = build_input_column(n_rows, row_width, hit_rate);
+  auto input  = cudf::strings_column_view(col->view());
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 5;
-  int const max_rowlen = 1 << 13;
-  int const len_mult   = 2;
-  for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
-    for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
-      // avoid generating combinations that exceed the cudf column limit
-      size_t total_chars = static_cast<size_t>(row_count) * rowlen;
-      if (total_chars < static_cast<size_t>(std::numeric_limits<cudf::size_type>::max())) {
-        b->Args({row_count, rowlen});
-      }
+  std::vector<std::string> h_targets({"5W", "5W43", "0987 5W43"});
+  cudf::string_scalar target(h_targets[2]);
+  cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size = input.chars_size(stream);
+  state.add_element_count(chars_size, "chars_size");
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  if (api.substr(0, 4) == "find")
+    state.add_global_memory_writes<nvbench::int32_t>(input.size());
+  else
+    state.add_global_memory_writes<nvbench::int8_t>(input.size());
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    if (api == "find") {
+      cudf::strings::find(input, target);
+    } else if (api == "find_multi") {
+      cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
+    } else if (api == "contains") {
+      cudf::strings::contains(input, target);
+    } else if (api == "starts_with") {
+      cudf::strings::starts_with(input, target);
+    } else if (api == "ends_with") {
+      cudf::strings::ends_with(input, target);
     }
-  }
+  });
 }
 
-#define STRINGS_BENCHMARK_DEFINE(name)                    \
-  BENCHMARK_DEFINE_F(StringFindScalar, name)              \
-  (::benchmark::State & st) { BM_find_scalar(st, name); } \
-  BENCHMARK_REGISTER_F(StringFindScalar, name)            \
-    ->Apply(generate_bench_args)                          \
-    ->UseManualTime()                                     \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(find)
-STRINGS_BENCHMARK_DEFINE(find_multi)
-STRINGS_BENCHMARK_DEFINE(contains)
-STRINGS_BENCHMARK_DEFINE(starts_with)
-STRINGS_BENCHMARK_DEFINE(ends_with)
+NVBENCH_BENCH(bench_find_string)
+  .set_name("find_string")
+  .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"})
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
+  .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
+  .add_int64_axis("hit_rate", {20, 80});  // percentage
diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
index 598d48157d9..a0588aae9d5 100644
--- a/cpp/src/strings/search/find.cu
+++ b/cpp/src/strings/search/find.cu
@@ -335,6 +335,110 @@ std::unique_ptr<column> find(strings_column_view const& input,
 namespace detail {
 namespace {
 
+#if 0
+// neater than compare_vector1() but not faster
+struct vloader_unaligned {
+  uint32_t const* ptr;
+  int const offset{};
+  uint32_t value{};
+
+  __device__ inline vloader_unaligned(unsigned char const* in)
+    : offset(static_cast<int>(reinterpret_cast<uintptr_t>(in) & 3) * 8)
+  {
+    ptr   = reinterpret_cast<uint32_t const*>(in - offset / 8);
+    value = *ptr++;
+  }
+
+  //__device__ inline uint32_t next()
+  //{
+  //  uint32_t const block = *ptr++;
+  //  uint32_t const rtn   = __funnelshift_r(value, block, offset);
+  //  value                = block;
+  //  return rtn;
+  //}
+
+  // way cool, but not faster
+  __device__ inline uint64_t next2()
+  {
+    uint32_t const block1 = *ptr++;
+    uint32_t const block2 = *ptr++;
+    uint32_t const rtn1   = __funnelshift_r(value, block1, offset);
+    uint32_t const rtn2   = __funnelshift_r(block1, block2, offset);
+    value                 = block2;
+    return static_cast<uint64_t>(rtn1) << 32 | static_cast<uint64_t>(rtn2);
+  }
+};
+
+__device__ inline int compare_vload(const char* data1, int len1, const char* data2, int len2)
+{
+  unsigned char const* ptr1 = reinterpret_cast<const unsigned char*>(data1);
+  unsigned char const* ptr2 = reinterpret_cast<const unsigned char*>(data2);
+
+  int const len = min(len1, len2);
+  int idx       = 0;
+
+  if (len >= 8) {
+    vloader_unaligned loader1{ptr1};
+    vloader_unaligned loader2{ptr2};
+    do {
+      auto const a = loader1.next2();
+      auto const b = loader2.next2();
+      // if (a != b) { return __byte_perm(a, 0, 0x0123) < __byte_perm(b, 0, 0x0123) ? -1 : 1; }
+      if (a != b) { return 1; }
+      idx += sizeof(a);
+    } while (idx + 8 <= len);  // 4
+  }
+
+  while (idx < len) {
+    auto const a = ptr1[idx];
+    auto const b = ptr2[idx];
+    if (a != b) { return static_cast<int>(a) - static_cast<int>(b); }
+    ++idx;
+  }
+  if (len1 < len2) return -1;
+  if (len2 < len1) return 1;
+  return 0;
+}
+
+
+// vector loading is not showing up faster even for long strings
+__device__ int compare_vector1(const char* data1, int len1, const char* data2, int len2)
+{
+  unsigned char const* ptr1 = reinterpret_cast<const unsigned char*>(data1);
+  unsigned char const* ptr2 = reinterpret_cast<const unsigned char*>(data2);
+
+  int const len = min(len1, len2);
+  int idx       = 0;
+
+  if (len >= 8) {
+    uint32_t const align_a  = (3 & reinterpret_cast<uintptr_t>(ptr1));
+    uint32_t const align_b  = (3 & reinterpret_cast<uintptr_t>(ptr2));
+    auto s32_a              = reinterpret_cast<uint32_t const*>(ptr1 - align_a) + 1;
+    auto s32_b              = reinterpret_cast<uint32_t const*>(ptr2 - align_b) + 1;
+    uint32_t const offset_a = align_a * 8;
+    uint32_t const offset_b = align_b * 8;
+    do {
+      uint32_t const a = __funnelshift_r(*(s32_a - 1), *s32_a, offset_a);
+      uint32_t const b = __funnelshift_r(*(s32_b - 1), *s32_b, offset_b);
+      if (a != b) { return __byte_perm(a, 0, 0x0123) < __byte_perm(b, 0, 0x0123) ? -1 : 1; }
+      idx += 4;
+      ++s32_a;  // value_a = *s32_a++; // block_a;
+      ++s32_b;  // value_b = *s32_b++; // block_b;
+    } while (idx + 4 <= len);
+  }
+  while (idx < len) {
+    auto const a = ptr1[idx];
+    auto const b = ptr2[idx];
+    if (a != b) { return static_cast<int>(a) - static_cast<int>(b); }
+    ++idx;
+  }
+
+  if (len1 < len2) return -1;
+  if (len2 < len1) return 1;
+  return 0;
+}
+#endif
+
 /**
  * @brief Check if `d_target` appears in a row in `d_strings`.
  *
@@ -357,19 +461,27 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
 
   auto const str_idx  = idx / cudf::detail::warp_size;
   auto const lane_idx = idx % cudf::detail::warp_size;
+  if (lane_idx) { d_results[str_idx] = false; }  // not faster
   if (d_strings.is_null(str_idx)) { return; }
   // get the string for this warp
   auto const d_str = d_strings.element<string_view>(str_idx);
   // each thread of the warp will check just part of the string
   auto found = false;
-  for (auto i = static_cast<size_type>(idx % cudf::detail::warp_size);
-       !found && ((i + d_target.size_bytes()) <= d_str.size_bytes());
+  for (auto i = lane_idx; !found && ((i + d_target.size_bytes()) <= d_str.size_bytes());
        i += cudf::detail::warp_size) {
     // check the target matches this part of the d_str data
-    if (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0) { found = true; }
+    found = (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0);
+    // not faster
+    // found = compare_vload(
+    //           d_target.data(), d_target.size_bytes(), d_str.data() + i, d_target.size_bytes()) ==
+    //           0;
+    // not faster
+    // auto result = warp_reduce(temp_storage).Reduce(found, cub::Max());
+    // found       = result;
   }
   auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max());
   if (lane_idx == 0) { d_results[str_idx] = result; }
+  // if (lane_idx == 0) { d_results[str_idx] = found; }
 }
 
 std::unique_ptr<column> contains_warp_parallel(strings_column_view const& input,
@@ -390,12 +502,10 @@ std::unique_ptr<column> contains_warp_parallel(strings_column_view const& input,
 
   // fill the output with `false` unless the `d_target` is empty
   auto results_view = results->mutable_view();
-  thrust::fill(rmm::exec_policy(stream),
-               results_view.begin<bool>(),
-               results_view.end<bool>(),
-               d_target.empty());
-
-  if (!d_target.empty()) {
+  if (d_target.empty()) {
+    thrust::fill(
+      rmm::exec_policy_nosync(stream), results_view.begin<bool>(), results_view.end<bool>(), true);
+  } else {
     // launch warp per string
     auto const d_strings     = column_device_view::create(input.parent(), stream);
     constexpr int block_size = 256;

From 532a0cb88a6d8ca2a46127012852b2984d809365 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 13 May 2024 12:41:28 -0400
Subject: [PATCH 2/4] remove commented out code

---
 cpp/src/strings/search/find.cu | 134 ++++-----------------------------
 1 file changed, 14 insertions(+), 120 deletions(-)

diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
index 036786ab39d..af9049b8228 100644
--- a/cpp/src/strings/search/find.cu
+++ b/cpp/src/strings/search/find.cu
@@ -336,110 +336,6 @@ std::unique_ptr<column> find(strings_column_view const& input,
 namespace detail {
 namespace {
 
-#if 0
-// neater than compare_vector1() but not faster
-struct vloader_unaligned {
-  uint32_t const* ptr;
-  int const offset{};
-  uint32_t value{};
-
-  __device__ inline vloader_unaligned(unsigned char const* in)
-    : offset(static_cast<int>(reinterpret_cast<uintptr_t>(in) & 3) * 8)
-  {
-    ptr   = reinterpret_cast<uint32_t const*>(in - offset / 8);
-    value = *ptr++;
-  }
-
-  //__device__ inline uint32_t next()
-  //{
-  //  uint32_t const block = *ptr++;
-  //  uint32_t const rtn   = __funnelshift_r(value, block, offset);
-  //  value                = block;
-  //  return rtn;
-  //}
-
-  // way cool, but not faster
-  __device__ inline uint64_t next2()
-  {
-    uint32_t const block1 = *ptr++;
-    uint32_t const block2 = *ptr++;
-    uint32_t const rtn1   = __funnelshift_r(value, block1, offset);
-    uint32_t const rtn2   = __funnelshift_r(block1, block2, offset);
-    value                 = block2;
-    return static_cast<uint64_t>(rtn1) << 32 | static_cast<uint64_t>(rtn2);
-  }
-};
-
-__device__ inline int compare_vload(const char* data1, int len1, const char* data2, int len2)
-{
-  unsigned char const* ptr1 = reinterpret_cast<const unsigned char*>(data1);
-  unsigned char const* ptr2 = reinterpret_cast<const unsigned char*>(data2);
-
-  int const len = min(len1, len2);
-  int idx       = 0;
-
-  if (len >= 8) {
-    vloader_unaligned loader1{ptr1};
-    vloader_unaligned loader2{ptr2};
-    do {
-      auto const a = loader1.next2();
-      auto const b = loader2.next2();
-      // if (a != b) { return __byte_perm(a, 0, 0x0123) < __byte_perm(b, 0, 0x0123) ? -1 : 1; }
-      if (a != b) { return 1; }
-      idx += sizeof(a);
-    } while (idx + 8 <= len);  // 4
-  }
-
-  while (idx < len) {
-    auto const a = ptr1[idx];
-    auto const b = ptr2[idx];
-    if (a != b) { return static_cast<int>(a) - static_cast<int>(b); }
-    ++idx;
-  }
-  if (len1 < len2) return -1;
-  if (len2 < len1) return 1;
-  return 0;
-}
-
-
-// vector loading is not showing up faster even for long strings
-__device__ int compare_vector1(const char* data1, int len1, const char* data2, int len2)
-{
-  unsigned char const* ptr1 = reinterpret_cast<const unsigned char*>(data1);
-  unsigned char const* ptr2 = reinterpret_cast<const unsigned char*>(data2);
-
-  int const len = min(len1, len2);
-  int idx       = 0;
-
-  if (len >= 8) {
-    uint32_t const align_a  = (3 & reinterpret_cast<uintptr_t>(ptr1));
-    uint32_t const align_b  = (3 & reinterpret_cast<uintptr_t>(ptr2));
-    auto s32_a              = reinterpret_cast<uint32_t const*>(ptr1 - align_a) + 1;
-    auto s32_b              = reinterpret_cast<uint32_t const*>(ptr2 - align_b) + 1;
-    uint32_t const offset_a = align_a * 8;
-    uint32_t const offset_b = align_b * 8;
-    do {
-      uint32_t const a = __funnelshift_r(*(s32_a - 1), *s32_a, offset_a);
-      uint32_t const b = __funnelshift_r(*(s32_b - 1), *s32_b, offset_b);
-      if (a != b) { return __byte_perm(a, 0, 0x0123) < __byte_perm(b, 0, 0x0123) ? -1 : 1; }
-      idx += 4;
-      ++s32_a;  // value_a = *s32_a++; // block_a;
-      ++s32_b;  // value_b = *s32_b++; // block_b;
-    } while (idx + 4 <= len);
-  }
-  while (idx < len) {
-    auto const a = ptr1[idx];
-    auto const b = ptr2[idx];
-    if (a != b) { return static_cast<int>(a) - static_cast<int>(b); }
-    ++idx;
-  }
-
-  if (len1 < len2) return -1;
-  if (len2 < len1) return 1;
-  return 0;
-}
-#endif
-
 /**
  * @brief Check if `d_target` appears in a row in `d_strings`.
  *
@@ -462,27 +358,26 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
 
   auto const str_idx  = idx / cudf::detail::warp_size;
   auto const lane_idx = idx % cudf::detail::warp_size;
-  if (lane_idx) { d_results[str_idx] = false; }  // not faster
+  if (lane_idx) { d_results[str_idx] = false; }
   if (d_strings.is_null(str_idx)) { return; }
   // get the string for this warp
   auto const d_str = d_strings.element<string_view>(str_idx);
-  // each thread of the warp will check just part of the string
+  // each warp processes 4 starting bytes
   auto found = false;
-  for (auto i = lane_idx; !found && ((i + d_target.size_bytes()) <= d_str.size_bytes());
-       i += cudf::detail::warp_size) {
+  for (auto i = lane_idx * 4; !found && ((i + d_target.size_bytes()) <= d_str.size_bytes());
+       i += cudf::detail::warp_size * 4) {
     // check the target matches this part of the d_str data
-    found = (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0);
-    // not faster
-    // found = compare_vload(
-    //           d_target.data(), d_target.size_bytes(), d_str.data() + i, d_target.size_bytes()) ==
-    //           0;
-    // not faster
-    // auto result = warp_reduce(temp_storage).Reduce(found, cub::Max());
-    // found       = result;
+    // this is definitely faster for very long strings > 128B
+    for (auto j = 0; j < 4; j++) {
+      if (((i + j + d_target.size_bytes()) <= d_str.size_bytes()) &&
+          d_target.compare(d_str.data() + i + j, d_target.size_bytes()) == 0) {
+        found = true;
+      }
+    }
   }
+
   auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max());
   if (lane_idx == 0) { d_results[str_idx] = result; }
-  // if (lane_idx == 0) { d_results[str_idx] = found; }
 }
 
 std::unique_ptr<column> contains_warp_parallel(strings_column_view const& input,
@@ -571,9 +466,8 @@ std::unique_ptr<column> contains_fn(strings_column_view const& strings,
                     thrust::make_counting_iterator<size_type>(strings_count),
                     d_results,
                     [d_strings, pfn, d_target] __device__(size_type idx) {
-                      if (!d_strings.is_null(idx))
-                        return bool{pfn(d_strings.element<string_view>(idx), d_target)};
-                      return false;
+                      return !d_strings.is_null(idx) &&
+                             bool{pfn(d_strings.element<string_view>(idx), d_target)};
                     });
   results->set_null_count(strings.null_count());
   return results;

From 656a0c82a483b1fa3d53bc3b54e64606b40c93f7 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 14 May 2024 11:10:16 -0400
Subject: [PATCH 3/4] remove unneeded output init

---
 cpp/benchmarks/string/find.cpp | 13 +++++++------
 cpp/src/strings/search/find.cu | 11 ++++++-----
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp
index 188c3a7d8e9..c037290b550 100644
--- a/cpp/benchmarks/string/find.cpp
+++ b/cpp/benchmarks/string/find.cpp
@@ -45,22 +45,23 @@ static void bench_find_string(nvbench::state& state)
     state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  auto stream = cudf::get_default_stream();
-  auto col    = build_input_column(n_rows, row_width, hit_rate);
-  auto input  = cudf::strings_column_view(col->view());
+  auto const stream = cudf::get_default_stream();
+  auto const col    = build_input_column(n_rows, row_width, hit_rate);
+  auto const input  = cudf::strings_column_view(col->view());
 
   std::vector<std::string> h_targets({"5W", "5W43", "0987 5W43"});
   cudf::string_scalar target(h_targets[2]);
   cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
-  auto chars_size = input.chars_size(stream);
+  auto const chars_size = input.chars_size(stream);
   state.add_element_count(chars_size, "chars_size");
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);
-  if (api.substr(0, 4) == "find")
+  if (api.substr(0, 4) == "find") {
     state.add_global_memory_writes<nvbench::int32_t>(input.size());
-  else
+  } else {
     state.add_global_memory_writes<nvbench::int8_t>(input.size());
+  }
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     if (api == "find") {
diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
index af9049b8228..45eba39f413 100644
--- a/cpp/src/strings/search/find.cu
+++ b/cpp/src/strings/search/find.cu
@@ -358,17 +358,18 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
 
   auto const str_idx  = idx / cudf::detail::warp_size;
   auto const lane_idx = idx % cudf::detail::warp_size;
-  if (lane_idx) { d_results[str_idx] = false; }
   if (d_strings.is_null(str_idx)) { return; }
   // get the string for this warp
   auto const d_str = d_strings.element<string_view>(str_idx);
   // each warp processes 4 starting bytes
-  auto found = false;
-  for (auto i = lane_idx * 4; !found && ((i + d_target.size_bytes()) <= d_str.size_bytes());
-       i += cudf::detail::warp_size * 4) {
+  auto constexpr bytes_per_warp = 4;
+  auto found                    = false;
+  for (auto i = lane_idx * bytes_per_warp;
+       !found && ((i + d_target.size_bytes()) <= d_str.size_bytes());
+       i += cudf::detail::warp_size * bytes_per_warp) {
     // check the target matches this part of the d_str data
     // this is definitely faster for very long strings > 128B
-    for (auto j = 0; j < 4; j++) {
+    for (auto j = 0; j < bytes_per_warp; j++) {
       if (((i + j + d_target.size_bytes()) <= d_str.size_bytes()) &&
           d_target.compare(d_str.data() + i + j, d_target.size_bytes()) == 0) {
         found = true;

From f89151aa529ec811ba5cfd617270fb2c991649d3 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 16 May 2024 10:38:17 -0400
Subject: [PATCH 4/4] place state.exec inside each if-stmt

---
 cpp/benchmarks/string/find.cpp | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp
index c037290b550..a9c620e4bf0 100644
--- a/cpp/benchmarks/string/find.cpp
+++ b/cpp/benchmarks/string/find.cpp
@@ -63,19 +63,23 @@ static void bench_find_string(nvbench::state& state)
     state.add_global_memory_writes<nvbench::int8_t>(input.size());
   }
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    if (api == "find") {
-      cudf::strings::find(input, target);
-    } else if (api == "find_multi") {
+  if (api == "find") {
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::find(input, target); });
+  } else if (api == "find_multi") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
       cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
-    } else if (api == "contains") {
-      cudf::strings::contains(input, target);
-    } else if (api == "starts_with") {
-      cudf::strings::starts_with(input, target);
-    } else if (api == "ends_with") {
-      cudf::strings::ends_with(input, target);
-    }
-  });
+    });
+  } else if (api == "contains") {
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::contains(input, target); });
+  } else if (api == "starts_with") {
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); });
+  } else if (api == "ends_with") {
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); });
+  }
 }
 
 NVBENCH_BENCH(bench_find_string)