Skip to content

Commit

Permalink
Implement string join in cudf-polars (#17755)
Browse files Browse the repository at this point in the history
Adds a small new string feature: support for string join (`StringFunction.Name.ConcatVertical`) in cudf-polars.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17755
  • Loading branch information
wence- authored Jan 28, 2025
1 parent 328605f commit d4e94ec
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 3 deletions.
18 changes: 16 additions & 2 deletions python/cudf_polars/cudf_polars/dsl/expressions/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def __init__(

def _validate_input(self):
if self.name not in (
StringFunction.Name.ConcatVertical,
StringFunction.Name.Contains,
StringFunction.Name.EndsWith,
StringFunction.Name.Lowercase,
Expand All @@ -125,7 +126,7 @@ def _validate_input(self):
StringFunction.Name.StripCharsEnd,
StringFunction.Name.Uppercase,
):
raise NotImplementedError(f"String function {self.name}")
raise NotImplementedError(f"String function {self.name!r}")
if self.name is StringFunction.Name.Contains:
literal, strict = self.options
if not literal:
Expand Down Expand Up @@ -205,7 +206,20 @@ def do_evaluate(
mapping: Mapping[Expr, Column] | None = None,
) -> Column:
"""Evaluate this expression given a dataframe for context."""
if self.name is StringFunction.Name.Contains:
if self.name is StringFunction.Name.ConcatVertical:
(child,) = self.children
column = child.evaluate(df, context=context, mapping=mapping)
delimiter, ignore_nulls = self.options
if column.obj.null_count() > 0 and not ignore_nulls:
return Column(plc.Column.all_null_like(column.obj, 1))
return Column(
plc.strings.combine.join_strings(
column.obj,
plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())),
plc.interop.from_arrow(pa.scalar(None, type=pa.string())),
)
)
elif self.name is StringFunction.Name.Contains:
child, arg = self.children
column = child.evaluate(df, context=context, mapping=mapping)

Expand Down
9 changes: 8 additions & 1 deletion python/cudf_polars/tests/expressions/test_stringfunction.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

Expand Down Expand Up @@ -454,3 +454,10 @@ def test_string_to_numeric_invalid(numeric_type):
polars_except=pl.exceptions.InvalidOperationError,
cudf_except=pl.exceptions.ComputeError,
)


@pytest.mark.parametrize("ignore_nulls", [False, True])
@pytest.mark.parametrize("delimiter", ["", "/"])
def test_string_join(ldf, ignore_nulls, delimiter):
    """Join column ``a`` and check the query with ``assert_gpu_result_equal``.

    Parametrized over an empty and a non-empty delimiter, and over both
    settings of ``ignore_nulls``.
    """
    # Build the join expression first so the select reads as a single step.
    joined = pl.col("a").str.join(delimiter, ignore_nulls=ignore_nulls)
    query = ldf.select(joined)
    assert_gpu_result_equal(query)

0 comments on commit d4e94ec

Please sign in to comment.