Skip to content

Commit

Permalink
Add string.split APIs to pylibcudf (#16940)
Browse files Browse the repository at this point in the history
Contributes to #15162

Includes `split/split.pxd` and `split/partition.pxd`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - https://github.com/brandon-b-miller

URL: #16940
  • Loading branch information
mroeschke authored Oct 2, 2024
1 parent 6c9064a commit bac81cb
Show file tree
Hide file tree
Showing 19 changed files with 750 additions and 232 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ strings
repeat
replace
slice
split
strip
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=====
split
=====

.. automodule:: pylibcudf.strings.split
:members:
59 changes: 10 additions & 49 deletions python/cudf/cudf/_lib/strings/split/partition.pyx
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.split.partition cimport (
partition as cpp_partition,
rpartition as cpp_rpartition,
)
from pylibcudf.libcudf.table.table cimport table

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport data_from_unique_ptr

import pylibcudf as plc


@acquire_spill_lock()
Expand All @@ -25,25 +14,11 @@ def partition(Column source_strings,
Returns data by splitting the `source_strings`
column at the first occurrence of the specified `py_delimiter`.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_partition(
source_view,
scalar_str[0]
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.partition.partition(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))


@acquire_spill_lock()
Expand All @@ -53,22 +28,8 @@ def rpartition(Column source_strings,
Returns data by splitting the `source_strings`
column at the last occurrence of the specified `py_delimiter`.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_rpartition(
source_view,
scalar_str[0]
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.partition.rpartition(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
217 changes: 54 additions & 163 deletions python/cudf/cudf/_lib/strings/split/split.pyx
Original file line number Diff line number Diff line change
@@ -1,33 +1,12 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from pylibcudf.libcudf.strings.regex_program cimport regex_program
from pylibcudf.libcudf.strings.split.split cimport (
rsplit as cpp_rsplit,
rsplit_re as cpp_rsplit_re,
rsplit_record as cpp_rsplit_record,
rsplit_record_re as cpp_rsplit_record_re,
split as cpp_split,
split_re as cpp_split_re,
split_record as cpp_split_record,
split_record_re as cpp_split_record_re,
)
from pylibcudf.libcudf.table.table cimport table
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport data_from_unique_ptr

import pylibcudf as plc


@acquire_spill_lock()
Expand All @@ -39,26 +18,12 @@ def split(Column source_strings,
column around the specified `py_delimiter`.
The split happens from the beginning.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_split(
source_view,
scalar_str[0],
maxsplit
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.split.split(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value,
maxsplit,
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))


@acquire_spill_lock()
Expand All @@ -70,25 +35,12 @@ def split_record(Column source_strings,
column around the specified `py_delimiter`.
The split happens from the beginning.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_split_record(
source_view,
scalar_str[0],
maxsplit
))

return Column.from_unique_ptr(
move(c_result),
plc_column = plc.strings.split.split.split_record(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value,
maxsplit,
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -100,26 +52,12 @@ def rsplit(Column source_strings,
column around the specified `py_delimiter`.
The split happens from the end.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_rsplit(
source_view,
scalar_str[0],
maxsplit
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.split.rsplit(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value,
maxsplit,
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))


@acquire_spill_lock()
Expand All @@ -131,25 +69,12 @@ def rsplit_record(Column source_strings,
column around the specified `py_delimiter`.
The split happens from the end.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
delimiter.get_raw_ptr()
)

with nogil:
c_result = move(cpp_rsplit_record(
source_view,
scalar_str[0],
maxsplit
))

return Column.from_unique_ptr(
move(c_result),
plc_column = plc.strings.split.split.rsplit_record(
source_strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value,
maxsplit,
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -160,24 +85,15 @@ def split_re(Column source_strings,
Returns data by splitting the `source_strings`
column around the delimiters identified by `pattern`.
"""
cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef string pattern_string = <string>str(pattern).encode()
cdef regex_flags c_flags = regex_flags.DEFAULT
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_split_re(
source_view,
dereference(c_prog),
maxsplit
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.split.split_re(
source_strings.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
str(pattern),
plc.strings.regex_flags.RegexFlags.DEFAULT,
),
maxsplit,
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))


@acquire_spill_lock()
Expand All @@ -189,24 +105,15 @@ def rsplit_re(Column source_strings,
column around the delimiters identified by `pattern`.
The delimiters are searched starting from the end of each string.
"""
cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef string pattern_string = <string>str(pattern).encode()
cdef regex_flags c_flags = regex_flags.DEFAULT
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_rsplit_re(
source_view,
dereference(c_prog),
maxsplit
))

return data_from_unique_ptr(
move(c_result),
column_names=range(0, c_result.get()[0].num_columns())
plc_table = plc.strings.split.split.rsplit_re(
source_strings.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
str(pattern),
plc.strings.regex_flags.RegexFlags.DEFAULT,
),
maxsplit,
)
return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))


@acquire_spill_lock()
Expand All @@ -217,23 +124,15 @@ def split_record_re(Column source_strings,
Returns a Column by splitting the `source_strings`
column around the delimiters identified by `pattern`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef string pattern_string = <string>str(pattern).encode()
cdef regex_flags c_flags = regex_flags.DEFAULT
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_split_record_re(
source_view,
dereference(c_prog),
maxsplit
))

return Column.from_unique_ptr(
move(c_result),
plc_column = plc.strings.split.split.split_record_re(
source_strings.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
str(pattern),
plc.strings.regex_flags.RegexFlags.DEFAULT,
),
maxsplit,
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings,
column around the delimiters identified by `pattern`.
The delimiters are searched starting from the end of each string.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef string pattern_string = <string>str(pattern).encode()
cdef regex_flags c_flags = regex_flags.DEFAULT
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_rsplit_record_re(
source_view,
dereference(c_prog),
maxsplit
))

return Column.from_unique_ptr(
move(c_result),
plc_column = plc.strings.split.split.rsplit_record_re(
source_strings.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create(
str(pattern),
plc.strings.regex_flags.RegexFlags.DEFAULT,
),
maxsplit,
)
return Column.from_pylibcudf(plc_column)
Loading

0 comments on commit bac81cb

Please sign in to comment.