ENH: Support skipna parameter in GroupBy mean and sum (#60741)
* ENH: Support skipna parameter in GroupBy mean and sum

* Move numba tests to test_numba.py

* Fix docstring and failing future string test
snitish authored Jan 21, 2025
1 parent 7234104 commit 42bf375
Showing 9 changed files with 255 additions and 10 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -60,6 +60,7 @@ Other enhancements
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
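The whatsnew entry above can be illustrated with a short usage sketch (not part of the diff; the printed results assume the behaviour this commit describes, with ``skipna`` defaulting to ``True``):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})
grouped = df.groupby("key")["val"]

# Default: NaN values are ignored within each group.
print(grouped.sum())               # a -> 1.0, b -> 3.0
# With skipna=False, a single NaN makes the whole group's result NA.
print(grouped.sum(skipna=False))   # a -> NaN, b -> 3.0
print(grouped.mean(skipna=False))  # a -> NaN, b -> 3.0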
2 changes: 2 additions & 0 deletions pandas/_libs/groupby.pyi
@@ -66,6 +66,7 @@ def group_sum(
result_mask: np.ndarray | None = ...,
min_count: int = ...,
is_datetimelike: bool = ...,
skipna: bool = ...,
) -> None: ...
def group_prod(
out: np.ndarray, # int64float_t[:, ::1]
@@ -115,6 +116,7 @@ def group_mean(
is_datetimelike: bool = ..., # bint
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_ohlc(
out: np.ndarray, # floatingintuint_t[:, ::1]
48 changes: 47 additions & 1 deletion pandas/_libs/groupby.pyx
@@ -700,13 +700,14 @@ def group_sum(
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0 using Kahan summation
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
sum_t val, t, y
sum_t val, t, y, nan_val
sum_t[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
@@ -722,6 +723,15 @@
compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

N, K = (<object>values).shape
if uses_mask:
nan_val = 0
elif is_datetimelike:
nan_val = NPY_NAT
elif sum_t is int64_t or sum_t is uint64_t:
# This has no effect as int64 can't be nan. Setting to 0 to avoid type error
nan_val = 0
else:
nan_val = NAN

with nogil(sum_t is not object):
for i in range(N):
@@ -734,6 +744,16 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
else:
@@ -765,6 +785,11 @@
# because of no gil
compensation[lab, j] = 0
sumx[lab, j] = t
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
sumx[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
@@ -1100,6 +1125,7 @@ def group_mean(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""
Compute the mean per label given a label assignment for each value.
@@ -1125,6 +1151,8 @@
Mask of the input values.
result_mask : ndarray[bool, ndim=2], optional
Mask of the out array
skipna : bool, optional
If True, ignore nans in `values`.

Notes
-----
@@ -1168,6 +1196,16 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
elif is_datetimelike:
@@ -1191,6 +1229,14 @@
# because of no gil
compensation[lab, j] = 0.
sumx[lab, j] = t
elif not skipna:
# Set nobs to 0 so that, in the datetimelike case, NPY_NAT is never
# divided by nobs (which might not result in NPY_NAT)
nobs[lab, j] = 0
if uses_mask:
result_mask[lab, j] = True
else:
sumx[lab, j] = nan_val

for i in range(ncounts):
for j in range(K):
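As an illustration of the NPY_NAT handling above (not part of the diff; assumes datetimelike values propagate NaT under ``skipna=False`` as the kernel comments describe):

import pandas as pd

s = pd.Series(pd.to_datetime(["2025-01-01", None, "2025-01-03"]))
g = s.groupby([0, 0, 1])
print(g.mean())               # group 0 -> 2025-01-01, group 1 -> 2025-01-03
print(g.mean(skipna=False))   # group 0 -> NaT,        group 1 -> 2025-01-03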
3 changes: 2 additions & 1 deletion pandas/core/_numba/kernels/mean_.py
@@ -169,9 +169,10 @@ def grouped_mean(
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
skipna: bool,
) -> tuple[np.ndarray, list[int]]:
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
values, result_dtype, labels, ngroups, skipna
)

# Post-processing, replace sums that don't satisfy min_periods
14 changes: 12 additions & 2 deletions pandas/core/_numba/kernels/sum_.py
@@ -165,6 +165,7 @@ def grouped_kahan_sum(
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
skipna: bool,
) -> tuple[
np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray
]:
@@ -180,7 +181,15 @@
lab = labels[i]
val = values[i]

if lab < 0:
if lab < 0 or np.isnan(output[lab]):
continue

if not skipna and np.isnan(val):
output[lab] = np.nan
nobs_arr[lab] += 1
comp_arr[lab] = np.nan
consecutive_counts[lab] = 1
prev_vals[lab] = np.nan
continue

sum_x = output[lab]
@@ -219,11 +228,12 @@ def grouped_sum(
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
skipna: bool,
) -> tuple[np.ndarray, list[int]]:
na_pos = []

output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
values, result_dtype, labels, ngroups, skipna
)

# Post-processing, replace sums that don't satisfy min_periods
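A pure-Python sketch of the ``skipna`` handling added to ``grouped_kahan_sum`` above (illustrative only; the shipped kernel is Numba-compiled and also tracks ``nobs``, compensation carry-over and consecutive counts for ``min_periods``):

import numpy as np

def grouped_kahan_sum_sketch(values, labels, ngroups, skipna=True):
    # Compensated (Kahan) summation per group label.
    sums = np.zeros(ngroups)
    comp = np.zeros(ngroups)
    for val, lab in zip(values, labels):
        if lab < 0 or np.isnan(sums[lab]):
            # Skip unassigned rows and groups already poisoned by NaN.
            continue
        if np.isnan(val):
            if not skipna:
                sums[lab] = np.nan  # propagate NA to the whole group
            continue
        y = val - comp[lab]
        t = sums[lab] + y
        comp[lab] = (t - sums[lab]) - y
        sums[lab] = t
    return sums

vals = np.array([1.0, np.nan, 3.0, 4.0, 5.0])
labs = np.array([0, 0, 1, 1, 0])
print(grouped_kahan_sum_sketch(vals, labs, 2))                # [6. 7.]
print(grouped_kahan_sum_sketch(vals, labs, 2, skipna=False))  # [nan  7.]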
74 changes: 72 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -214,6 +214,61 @@ class providing the base-class of operations.
{example}
"""

_groupby_agg_method_skipna_engine_template = """
Compute {fname} of group values.
Parameters
----------
numeric_only : bool, default {no}
Include only float, int, boolean columns.
.. versionchanged:: 2.0.0
numeric_only no longer accepts ``None``.
min_count : int, default {mc}
The required number of valid values to perform the operation. If fewer
than ``min_count`` non-NA values are present the result will be NA.
skipna : bool, default {s}
Exclude NA/null values. If the entire group is NA and ``skipna`` is
``True``, the result will be NA.
.. versionadded:: 3.0.0
engine : str, default None {e}
* ``'cython'`` : Runs the operation through C-extensions from cython.
* ``'numba'`` : Runs the operation through JIT compiled code from numba.
* ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``
engine_kwargs : dict, default None {ek}
* For ``'cython'`` engine, there are no accepted ``engine_kwargs``
* For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
and ``parallel`` dictionary keys. The values must either be ``True`` or
``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
applied to both the ``func`` and the ``apply`` groupby aggregation.
Returns
-------
Series or DataFrame
Computed {fname} of values within each group.
See Also
--------
SeriesGroupBy.min : Return the min of the group values.
DataFrameGroupBy.min : Return the min of the group values.
SeriesGroupBy.max : Return the max of the group values.
DataFrameGroupBy.max : Return the max of the group values.
SeriesGroupBy.sum : Return the sum of the group values.
DataFrameGroupBy.sum : Return the sum of the group values.
Examples
--------
{example}
"""

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.
@@ -2091,6 +2146,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
def mean(
self,
numeric_only: bool = False,
skipna: bool = True,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
@@ -2106,6 +2162,12 @@ def mean(
numeric_only no longer accepts ``None`` and defaults to ``False``.
skipna : bool, default True
Exclude NA/null values. If an entire group is NA, the result
will be NA.
.. versionadded:: 3.0.0
engine : str, default None
* ``'cython'`` : Runs the operation through C-extensions from cython.
* ``'numba'`` : Runs the operation through JIT compiled code from numba.
@@ -2172,12 +2234,16 @@ def mean(
executor.float_dtype_mapping,
engine_kwargs,
min_periods=0,
skipna=skipna,
)
else:
result = self._cython_agg_general(
"mean",
alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
alt=lambda x: Series(x, copy=False).mean(
numeric_only=numeric_only, skipna=skipna
),
numeric_only=numeric_only,
skipna=skipna,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -2817,10 +2883,11 @@ def size(self) -> DataFrame | Series:

@final
@doc(
_groupby_agg_method_engine_template,
_groupby_agg_method_skipna_engine_template,
fname="sum",
no=False,
mc=0,
s=True,
e=None,
ek=None,
example=dedent(
@@ -2862,6 +2929,7 @@ def sum(
self,
numeric_only: bool = False,
min_count: int = 0,
skipna: bool = True,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
@@ -2873,6 +2941,7 @@
executor.default_dtype_mapping,
engine_kwargs,
min_periods=min_count,
skipna=skipna,
)
else:
# If we are grouping on categoricals we want unobserved categories to
@@ -2884,6 +2953,7 @@
min_count=min_count,
alias="sum",
npfunc=np.sum,
skipna=skipna,
)

return result
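The engine dispatch above means both code paths honour the new parameter; a minimal sketch (requires the optional ``numba`` dependency; not part of the diff):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})
cython_res = df.groupby("key").mean(skipna=False)                 # default engine
numba_res = df.groupby("key").mean(skipna=False, engine="numba")
print(cython_res.equals(numba_res))  # True; NaNs in the same positions compare equal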
17 changes: 17 additions & 0 deletions pandas/tests/groupby/aggregate/test_numba.py
@@ -186,6 +186,23 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["sum", "mean"])
def test_multifunc_numba_vs_cython_frame_noskipna(func):
pytest.importorskip("numba")
data = DataFrame(
{
0: ["a", "a", "b", "b", "a"],
1: [1.0, np.nan, 3.0, 4.0, 5.0],
2: [1, 2, 3, 4, 5],
},
columns=[0, 1, 2],
)
grouped = data.groupby(0)
result = grouped.agg(func, skipna=False, engine="numba")
expected = grouped.agg(func, skipna=False, engine="cython")
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"agg_kwargs,expected_func",
[
10 changes: 8 additions & 2 deletions pandas/tests/groupby/test_api.py
@@ -176,7 +176,10 @@ def test_frame_consistency(groupby_func):
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("mean", "std", "sum", "var"):
elif groupby_func in ("sum", "mean"):
exclude_expected = {"axis", "kwargs"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("std", "var"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("median", "prod", "sem"):
@@ -234,7 +237,10 @@ def test_series_consistency(request, groupby_func):
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("mean", "std", "sum", "var"):
elif groupby_func in ("sum", "mean"):
exclude_expected = {"axis", "kwargs"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("std", "var"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("median", "prod", "sem"):