Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Support skipna parameter in GroupBy mean and sum #60741

Merged
merged 3 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ Other enhancements
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def group_sum(
result_mask: np.ndarray | None = ...,
min_count: int = ...,
is_datetimelike: bool = ...,
skipna: bool = ...,
) -> None: ...
def group_prod(
out: np.ndarray, # int64float_t[:, ::1]
Expand Down Expand Up @@ -115,6 +116,7 @@ def group_mean(
is_datetimelike: bool = ..., # bint
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_ohlc(
out: np.ndarray, # floatingintuint_t[:, ::1]
Expand Down
48 changes: 47 additions & 1 deletion pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -700,13 +700,14 @@ def group_sum(
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0 using Kahan summation
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
sum_t val, t, y
sum_t val, t, y, nan_val
sum_t[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
Expand All @@ -722,6 +723,15 @@ def group_sum(
compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

N, K = (<object>values).shape
if uses_mask:
nan_val = 0
elif is_datetimelike:
nan_val = NPY_NAT
elif sum_t is int64_t or sum_t is uint64_t:
# This has no effect as int64 can't be nan. Setting to 0 to avoid type error
nan_val = 0
else:
nan_val = NAN

with nogil(sum_t is not object):
for i in range(N):
Expand All @@ -734,6 +744,16 @@ def group_sum(
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
else:
Expand Down Expand Up @@ -765,6 +785,11 @@ def group_sum(
# because of no gil
compensation[lab, j] = 0
sumx[lab, j] = t
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
sumx[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
Expand Down Expand Up @@ -1100,6 +1125,7 @@ def group_mean(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""
Compute the mean per label given a label assignment for each value.
Expand All @@ -1125,6 +1151,8 @@ def group_mean(
Mask of the input values.
result_mask : ndarray[bool, ndim=2], optional
Mask of the out array
skipna : bool, optional
If True, ignore nans in `values`.

Notes
-----
Expand Down Expand Up @@ -1168,6 +1196,16 @@ def group_mean(
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
elif is_datetimelike:
Expand All @@ -1191,6 +1229,14 @@ def group_mean(
# because of no gil
compensation[lab, j] = 0.
sumx[lab, j] = t
elif not skipna:
# Set the nobs to 0 so that in case of datetimelike,
# dividing NPY_NAT by nobs may not result in a NPY_NAT
nobs[lab, j] = 0
if uses_mask:
result_mask[lab, j] = True
else:
sumx[lab, j] = nan_val

for i in range(ncounts):
for j in range(K):
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/_numba/kernels/mean_.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,10 @@ def grouped_mean(
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
skipna: bool,
) -> tuple[np.ndarray, list[int]]:
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
values, result_dtype, labels, ngroups, skipna
)

# Post-processing, replace sums that don't satisfy min_periods
Expand Down
14 changes: 12 additions & 2 deletions pandas/core/_numba/kernels/sum_.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def grouped_kahan_sum(
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
skipna: bool,
) -> tuple[
np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray
]:
Expand All @@ -180,7 +181,15 @@ def grouped_kahan_sum(
lab = labels[i]
val = values[i]

if lab < 0:
if lab < 0 or np.isnan(output[lab]):
continue

if not skipna and np.isnan(val):
output[lab] = np.nan
nobs_arr[lab] += 1
comp_arr[lab] = np.nan
consecutive_counts[lab] = 1
prev_vals[lab] = np.nan
continue

sum_x = output[lab]
Expand Down Expand Up @@ -219,11 +228,12 @@ def grouped_sum(
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
skipna: bool,
) -> tuple[np.ndarray, list[int]]:
na_pos = []

output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
values, result_dtype, labels, ngroups, skipna
)

# Post-processing, replace sums that don't satisfy min_periods
Expand Down
74 changes: 72 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,61 @@ class providing the base-class of operations.
{example}
"""

_groupby_agg_method_skipna_engine_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
Include only float, int, boolean columns.

.. versionchanged:: 2.0.0

numeric_only no longer accepts ``None``.

min_count : int, default {mc}
The required number of valid values to perform the operation. If fewer
than ``min_count`` non-NA values are present the result will be NA.

skipna : bool, default {s}
Exclude NA/null values. If the entire group is NA and ``skipna`` is
``True``, the result will be NA.

.. versionchanged:: 3.0.0

engine : str, default None {e}
* ``'cython'`` : Runs rolling apply through C-extensions from cython.
* ``'numba'`` : Runs rolling apply through JIT compiled code from numba.
Only available when ``raw`` is set to ``True``.
* ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``

engine_kwargs : dict, default None {ek}
* For ``'cython'`` engine, there are no accepted ``engine_kwargs``
* For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
and ``parallel`` dictionary keys. The values must either be ``True`` or
``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
applied to both the ``func`` and the ``apply`` groupby aggregation.

Returns
-------
Series or DataFrame
Computed {fname} of values within each group.

See Also
--------
SeriesGroupBy.min : Return the min of the group values.
DataFrameGroupBy.min : Return the min of the group values.
SeriesGroupBy.max : Return the max of the group values.
DataFrameGroupBy.max : Return the max of the group values.
SeriesGroupBy.sum : Return the sum of the group values.
DataFrameGroupBy.sum : Return the sum of the group values.

Examples
--------
{example}
"""

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Expand Down Expand Up @@ -2091,6 +2146,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
def mean(
self,
numeric_only: bool = False,
skipna: bool = True,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
Expand All @@ -2106,6 +2162,12 @@ def mean(

numeric_only no longer accepts ``None`` and defaults to ``False``.

skipna : bool, default True
Exclude NA/null values. If an entire group is NA, the result
will be NA.

.. versionadded:: 3.0.0

engine : str, default None
* ``'cython'`` : Runs the operation through C-extensions from cython.
* ``'numba'`` : Runs the operation through JIT compiled code from numba.
Expand Down Expand Up @@ -2172,12 +2234,16 @@ def mean(
executor.float_dtype_mapping,
engine_kwargs,
min_periods=0,
skipna=skipna,
)
else:
result = self._cython_agg_general(
"mean",
alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
alt=lambda x: Series(x, copy=False).mean(
numeric_only=numeric_only, skipna=skipna
),
numeric_only=numeric_only,
skipna=skipna,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -2817,10 +2883,11 @@ def size(self) -> DataFrame | Series:

@final
@doc(
_groupby_agg_method_engine_template,
_groupby_agg_method_skipna_engine_template,
fname="sum",
no=False,
mc=0,
s=True,
e=None,
ek=None,
example=dedent(
Expand Down Expand Up @@ -2862,6 +2929,7 @@ def sum(
self,
numeric_only: bool = False,
min_count: int = 0,
skipna: bool = True,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
Expand All @@ -2873,6 +2941,7 @@ def sum(
executor.default_dtype_mapping,
engine_kwargs,
min_periods=min_count,
skipna=skipna,
)
else:
# If we are grouping on categoricals we want unobserved categories to
Expand All @@ -2884,6 +2953,7 @@ def sum(
min_count=min_count,
alias="sum",
npfunc=np.sum,
skipna=skipna,
)

return result
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/groupby/aggregate/test_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,23 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["sum", "mean"])
def test_multifunc_numba_vs_cython_frame_noskipna(func):
    # The numba engine must agree with the cython engine when NaNs are
    # propagated (skipna=False) through groupby sum/mean.
    pytest.importorskip("numba")
    df = DataFrame(
        {
            0: ["a", "a", "b", "b", "a"],
            1: [1.0, np.nan, 3.0, 4.0, 5.0],
            2: [1, 2, 3, 4, 5],
        },
        columns=[0, 1, 2],
    )
    gb = df.groupby(0)
    numba_result = gb.agg(func, skipna=False, engine="numba")
    cython_result = gb.agg(func, skipna=False, engine="cython")
    tm.assert_frame_equal(numba_result, cython_result)


@pytest.mark.parametrize(
"agg_kwargs,expected_func",
[
Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/groupby/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,10 @@ def test_frame_consistency(groupby_func):
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("mean", "std", "sum", "var"):
elif groupby_func in ("sum", "mean"):
exclude_expected = {"axis", "kwargs"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("std", "var"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("median", "prod", "sem"):
Expand Down Expand Up @@ -234,7 +237,10 @@ def test_series_consistency(request, groupby_func):
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("mean", "std", "sum", "var"):
elif groupby_func in ("sum", "mean"):
exclude_expected = {"axis", "kwargs"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("std", "var"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("median", "prod", "sem"):
Expand Down
Loading
Loading