ENH: Support skipna parameter in GroupBy mean and sum (#60741)
* ENH: Support skipna parameter in GroupBy mean and sum

* Move numba tests to test_numba.py

* Fix docstring and failing future string test
snitish authored Jan 21, 2025
1 parent 7234104 commit 42bf375
Showing 9 changed files with 255 additions and 10 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -60,6 +60,7 @@ Other enhancements
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
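The whatsnew entry above can be illustrated with a short usage sketch (not part of the diff; the printed results assume the behaviour this commit describes, with ``skipna`` defaulting to ``True``):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})
grouped = df.groupby("key")["val"]

# Default: NaN values are ignored within each group.
print(grouped.sum())               # a -> 1.0, b -> 3.0
# With skipna=False, a single NaN makes the whole group's result NA.
print(grouped.sum(skipna=False))   # a -> NaN, b -> 3.0
print(grouped.mean(skipna=False))  # a -> NaN, b -> 3.0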
2 changes: 2 additions & 0 deletions pandas/_libs/groupby.pyi
@@ -66,6 +66,7 @@ def group_sum(
result_mask: np.ndarray | None = ...,
min_count: int = ...,
is_datetimelike: bool = ...,
skipna: bool = ...,
) -> None: ...
def group_prod(
out: np.ndarray, # int64float_t[:, ::1]
@@ -115,6 +116,7 @@ def group_mean(
is_datetimelike: bool = ..., # bint
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_ohlc(
out: np.ndarray, # floatingintuint_t[:, ::1]
48 changes: 47 additions & 1 deletion pandas/_libs/groupby.pyx
@@ -700,13 +700,14 @@ def group_sum(
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0 using Kahan summation
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
sum_t val, t, y
sum_t val, t, y, nan_val
sum_t[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
@@ -722,6 +723,15 @@
compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

N, K = (<object>values).shape
if uses_mask:
nan_val = 0
elif is_datetimelike:
nan_val = NPY_NAT
elif sum_t is int64_t or sum_t is uint64_t:
# This has no effect as int64 can't be nan. Setting to 0 to avoid type error
nan_val = 0
else:
nan_val = NAN

with nogil(sum_t is not object):
for i in range(N):
@@ -734,6 +744,16 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
else:
@@ -765,6 +785,11 @@
# because of no gil
compensation[lab, j] = 0
sumx[lab, j] = t
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
sumx[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
@@ -1100,6 +1125,7 @@ def group_mean(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""
Compute the mean per label given a label assignment for each value.
@@ -1125,6 +1151,8 @@
Mask of the input values.
result_mask : ndarray[bool, ndim=2], optional
Mask of the out array
skipna : bool, optional
If True, ignore nans in `values`.

Notes
-----
@@ -1168,6 +1196,16 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
elif is_datetimelike:
@@ -1191,6 +1229,14 @@
# because of no gil
compensation[lab, j] = 0.
sumx[lab, j] = t
elif not skipna:
# Set nobs to 0 so that, in the datetimelike case, NPY_NAT is never
# divided by nobs (which might not result in NPY_NAT)
nobs[lab, j] = 0
if uses_mask:
result_mask[lab, j] = True
else:
sumx[lab, j] = nan_val

for i in range(ncounts):
for j in range(K):
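As an illustration of the NPY_NAT handling above (not part of the diff; assumes datetimelike values propagate NaT under ``skipna=False`` as the kernel comments describe):

import pandas as pd

s = pd.Series(pd.to_datetime(["2025-01-01", None, "2025-01-03"]))
g = s.groupby([0, 0, 1])
print(g.mean())               # group 0 -> 2025-01-01, group 1 -> 2025-01-03
print(g.mean(skipna=False))   # group 0 -> NaT,        group 1 -> 2025-01-03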
3 changes: 2 additions & 1 deletion pandas/core/_numba/kernels/mean_.py
@@ -169,9 +169,10 @@ def grouped_mean(
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
skipna: bool,
) -> tuple[np.ndarray, list[int]]:
output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
values, result_dtype, labels, ngroups, skipna
)

# Post-processing, replace sums that don't satisfy min_periods
14 changes: 12 additions & 2 deletions pandas/core/_numba/kernels/sum_.py
@@ -165,6 +165,7 @@ def grouped_kahan_sum(
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
skipna: bool,
) -> tuple[
np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray
]:
@@ -180,7 +181,15 @@
lab = labels[i]
val = values[i]

if lab < 0:
if lab < 0 or np.isnan(output[lab]):
continue

if not skipna and np.isnan(val):
output[lab] = np.nan
nobs_arr[lab] += 1
comp_arr[lab] = np.nan
consecutive_counts[lab] = 1
prev_vals[lab] = np.nan
continue

sum_x = output[lab]
@@ -219,11 +228,12 @@ def grouped_sum(
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
skipna: bool,
) -> tuple[np.ndarray, list[int]]:
na_pos = []

output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum(
values, result_dtype, labels, ngroups
values, result_dtype, labels, ngroups, skipna
)

# Post-processing, replace sums that don't satisfy min_periods
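A pure-Python sketch of the ``skipna`` handling added to ``grouped_kahan_sum`` above (illustrative only; the shipped kernel is Numba-compiled and also tracks ``nobs``, compensation carry-over and consecutive counts for ``min_periods``):

import numpy as np

def grouped_kahan_sum_sketch(values, labels, ngroups, skipna=True):
    # Compensated (Kahan) summation per group label.
    sums = np.zeros(ngroups)
    comp = np.zeros(ngroups)
    for val, lab in zip(values, labels):
        if lab < 0 or np.isnan(sums[lab]):
            # Skip unassigned rows and groups already poisoned by NaN.
            continue
        if np.isnan(val):
            if not skipna:
                sums[lab] = np.nan  # propagate NA to the whole group
            continue
        y = val - comp[lab]
        t = sums[lab] + y
        comp[lab] = (t - sums[lab]) - y
        sums[lab] = t
    return sums

vals = np.array([1.0, np.nan, 3.0, 4.0, 5.0])
labs = np.array([0, 0, 1, 1, 0])
print(grouped_kahan_sum_sketch(vals, labs, 2))                # [6. 7.]
print(grouped_kahan_sum_sketch(vals, labs, 2, skipna=False))  # [nan  7.]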
74 changes: 72 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -214,6 +214,61 @@ class providing the base-class of operations.
{example}
"""

_groupby_agg_method_skipna_engine_template = """
Compute {fname} of group values.
Parameters
----------
numeric_only : bool, default {no}
Include only float, int, boolean columns.
.. versionchanged:: 2.0.0
numeric_only no longer accepts ``None``.
min_count : int, default {mc}
The required number of valid values to perform the operation. If fewer
than ``min_count`` non-NA values are present the result will be NA.
skipna : bool, default {s}
Exclude NA/null values. If the entire group is NA and ``skipna`` is
``True``, the result will be NA.
.. versionadded:: 3.0.0
engine : str, default None {e}
* ``'cython'`` : Runs the operation through C-extensions from cython.
* ``'numba'`` : Runs the operation through JIT compiled code from numba.
* ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``
engine_kwargs : dict, default None {ek}
* For ``'cython'`` engine, there are no accepted ``engine_kwargs``
* For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
and ``parallel`` dictionary keys. The values must either be ``True`` or
``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
applied to both the ``func`` and the ``apply`` groupby aggregation.
Returns
-------
Series or DataFrame
Computed {fname} of values within each group.
See Also
--------
SeriesGroupBy.min : Return the min of the group values.
DataFrameGroupBy.min : Return the min of the group values.
SeriesGroupBy.max : Return the max of the group values.
DataFrameGroupBy.max : Return the max of the group values.
SeriesGroupBy.sum : Return the sum of the group values.
DataFrameGroupBy.sum : Return the sum of the group values.
Examples
--------
{example}
"""

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.
@@ -2091,6 +2146,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
def mean(
self,
numeric_only: bool = False,
skipna: bool = True,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
@@ -2106,6 +2162,12 @@ def mean(
numeric_only no longer accepts ``None`` and defaults to ``False``.
skipna : bool, default True
Exclude NA/null values. If an entire group is NA, the result
will be NA.
.. versionadded:: 3.0.0
engine : str, default None
* ``'cython'`` : Runs the operation through C-extensions from cython.
* ``'numba'`` : Runs the operation through JIT compiled code from numba.
@@ -2172,12 +2234,16 @@ def mean(
executor.float_dtype_mapping,
engine_kwargs,
min_periods=0,
skipna=skipna,
)
else:
result = self._cython_agg_general(
"mean",
alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
alt=lambda x: Series(x, copy=False).mean(
numeric_only=numeric_only, skipna=skipna
),
numeric_only=numeric_only,
skipna=skipna,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -2817,10 +2883,11 @@ def size(self) -> DataFrame | Series:

@final
@doc(
_groupby_agg_method_engine_template,
_groupby_agg_method_skipna_engine_template,
fname="sum",
no=False,
mc=0,
s=True,
e=None,
ek=None,
example=dedent(
@@ -2862,6 +2929,7 @@ def sum(
self,
numeric_only: bool = False,
min_count: int = 0,
skipna: bool = True,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
@@ -2873,6 +2941,7 @@
executor.default_dtype_mapping,
engine_kwargs,
min_periods=min_count,
skipna=skipna,
)
else:
# If we are grouping on categoricals we want unobserved categories to
@@ -2884,6 +2953,7 @@
min_count=min_count,
alias="sum",
npfunc=np.sum,
skipna=skipna,
)

return result
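The engine dispatch above means both code paths honour the new parameter; a minimal sketch (requires the optional ``numba`` dependency; not part of the diff):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})
cython_res = df.groupby("key").mean(skipna=False)                 # default engine
numba_res = df.groupby("key").mean(skipna=False, engine="numba")
print(cython_res.equals(numba_res))  # True; NaNs in the same positions compare equal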
17 changes: 17 additions & 0 deletions pandas/tests/groupby/aggregate/test_numba.py
@@ -186,6 +186,23 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["sum", "mean"])
def test_multifunc_numba_vs_cython_frame_noskipna(func):
pytest.importorskip("numba")
data = DataFrame(
{
0: ["a", "a", "b", "b", "a"],
1: [1.0, np.nan, 3.0, 4.0, 5.0],
2: [1, 2, 3, 4, 5],
},
columns=[0, 1, 2],
)
grouped = data.groupby(0)
result = grouped.agg(func, skipna=False, engine="numba")
expected = grouped.agg(func, skipna=False, engine="cython")
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"agg_kwargs,expected_func",
[
10 changes: 8 additions & 2 deletions pandas/tests/groupby/test_api.py
@@ -176,7 +176,10 @@ def test_frame_consistency(groupby_func):
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("mean", "std", "sum", "var"):
elif groupby_func in ("sum", "mean"):
exclude_expected = {"axis", "kwargs"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("std", "var"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("median", "prod", "sem"):
@@ -234,7 +237,10 @@ def test_series_consistency(request, groupby_func):
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("mean", "std", "sum", "var"):
elif groupby_func in ("sum", "mean"):
exclude_expected = {"axis", "kwargs"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("std", "var"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("median", "prod", "sem"):