diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 47838d1e49d61..9fd0c112c98a5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -55,6 +55,7 @@ Other enhancements - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) +- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 34367f55d2bbb..e3909203d1f5a 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -66,6 +66,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_prod( out: np.ndarray, # int64float_t[:, ::1] @@ -115,6 +116,7 @@ def group_mean( is_datetimelike: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_ohlc( out: np.ndarray, # floatingintuint_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 59bc59135a8ff..fd288dff01f32 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -700,13 +700,14 @@ def group_sum( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 using Kahan summation """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - sum_t val, t, y + sum_t val, t, y, nan_val sum_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) @@ -722,6 +723,15 @@ def group_sum( compensation = np.zeros((out).shape, dtype=(out).base.dtype) N, K = (values).shape + if uses_mask: + nan_val = 0 + elif is_datetimelike: + nan_val = NPY_NAT + elif sum_t is int64_t or sum_t is uint64_t: + # This has no effect as int64 can't be nan. Setting to 0 to avoid type error + nan_val = 0 + else: + nan_val = NAN with nogil(sum_t is not object): for i in range(N): @@ -734,6 +744,16 @@ def group_sum( for j in range(K): val = values[i, j] + if not skipna and ( + (uses_mask and result_mask[lab, j]) or + (is_datetimelike and sumx[lab, j] == NPY_NAT) or + _treat_as_na(sumx[lab, j], False) + ): + # If sum is already NA, don't add to it. This is important for + # datetimelikebecause adding a value to NPY_NAT may not result + # in a NPY_NAT + continue + if uses_mask: isna_entry = mask[i, j] else: @@ -765,6 +785,11 @@ def group_sum( # because of no gil compensation[lab, j] = 0 sumx[lab, j] = t + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx @@ -1100,6 +1125,7 @@ def group_mean( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """ Compute the mean per label given a label assignment for each value. @@ -1125,6 +1151,8 @@ def group_mean( Mask of the input values. result_mask : ndarray[bool, ndim=2], optional Mask of the out array + skipna : bool, optional + If True, ignore nans in `values`. Notes ----- @@ -1168,6 +1196,16 @@ def group_mean( for j in range(K): val = values[i, j] + if not skipna and ( + (uses_mask and result_mask[lab, j]) or + (is_datetimelike and sumx[lab, j] == NPY_NAT) or + _treat_as_na(sumx[lab, j], False) + ): + # If sum is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in NPY_NAT + continue + if uses_mask: isna_entry = mask[i, j] elif is_datetimelike: @@ -1191,6 +1229,14 @@ def group_mean( # because of no gil compensation[lab, j] = 0. sumx[lab, j] = t + elif not skipna: + # Set the nobs to 0 so that in case of datetimelike, + # dividing NPY_NAT by nobs may not result in a NPY_NAT + nobs[lab, j] = 0 + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = nan_val for i in range(ncounts): for j in range(K): diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index cc10bd003af7e..2b59ea2fe12a5 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -169,9 +169,10 @@ def grouped_mean( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, ) -> tuple[np.ndarray, list[int]]: output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index 76f4e22b43c4b..9f2e9541b31d0 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -165,6 +165,7 @@ def grouped_kahan_sum( result_dtype: np.dtype, labels: npt.NDArray[np.intp], ngroups: int, + skipna: bool, ) -> tuple[ np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray ]: @@ -180,7 +181,15 @@ def grouped_kahan_sum( lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or np.isnan(output[lab]): + continue + + if not skipna and np.isnan(val): + output[lab] = np.nan + nobs_arr[lab] += 1 + comp_arr[lab] = np.nan + consecutive_counts[lab] = 1 + prev_vals[lab] = np.nan continue sum_x = output[lab] @@ -219,11 +228,12 @@ def grouped_sum( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, ) -> tuple[np.ndarray, list[int]]: na_pos = [] output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f4ba40e275a8d..f9059e6e8896f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -214,6 +214,61 @@ class providing the base-class of operations. {example} """ +_groupby_agg_method_skipna_engine_template = """ +Compute {fname} of group values. + +Parameters +---------- +numeric_only : bool, default {no} + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + +min_count : int, default {mc} + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + +skipna : bool, default {s} + Exclude NA/null values. If the entire group is NA and ``skipna`` is + ``True``, the result will be NA. + + .. versionchanged:: 3.0.0 + +engine : str, default None {e} + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None {ek} + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` groupby aggregation. + +Returns +------- +Series or DataFrame + Computed {fname} of values within each group. + +See Also +-------- +SeriesGroupBy.min : Return the min of the group values. +DataFrameGroupBy.min : Return the min of the group values. +SeriesGroupBy.max : Return the max of the group values. +DataFrameGroupBy.max : Return the max of the group values. +SeriesGroupBy.sum : Return the sum of the group values. +DataFrameGroupBy.sum : Return the sum of the group values. + +Examples +-------- +{example} +""" + _pipe_template = """ Apply a ``func`` with arguments to this %(klass)s object and return its result. @@ -2091,6 +2146,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def mean( self, numeric_only: bool = False, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2106,6 +2162,12 @@ def mean( numeric_only no longer accepts ``None`` and defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + engine : str, default None * ``'cython'`` : Runs the operation through C-extensions from cython. * ``'numba'`` : Runs the operation through JIT compiled code from numba. @@ -2172,12 +2234,16 @@ def mean( executor.float_dtype_mapping, engine_kwargs, min_periods=0, + skipna=skipna, ) else: result = self._cython_agg_general( "mean", - alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).mean( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @@ -2817,10 +2883,11 @@ def size(self) -> DataFrame | Series: @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="sum", no=False, mc=0, + s=True, e=None, ek=None, example=dedent( @@ -2862,6 +2929,7 @@ def sum( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2873,6 +2941,7 @@ def sum( executor.default_dtype_mapping, engine_kwargs, min_periods=min_count, + skipna=skipna, ) else: # If we are grouping on categoricals we want unobserved categories to @@ -2884,6 +2953,7 @@ def sum( min_count=min_count, alias="sum", npfunc=np.sum, + skipna=skipna, ) return result diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index 15c1efe5fd1ff..ca265a1d1108b 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -186,6 +186,23 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("func", ["sum", "mean"]) +def test_multifunc_numba_vs_cython_frame_noskipna(func): + pytest.importorskip("numba") + data = DataFrame( + { + 0: ["a", "a", "b", "b", "a"], + 1: [1.0, np.nan, 3.0, 4.0, 5.0], + 2: [1, 2, 3, 4, 5], + }, + columns=[0, 1, 2], + ) + grouped = data.groupby(0) + result = grouped.agg(func, skipna=False, engine="numba") + expected = grouped.agg(func, skipna=False, engine="cython") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "agg_kwargs,expected_func", [ diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index baec3ed1a5024..cc69de2581a79 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -176,7 +176,10 @@ def test_frame_consistency(groupby_func): elif groupby_func in ("max", "min"): exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("mean", "std", "sum", "var"): + elif groupby_func in ("sum", "mean"): + exclude_expected = {"axis", "kwargs"} + exclude_result = {"engine", "engine_kwargs"} + elif groupby_func in ("std", "var"): exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): @@ -234,7 +237,10 @@ def test_series_consistency(request, groupby_func): elif groupby_func in ("max", "min"): exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("mean", "std", "sum", "var"): + elif groupby_func in ("sum", "mean"): + exclude_expected = {"axis", "kwargs"} + exclude_result = {"engine", "engine_kwargs"} + elif groupby_func in ("std", "var"): exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index a17200c123d22..1db12f05e821f 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -422,6 +422,98 @@ def test_mean_on_timedelta(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "values, dtype, result_dtype", + [ + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64", "float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Float64", "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Int64", "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]", "timedelta64[ns]"), + ( + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ], +) +def test_mean_skipna(values, dtype, result_dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the result_dtype because + # Series.mean() changes the dtype to float64/object depending on the input dtype + expected = ( + df.groupby("cat")["val"] + .apply(lambda x: x.mean(skipna=skipna)) + .astype(result_dtype) + ) + result = df.groupby("cat")["val"].mean(skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Int64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]"), + ], +) +def test_sum_skipna(values, dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the original dtype because + # Series.sum() changes the dtype + expected = ( + df.groupby("cat")["val"].apply(lambda x: x.sum(skipna=skipna)).astype(dtype) + ) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + +def test_sum_skipna_object(skipna): + # GH#15675 + df = DataFrame( + { + "val": ["a", "b", np.nan, "d", "e", "f", "g", "h", "i", "j"], + "cat": ["A", "B"] * 5, + } + ).astype({"val": object}) + if skipna: + expected = Series( + ["aegi", "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + else: + expected = Series( + [np.nan, "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cython_median(): arr = np.random.default_rng(2).standard_normal(1000) arr[::2] = np.nan @@ -1128,8 +1220,8 @@ def test_regression_allowlist_methods(op, skipna, sort): grouped = frame.groupby(level=0, sort=sort) - if op in ["skew", "kurt"]: - # skew and kurt have skipna + if op in ["skew", "kurt", "sum", "mean"]: + # skew, kurt, sum, mean have skipna result = getattr(grouped, op)(skipna=skipna) expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(skipna=skipna)) if sort: