From 0e4a52101d750717440e987f093c4bc622cb58c3 Mon Sep 17 00:00:00 2001
From: Rehan Durrani
Date: Tue, 7 Jun 2022 14:08:51 -0700
Subject: [PATCH] fix typo and add solution for series + index

Signed-off-by: Rehan Durrani
---
 modin/pandas/dataframe.py         | 29 +++++++++++------
 modin/pandas/test/test_groupby.py | 54 +++++++++++++++++++++++++------
 2 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 79bbd5147c6..4fa2ecdfb3b 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -425,17 +425,21 @@ def groupby(
         # groupby takes place.
         drop = False
         # Check that there is no ambiguity in the parameter we were given.
-        _by_check = by if is_list_like(by) else [by]
-        for k in _by_check:
-            if k in self.index.names and k in self.axes[axis]:
-                level_name, index_name = "an index", "a column"
-                if axis == 1:
-                    level_name, index_name = index_name, level_name
-                raise ValueError(
-                    f"{k} is both {level_name} level and {index_name} label, which is ambiguous."
-                )
+        # We don't need to check if `by` is a Series or Index, since those
+        # won't be referencing labels
+        if not isinstance(by, (pandas.Series, Series, pandas.Index)):
+            _by_check = by if is_list_like(by) else [by]
+            for k in _by_check:
+                if not isinstance(k, (Series, pandas.Series, pandas.Index)):
+                    if k in self.index.names and k in self.axes[axis ^ 1]:
+                        level_name, index_name = "an index", "a column"
+                        if axis == 1:
+                            level_name, index_name = index_name, level_name
+                        raise ValueError(
+                            f"{k} is both {level_name} level and {index_name} label, which is ambiguous."
+                        )
         if (
-            not isinstance(by, (pandas.Series, Series))
+            not isinstance(by, (pandas.Series, Series, pandas.Index))
             and is_list_like(by)
             and len(by) == 1
         ):
@@ -452,6 +456,11 @@ def groupby(
                 level, by = by, None
             elif level is None:
                 by = self.__getitem__(by)._query_compiler
+        elif isinstance(by, (pandas.Series, pandas.Index)):
+            if isinstance(by, pandas.Index) and len(by) != len(self.axes[axis]):
+                raise ValueError("Grouper and axis must be same length")
+            idx_name = by.name
+            by = Series(by)._query_compiler
         elif isinstance(by, Series):
             drop = by._parent is self
             idx_name = by.name
diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
index d7f7445e053..8cc518a3ec3 100644
--- a/modin/pandas/test/test_groupby.py
+++ b/modin/pandas/test/test_groupby.py
@@ -2076,15 +2076,16 @@ def test_by_in_index_and_columns():
         modin_df,
         pandas_df,
         lambda df: df.groupby(by="a").count(),
-        raising_exceptions=True,
-        check_exception_type=True,
     )
     eval_general(
         modin_df,
         pandas_df,
         lambda df: df.groupby(by=["a", "b"]).count(),
-        raising_exceptions=True,
-        check_exception_type=True,
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=[df["b"], "a"]).count(),
     )
     pandas_df = pandas.DataFrame(
         [[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", "b"]), columns=["a", "b", "c"]
@@ -2094,20 +2095,53 @@ def test_by_in_index_and_columns():
         modin_df,
         pandas_df,
         lambda df: df.groupby(by="a").count(),
-        raising_exceptions=True,
-        check_exception_type=True,
     )
     eval_general(
         modin_df,
         pandas_df,
         lambda df: df.groupby(by=["a", "c"]).count(),
-        raising_exceptions=True,
-        check_exception_type=True,
     )
     eval_general(
         modin_df,
         pandas_df,
         lambda df: df.groupby(by=["a", "b"]).count(),
-        raising_exceptions=True,
-        check_exception_type=True,
+    )
+
+
+def test_by_series():
+    pandas_df = pandas.DataFrame(
+        [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"]
+    )
+    modin_df = from_pandas(pandas_df)
+
+    def make_appropriately_typed_series(df, values=["a"]):
+        """Return a Series from either pandas or modin.pandas depending on type of `df`."""
+        if isinstance(df, pd.DataFrame):
+            return pd.Series(values)
+        return pandas.Series(values)
+
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=make_appropriately_typed_series(df)).count(),
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(
+            by=make_appropriately_typed_series(df, ["a", "b"])
+        ).count(),
+    )
+
+
+def test_by_index():
+    pandas_df = pandas.DataFrame(
+        [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"]
+    )
+    modin_df = from_pandas(pandas_df)
+    eval_general(modin_df, pandas_df, lambda df: df.groupby(by=pd.Index(["a"])).count())
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=pd.Index(["a", "b"])).count(),
     )