modin-project · devin-petersohn · Jun 2, 2022 · Jun 2, 2022 · Jun 2, 2022 · Jun 3, 2022
@@ -29,6 +29,7 @@ Key Features and Updates
   * FIX-#4503: Stop the memory logging thread after session exit (#4515)
   * FIX-#4531: Fix a makedirs race condition in to_parquet (#4533)
   * FIX-#4464: Refactor Ray utils and quick fix groupby.count failing on virtual partitions (#4490)
+  * FIX-#4522: Correct multiindex metadata with groupby (#4523)
   * FIX-#4436: Fix to_pydatetime dtype for timezone None (#4437)
 * Performance enhancements
   * FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346)

@@ -133,6 +133,14 @@ def map(
                     axis=1,
                 )
                 other = list(other.columns)
+                # GH#4522: Vile as this may be, it is necessary to avoid the case where we are
+                # grouping by columns that were recently added to the data via
+                # `from_labels`. The internal dataframe doesn't know what to do when
+                # the label matches a column name.
+                # We ensure that the columns, index, and by don't intersect in the API level,
+                # so if we hit this if statement, we know its a result of a deferred re-index.
+                if len(df.columns.intersection(df.index.names)) > 0:
+                    df = df.reset_index(drop=True)
             by_part = other
         else:
             by_part = by

@@ -3461,7 +3461,9 @@ def value_counts(
         Count unique values in the `BasePandasDataset`.
         """
         if subset is None:
-            subset = self._query_compiler.columns
+            # Need to get column names as array rather than as Index, since `groupby` does not
+            # treat `Index` arguments to `by` as a list of labels.
+            subset = self._query_compiler.columns.values
         counted_values = self.groupby(by=subset, dropna=dropna, observed=True).size()
         if sort:
             counted_values.sort_values(ascending=ascending, inplace=True)

@@ -424,9 +424,22 @@ def groupby(
         # strings is passed in, the data used for the groupby is dropped before the
         # groupby takes place.
         drop = False
-
+        # Check that there is no ambiguity in the parameter we were given.
+        # We don't need to check if `by` is a Series or Index, since those
+        # won't be referencing labels
+        if not isinstance(by, (pandas.Series, Series, pandas.Index)):
+            _by_list = by if is_list_like(by) else [by]
+            for k in _by_list:
+                if not isinstance(k, (Series, pandas.Series, pandas.Index)):
+                    if k in self.index.names and k in self.axes[axis ^ 1]:
+                        level_name, index_name = "an index", "a column"
+                        if axis == 1:
+                            level_name, index_name = index_name, level_name
+                        raise ValueError(
+                            f"{k} is both {level_name} level and {index_name} label, which is ambiguous."
+                        )
         if (
-            not isinstance(by, (pandas.Series, Series))
+            not isinstance(by, (pandas.Series, Series, pandas.Index))
             and is_list_like(by)
             and len(by) == 1
         ):
@@ -443,6 +456,11 @@ def groupby(
                 level, by = by, None
             elif level is None:
                 by = self.__getitem__(by)._query_compiler
+        elif isinstance(by, (pandas.Series, pandas.Index)):
+            if isinstance(by, pandas.Index) and len(by) != len(self.axes[axis]):
+                raise ValueError("Grouper and axis must be same length")
+            idx_name = by.name
+            by = Series(by)._query_compiler
         elif isinstance(by, Series):
             drop = by._parent is self
             idx_name = by.name

@@ -1570,7 +1570,7 @@ def test_agg_exceptions(operation):
         },
     ],
 )
-def test_to_pandas_convertion(kwargs):
+def test_to_pandas_conversion(kwargs):
     data = {"a": [1, 2], "b": [3, 4], "c": [5, 6]}
     by = ["a", "b"]
 
@@ -2032,3 +2032,126 @@ def test_sum_with_level():
     }
     modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
     eval_general(modin_df, pandas_df, lambda df: df.set_index("C").groupby("C").sum())
+
+
+def test_reset_index_groupby():
+    # Due to `reset_index` deferring the actual reindexing of partitions,
+    # when we call groupby after a `reset_index` with a `by` column name
+    # that was moved from the index to the columns via `from_labels` the
+    # algebra layer incorrectly thinks that the `by` key is duplicated
+    # across both the columns and labels, and fails, when it should
+    # succeed. We have this test to ensure that that case is correctly
+    # handled, and passes. For more details, checkout
+    # https://github.com/modin-project/modin/issues/4522.
+    frame_data = np.random.randint(97, 198, size=(2**6, 2**4))
+    pandas_df = pandas.DataFrame(
+        frame_data,
+        index=pandas.MultiIndex.from_tuples(
+            [(i // 4, i // 2, i) for i in range(2**6)]
+        ),
+    ).add_prefix("col")
+    pandas_df.index.names = [f"index_{i}" for i in range(len(pandas_df.index.names))]
+    # Convert every even column to string
+    for col in pandas_df.iloc[
+        :, [i for i in range(len(pandas_df.columns)) if i % 2 == 0]
+    ]:
+        pandas_df[col] = [str(chr(i)) for i in pandas_df[col]]
+    # The `pandas_df` contains a multi-index with 3 levels, named `index_0`, `index_1`,
+    # and `index_2`, and 16 columns, named `col0` through `col15`. Every even column
+    # has dtype `str`, while odd columns have dtype `int64`.
+    modin_df = from_pandas(pandas_df)
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.reset_index().groupby(["index_0", "index_1"]).count(),
+    )
+
+
+def test_by_in_index_and_columns():
+    pandas_df = pandas.DataFrame(
+        [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"]
+    )
+    modin_df = from_pandas(pandas_df)
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by="a").count(),
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=["a", "b"]).count(),
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=[df["b"], "a"]).count(),
+    )
+    pandas_df = pandas.DataFrame(
+        [[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", "b"]), columns=["a", "b", "c"]
+    )
+    modin_df = from_pandas(pandas_df)
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by="a").count(),
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=["a", "c"]).count(),
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=["a", "b"]).count(),
+    )
+
+
+def test_by_series():
+    pandas_df = pandas.DataFrame(
+        [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"]
+    )
+    modin_df = from_pandas(pandas_df)
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=pandas.Series(["a"])).count(),
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=pandas.Series(["a", "b"])).count(),
+    )
+
+    def make_appropriately_typed_series(df, values=["a"]):
+        """Return a Series from either pandas or modin.pandas depending on type of `df`."""
+        if isinstance(df, pd.DataFrame):
+            return pd.Series(values)
+        return pandas.Series(values)
+
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=make_appropriately_typed_series(df)).count(),
+    )
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(
+            by=make_appropriately_typed_series(df, ["a", "b"])
+        ).count(),
+    )
+
+
+def test_by_index():
+    pandas_df = pandas.DataFrame(
+        [[1, 2, 3]], index=pd.Index([0], name="a"), columns=["a", "b", "c"]
+    )
+    modin_df = from_pandas(pandas_df)
+    eval_general(modin_df, pandas_df, lambda df: df.groupby(by=pd.Index(["a"])).count())
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.groupby(by=pd.Index(["a", "b"])).count(),
+    )