From b78a5f2974d04c40e0f738069f0e14eff36f62fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Dock=C3=A8s?= Date: Thu, 1 Aug 2024 15:28:16 +0200 Subject: [PATCH] no "index" column in aggtarget output (#1020) --- CHANGES.rst | 5 +++++ skrub/_agg_joiner.py | 1 - skrub/_dataframe/_pandas.py | 4 ++-- skrub/_dataframe/tests/test_pandas.py | 5 +++-- skrub/tests/test_agg_joiner.py | 1 - 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 863bcfeaa..97632ca44 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -34,6 +34,11 @@ Minor changes duplicate column names, now the output names are always the same. :pr:`1013` by :user:`Jérôme Dockès <jeromedockes>`. +* In some cases :class:`AggJoiner` and :class:`AggTarget` inserted a column in + the output named "index" containing the pandas index of the auxiliary table. + This has been corrected. + :pr:`1020` by :user:`Jérôme Dockès <jeromedockes>`. + Release 0.2.0 ============= diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py index a63f95726..ad3fcb2aa 100644 --- a/skrub/_agg_joiner.py +++ b/skrub/_agg_joiner.py @@ -447,7 +447,6 @@ def fit_transform(self, X, y): y_[self.main_key_] = X[self.main_key_] num_operations, categ_operations = split_num_categ_operations(self.operation_) - self.y_ = skrub_px.aggregate( y_, key=self.main_key_, diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py index db21a8c28..9280f1fcc 100644 --- a/skrub/_dataframe/_pandas.py +++ b/skrub/_dataframe/_pandas.py @@ -70,7 +70,7 @@ def aggregate( named_agg = {**num_named_agg, **categ_named_agg} if named_agg: - base_group = table.groupby(key).agg(**named_agg) + base_group = table.groupby(key).agg(**named_agg).reset_index(drop=False) else: base_group = None @@ -104,7 +104,7 @@ def aggregate( ] sorted_cols = sorted(base_group.columns) - return base_group[sorted_cols].reset_index(drop=False) + return base_group[sorted_cols] def get_named_agg(table, cols, operations): diff --git a/skrub/_dataframe/tests/test_pandas.py 
b/skrub/_dataframe/tests/test_pandas.py index 25ed23223..06db7f1ea 100644 --- a/skrub/_dataframe/tests/test_pandas.py +++ b/skrub/_dataframe/tests/test_pandas.py @@ -30,6 +30,7 @@ def test_simple_agg(): "rating_mean": ("rating", "mean"), } expected = main.groupby("movieId").agg(**aggfunc).reset_index() + expected = expected.loc[:, sorted(expected.columns)] assert_frame_equal(aggregated, expected) @@ -49,7 +50,7 @@ def test_value_counts_agg(): "rating_4.0_user": [3.0, 1.0], "userId": [1, 2], } - ).reset_index(drop=False) + ) assert_frame_equal(aggregated, expected) aggregated = aggregate( @@ -66,7 +67,7 @@ def test_value_counts_agg(): "rating_(3.0, 4.0]_user": [3, 1], "userId": [1, 2], } - ).reset_index(drop=False) + ) assert_frame_equal(aggregated, expected) diff --git a/skrub/tests/test_agg_joiner.py b/skrub/tests/test_agg_joiner.py index f37a57661..b2a613785 100644 --- a/skrub/tests/test_agg_joiner.py +++ b/skrub/tests/test_agg_joiner.py @@ -400,7 +400,6 @@ def test_agg_target(main_table, y, col_name): "movieId": [1, 3, 6, 318, 6, 1704], "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], - "index": [0, 0, 0, 1, 1, 1], f"{col_name}_(1.999, 3.0]_user": [0, 0, 0, 2, 2, 2], f"{col_name}_(3.0, 4.0]_user": [3, 3, 3, 1, 1, 1], f"{col_name}_2.0_user": [0.0, 0.0, 0.0, 1.0, 1.0, 1.0],