From 12bd4129062298fc41d784ab196aea9ea3cc9277 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 9 Feb 2024 09:07:14 -0500 Subject: [PATCH 01/10] remove upper pandas limit --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d33e84b8ee..045187c6b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "holidays >= 0.17", "numpy >= 1.21.0", "packaging >= 20.0", - "pandas >= 1.5.0,<2.2.0", + "pandas >= 1.5.0", "psutil >= 5.6.6", "scipy >= 1.10.0", "tqdm >= 4.32.0", From fe16980f81a604612de788a5662778f9553a0e1c Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 9 Feb 2024 15:43:27 -0500 Subject: [PATCH 02/10] Add workaround for nunique bug --- .../feature_set_calculator.py | 15 ++++++++++++ .../test_feature_set_calculator.py | 24 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/featuretools/computational_backends/feature_set_calculator.py b/featuretools/computational_backends/feature_set_calculator.py index d67ce2f7d8..1731dc5f28 100644 --- a/featuretools/computational_backends/feature_set_calculator.py +++ b/featuretools/computational_backends/feature_set_calculator.py @@ -822,6 +822,21 @@ def last_n(df): to_merge = base_frame.groupby(groupby_col).agg(to_agg) else: + # TODO: Remove when https://github.com/pandas-dev/pandas/issues/57317 is fixed + cols_to_fix = [] + for col in base_frame.columns: + dtype = base_frame[col].dtype + if ( + isinstance(dtype, pd.CategoricalDtype) + and str(dtype.categories.dtype) == "int64" + ): + cols_to_fix.append(col) + + if cols_to_fix: + base_frame[cols_to_fix] = base_frame[cols_to_fix].astype( + "int64", + ) + to_merge = base_frame.groupby( base_frame[groupby_col], observed=True, diff --git a/featuretools/tests/computational_backend/test_feature_set_calculator.py b/featuretools/tests/computational_backend/test_feature_set_calculator.py index a0ad533425..68684d203b 100644 --- a/featuretools/tests/computational_backend/test_feature_set_calculator.py +++ b/featuretools/tests/computational_backend/test_feature_set_calculator.py @@ -40,6 +40,7 @@ Trend, ) from featuretools.primitives.base import AggregationPrimitive +from featuretools.primitives.standard.aggregation.num_unique import NumUnique from featuretools.tests.testing_utils import backward_path, to_pandas from featuretools.utils import Trie from featuretools.utils.gen_utils import Library, import_or_none, is_instance @@ -1293,3 +1294,26 @@ def error(s): # Calculating without precalculated features should error. with pytest.raises(RuntimeError, match=error_msg): FeatureSetCalculator(pd_es, feature_set=FeatureSet([direct])).run(instance_ids) + + +def test_nunique_nested_with_agg_bug(pd_es): + """Pandas 2.2.0 has a bug where pd.Series.nunique produces columns with + the category dtype instead of int64 dtype, causing an error when we attempt + another aggregation""" + num_unique_feature = AggregationFeature( + Feature(pd_es["log"].ww["priority_level"]), + "sessions", + primitive=NumUnique, + ) + + mean_nunique_feature = AggregationFeature( + num_unique_feature, + "customers", + primitive=Mean, + ) + feature_set = FeatureSet([mean_nunique_feature]) + calculator = FeatureSetCalculator(pd_es, time_last=None, feature_set=feature_set) + df = calculator.run(np.array([0])) + df = to_pandas(df, index="id") + + assert df.iloc[0, 0].round(4) == 1.6667 From 66723c3962f308d1f59535733df98cca6c63f6f6 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 9 Feb 2024 15:47:41 -0500 Subject: [PATCH 03/10] Add release note --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 928465bf0b..fa442921c0 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -8,6 +8,7 @@ Future Release * Enhancements * Fixes * Fix dependency issues (:pr:`2644`, :pr:`2656`) + * Add workaround for pandas 2.2.0 bug with nunique and unpin pandas (:pr:`2657`) * Changes * Documentation Changes * Testing Changes From ce8cce52b902e0b0aedde295fd489f688a93798d Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 9 Feb 2024 16:08:36 -0500 Subject: [PATCH 04/10] update test with woodwork main yml --- .github/workflows/tests_with_woodwork_main_branch.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests_with_woodwork_main_branch.yaml b/.github/workflows/tests_with_woodwork_main_branch.yaml index 4e71077a75..772a8e82f0 100644 --- a/.github/workflows/tests_with_woodwork_main_branch.yaml +++ b/.github/workflows/tests_with_woodwork_main_branch.yaml @@ -62,7 +62,7 @@ jobs: slack_alert_failure: name: Send Slack alert if failure - needs: unit_tests_woodwork_main + needs: tests_woodwork_main runs-on: ubuntu-latest if: ${{ always() }} steps: From 7d88d6a0fd05826df3beaa5b0b62ac6a154224cc Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 9 Feb 2024 16:15:42 -0500 Subject: [PATCH 05/10] update python version in ww main test --- .github/workflows/tests_with_woodwork_main_branch.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests_with_woodwork_main_branch.yaml b/.github/workflows/tests_with_woodwork_main_branch.yaml index 772a8e82f0..a27e8ddc0f 100644 --- a/.github/workflows/tests_with_woodwork_main_branch.yaml +++ b/.github/workflows/tests_with_woodwork_main_branch.yaml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: true matrix: - python_version: ["3.8", "3.9", "3.10"] + python_version: ["3.9", "3.10", "3.11"] libraries: ["core", "spark - misc", "spark - computational", "spark - entityset_1", "spark - entityset_2", "spark - primitives"] steps: @@ -67,7 +67,7 @@ jobs: if: ${{ always() }} steps: - name: Send Slack alert if failure - if: ${{ needs.unit_tests_woodwork_main.result != 'success' }} + if: ${{ needs.unit_tests_woodwork_main.result != 'success' }} id: slack uses: slackapi/slack-github-action@v1 with: From 8c00c1e5da9cdb9da09769ced94dd58f708650f6 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 9 Feb 2024 16:20:22 -0500 Subject: [PATCH 06/10] fix accidental yaml change --- .github/workflows/tests_with_woodwork_main_branch.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests_with_woodwork_main_branch.yaml b/.github/workflows/tests_with_woodwork_main_branch.yaml index a27e8ddc0f..1fdfa47f11 100644 --- a/.github/workflows/tests_with_woodwork_main_branch.yaml +++ b/.github/workflows/tests_with_woodwork_main_branch.yaml @@ -67,7 +67,7 @@ jobs: if: ${{ always() }} steps: - name: Send Slack alert if failure - if: ${{ needs.unit_tests_woodwork_main.result != 'success' }} + if: ${{ needs.unit_tests_woodwork_main.result != 'success' }} id: slack uses: slackapi/slack-github-action@v1 with: From d9219b99c20b56124085da0649b3962e08bffa87 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 9 Feb 2024 16:58:16 -0500 Subject: [PATCH 07/10] Just use nunique string in primitive as fix --- .../feature_set_calculator.py | 16 ---------------- .../standard/aggregation/num_unique.py | 6 +----- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/featuretools/computational_backends/feature_set_calculator.py b/featuretools/computational_backends/feature_set_calculator.py index 1731dc5f28..26837ca8d3 100644 --- a/featuretools/computational_backends/feature_set_calculator.py +++ b/featuretools/computational_backends/feature_set_calculator.py @@ -820,23 +820,7 @@ def last_n(df): # work) if is_instance(base_frame, (dd, ps), "DataFrame"): to_merge = base_frame.groupby(groupby_col).agg(to_agg) - else: - # TODO: Remove when https://github.com/pandas-dev/pandas/issues/57317 is fixed - cols_to_fix = [] - for col in base_frame.columns: - dtype = base_frame[col].dtype - if ( - isinstance(dtype, pd.CategoricalDtype) - and str(dtype.categories.dtype) == "int64" - ): - cols_to_fix.append(col) - - if cols_to_fix: - base_frame[cols_to_fix] = base_frame[cols_to_fix].astype( - "int64", - ) - to_merge = base_frame.groupby( base_frame[groupby_col], observed=True, diff --git a/featuretools/primitives/standard/aggregation/num_unique.py b/featuretools/primitives/standard/aggregation/num_unique.py index b0021e5dea..4b03f333e7 100644 --- a/featuretools/primitives/standard/aggregation/num_unique.py +++ b/featuretools/primitives/standard/aggregation/num_unique.py @@ -1,4 +1,3 @@ -import pandas as pd from woodwork.column_schema import ColumnSchema from woodwork.logical_types import IntegerNullable @@ -51,7 +50,4 @@ def finalize(s): return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize) - elif agg_type == Library.SPARK: - return "nunique" - - return pd.Series.nunique + return "nunique" From 05c8f21b371eda63aafbfed764e4b5535e8a9929 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Mon, 12 Feb 2024 11:07:15 -0500 Subject: [PATCH 08/10] fix docstring failure by allowing function to be used via param to primitive --- .../standard/aggregation/num_unique.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/featuretools/primitives/standard/aggregation/num_unique.py b/featuretools/primitives/standard/aggregation/num_unique.py index 4b03f333e7..dcba481803 100644 --- a/featuretools/primitives/standard/aggregation/num_unique.py +++ b/featuretools/primitives/standard/aggregation/num_unique.py @@ -1,3 +1,4 @@ +import pandas as pd from woodwork.column_schema import ColumnSchema from woodwork.logical_types import IntegerNullable @@ -10,8 +11,14 @@ class NumUnique(AggregationPrimitive): """Determines the number of distinct values, ignoring `NaN` values. + Args: + use_string_for_pd_calc (bool): Determines if the string 'nunique' or the function + pd.Series.nunique is used for making the primitive calculation. Put in place to + account for the bug https://github.com/pandas-dev/pandas/issues/57317. + Defaults to using the string. + Examples: - >>> num_unique = NumUnique() + >>> num_unique = NumUnique(use_string_for_pd_calc=False) >>> num_unique(['red', 'blue', 'green', 'yellow']) 4 @@ -28,6 +35,9 @@ class NumUnique(AggregationPrimitive): compatibility = [Library.PANDAS, Library.DASK, Library.SPARK] description_template = "the number of unique elements in {}" + def __init__(self, use_string_for_pd_calc=True): + self.use_string_for_pd_calc = use_string_for_pd_calc + def get_function(self, agg_type=Library.PANDAS): if agg_type == Library.DASK: @@ -50,4 +60,6 @@ def finalize(s): return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize) - return "nunique" + if self.use_string_for_pd_calc: + return "nunique" + return pd.Series.nunique From ef5a6691ba9611bbe9714d6c1bc2c6e4a71f9427 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Mon, 12 Feb 2024 11:13:09 -0500 Subject: [PATCH 09/10] Add release note for ww main tests --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index fa442921c0..585222e1e5 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -9,6 +9,7 @@ Future Release * Fixes * Fix dependency issues (:pr:`2644`, :pr:`2656`) * Add workaround for pandas 2.2.0 bug with nunique and unpin pandas (:pr:`2657`) + * Fix the tests that run with Woodwork main so they can be triggered (:pr:`2657`) * Changes * Documentation Changes * Testing Changes From 7286c078c9dda7d1cf3fc64fe604dcf2c7be3d97 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Mon, 12 Feb 2024 11:38:48 -0500 Subject: [PATCH 10/10] move testing release note --- docs/source/release_notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 585222e1e5..f1cf6afe37 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -9,7 +9,6 @@ Future Release * Fixes * Fix dependency issues (:pr:`2644`, :pr:`2656`) * Add workaround for pandas 2.2.0 bug with nunique and unpin pandas (:pr:`2657`) - * Fix the tests that run with Woodwork main so they can be triggered (:pr:`2657`) * Changes * Documentation Changes * Testing Changes @@ -17,6 +16,7 @@ Future Release * Update ruff to 0.1.6 and use ruff linter/formatter (:pr:`2639`) * Update ``release.yaml`` to use trusted publisher for PyPI releases (:pr:`2646`, :pr:`2653`, :pr:`2654`) * Update dependency checkers and tests to include Dask (:pr:`2658`) + * Fix the tests that run with Woodwork main so they can be triggered (:pr:`2657`) Thanks to the following people for contributing to this release: