From c5110c3176b797d654647e81f7254a51b02e1181 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 16:42:18 +0200
Subject: [PATCH 01/15] Skip test_ffill_limit_area test added in Pandas 2.2

---
 spatialpandas/tests/test_fixedextensionarray.py | 3 +++
 spatialpandas/tests/test_listextensionarray.py  | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/spatialpandas/tests/test_fixedextensionarray.py b/spatialpandas/tests/test_fixedextensionarray.py
index 6261b33..b0d3e3f 100644
--- a/spatialpandas/tests/test_fixedextensionarray.py
+++ b/spatialpandas/tests/test_fixedextensionarray.py
@@ -276,6 +276,9 @@ def test_fillna_series(self):
     def test_fillna_series_method(self):
         pass
 
+    @pytest.mark.skip(reason="not implemented")
+    def test_ffill_limit_area(self):
+        pass
 
 class TestGeometryReshaping(eb.BaseReshapingTests):
     @pytest.mark.skip(reason="__setitem__ not supported")
diff --git a/spatialpandas/tests/test_listextensionarray.py b/spatialpandas/tests/test_listextensionarray.py
index 4f55c12..bfe7606 100644
--- a/spatialpandas/tests/test_listextensionarray.py
+++ b/spatialpandas/tests/test_listextensionarray.py
@@ -275,6 +275,9 @@ def test_fillna_series(self):
     def test_fillna_series_method(self):
         pass
 
+    @pytest.mark.skip(reason="not implemented")
+    def test_ffill_limit_area(self):
+        pass
 
 
 class TestGeometryReshaping(eb.BaseReshapingTests):

From 75174105a83a1b7b2176fdfb0ed4f34b739a2ad4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 16:42:28 +0200
Subject: [PATCH 02/15] Remove pandas pin

---
 pixi.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pixi.toml b/pixi.toml
index 5932f4e..c8b0d20 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -24,7 +24,7 @@ lint = ["py311", "lint"]
 dask-core = "*"
 fsspec = "*"
 numba = "*"
-pandas = "<2.2" # FIX: Temporary upper pin
+pandas = "*"
 pip = "*"
 pyarrow = ">=10,<15" # FIX: Temporary upper pin
 retrying = "*"

From 36ef36cb397dd319dbc569d0a8dce5964f9d9076 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 17:00:48 +0200
Subject: [PATCH 03/15] Implement _constructor_*_from_mgr

---
 spatialpandas/geodataframe.py | 13 +++++++++++++
 spatialpandas/geoseries.py    | 22 +++++-----------------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/spatialpandas/geodataframe.py b/spatialpandas/geodataframe.py
index 84ded98..304091f 100644
--- a/spatialpandas/geodataframe.py
+++ b/spatialpandas/geodataframe.py
@@ -63,10 +63,23 @@ def __init__(self, data=None, index=None, geometry=None, **kwargs):
     def _constructor(self):
         return _MaybeGeoDataFrame
 
+    def _constructor_from_mgr(self, mgr, axes):
+        if not any(isinstance(block.dtype, GeometryDtype) for block in mgr.blocks):
+            return pd.DataFrame._from_mgr(mgr, axes)
+
+        return GeoDataFrame._from_mgr(mgr, axes)
+
     @property
     def _constructor_sliced(self):
         return _MaybeGeoSeries
 
+    def _constructor_sliced_from_mgr(self, mgr, axes):
+        is_row_proxy = mgr.index.is_(self.columns)
+
+        if isinstance(mgr.blocks[0].dtype, GeometryDtype) and not is_row_proxy:
+            return GeoSeries._from_mgr(mgr, axes)
+        return pd.Series._from_mgr(mgr, axes)
+
     def set_geometry(self, geometry, inplace=False):
         if (geometry not in self
                 or not isinstance(self[geometry].dtype, GeometryDtype)):
diff --git a/spatialpandas/geoseries.py b/spatialpandas/geoseries.py
index 80a06db..0d60856 100644
--- a/spatialpandas/geoseries.py
+++ b/spatialpandas/geoseries.py
@@ -1,5 +1,4 @@
 import pandas as pd
-from packaging.version import Version
 
 from .geometry import GeometryDtype, Geometry
 
@@ -36,24 +35,13 @@ def __init__(self, data, index=None, name=None, dtype=None, **kwargs):
         super().__init__(data, index=index, name=name, **kwargs)
 
     def _constructor_from_mgr(self, mgr, axes):
-        if Version(pd.__version__) < Version('2.1'):
-            return super()._constructor_from_mgr(mgr, axes)
+        from pandas.core.internals import SingleBlockManager
+        assert isinstance(mgr, SingleBlockManager)
 
-        # Copied from pandas.Series._constructor_from_mgr
-        # https://github.com/pandas-dev/pandas/blob/80a1a8bc3e07972376284ffce425a2abd1e58604/pandas/core/series.py#L582-L590
-        # Changed if statement to use GeoSeries instead of Series.
-        # REF: https://github.com/pandas-dev/pandas/pull/52132
-        # REF: https://github.com/pandas-dev/pandas/pull/53871
+        if not isinstance(mgr.blocks[0].dtype, GeometryDtype):
+            return pd.Series._from_mgr(mgr, axes)
 
-        ser = self._from_mgr(mgr, axes=axes)
-        ser._name = None  # caller is responsible for setting real name
-
-        if type(self) is GeoSeries:  # Changed line
-            # fastpath avoiding constructor call
-            return ser
-        else:
-            assert axes is mgr.axes
-            return self._constructor(ser, copy=False)
+        return GeoSeries._from_mgr(mgr, axes)
 
     @property
     def _constructor(self):
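Background for patch 03: since pandas 2.1, most DataFrame and Series operations build their results through `_constructor_from_mgr` and `_constructor_sliced_from_mgr` rather than the plain `_constructor` properties, so a subclass that does not override these hooks silently degrades to vanilla pandas objects. The snippet below is a minimal sketch of the behavior the overrides preserve; it assumes a working spatialpandas install and is not part of the patch series.

```python
import pandas as pd

from spatialpandas import GeoDataFrame, GeoSeries
from spatialpandas.geometry import PointArray

gdf = GeoDataFrame({"geometry": PointArray([[0, 0], [1, 1], [2, 2]])})

# head() builds its result from a block manager, so on pandas >= 2.1 it
# routes through _constructor_from_mgr; the override keeps the subclass.
assert isinstance(gdf.head(2), GeoDataFrame)

# Column selection routes through _constructor_sliced_from_mgr and should
# return a GeoSeries for geometry-typed columns.
assert isinstance(gdf["geometry"], GeoSeries)
```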
From 2f4775b2db49b99c29a81220a4247ba20d09284e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 17:48:30 +0200
Subject: [PATCH 04/15] Add workaround for test_unstack

---
 .../tests/test_fixedextensionarray.py | 71 +++++++++++++++++++
 .../tests/test_listextensionarray.py  | 71 +++++++++++++++++++
 2 files changed, 142 insertions(+)

diff --git a/spatialpandas/tests/test_fixedextensionarray.py b/spatialpandas/tests/test_fixedextensionarray.py
index b0d3e3f..b336d1e 100644
--- a/spatialpandas/tests/test_fixedextensionarray.py
+++ b/spatialpandas/tests/test_fixedextensionarray.py
@@ -1,5 +1,7 @@
+import itertools
 import pandas.tests.extension.base as eb
 import pytest
+import pandas as pd
 
 from spatialpandas.geometry import PointArray, PointDtype
 
@@ -288,3 +290,72 @@ def test_ravel(self):
     @pytest.mark.skip(reason="transpose with numpy array elements seems not supported")
     def test_transpose(self):
         pass
+
+    # NOTE: this test is copied from pandas/tests/extension/base/reshaping.py
+    # because starting with pandas 3.0 the assert_frame_equal is strict regarding
+    # the exact missing value (None vs NaN)
+    # Our `result` uses None, but the way the `expected` is created results in
+    # NaNs (and specifying to use None as fill value in unstack also does not
+    # help)
+    # -> the only real change compared to the upstream test is marked
+    # Change from: https://github.com/geopandas/geopandas/pull/3234
+    @pytest.mark.parametrize(
+        "index",
+        [
+            # Two levels, uniform.
+            pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
+            # non-uniform
+            pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
+            # three levels, non-uniform
+            pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
+            pd.MultiIndex.from_tuples(
+                [
+                    ("A", "a", 1),
+                    ("A", "b", 0),
+                    ("A", "a", 0),
+                    ("B", "a", 0),
+                    ("B", "c", 1),
+                ]
+            ),
+        ],
+    )
+    @pytest.mark.parametrize("obj", ["series", "frame"])
+    def test_unstack(self, data, index, obj):
+        data = data[: len(index)]
+        if obj == "series":
+            ser = pd.Series(data, index=index)
+        else:
+            ser = pd.DataFrame({"A": data, "B": data}, index=index)
+
+        n = index.nlevels
+        levels = list(range(n))
+        # [0, 1, 2]
+        # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
+        combinations = itertools.chain.from_iterable(
+            itertools.permutations(levels, i) for i in range(1, n)
+        )
+
+        for level in combinations:
+            result = ser.unstack(level=level)
+            assert all(
+                isinstance(result[col].array, type(data)) for col in result.columns
+            )
+
+            if obj == "series":
+                # We should get the same result with to_frame+unstack+droplevel
+                df = ser.to_frame()
+
+                alt = df.unstack(level=level).droplevel(0, axis=1)
+                pd.testing.assert_frame_equal(result, alt)
+
+            obj_ser = ser.astype(object)
+
+            expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
+            if obj == "series":
+                assert (expected.dtypes == object).all()  # noqa: E721
+            # <------------ next line is added
+            expected[expected.isna()] = None
+            # ------------->
+
+            result = result.astype(object)
+            pd.testing.assert_frame_equal(result, expected)
diff --git a/spatialpandas/tests/test_listextensionarray.py b/spatialpandas/tests/test_listextensionarray.py
index bfe7606..fe2d857 100644
--- a/spatialpandas/tests/test_listextensionarray.py
+++ b/spatialpandas/tests/test_listextensionarray.py
@@ -1,5 +1,7 @@
+import itertools
 import pandas.tests.extension.base as eb
 import pytest
+import pandas as pd
 
 from spatialpandas.geometry import LineArray, LineDtype
 
@@ -288,3 +290,72 @@ def test_ravel(self):
     @pytest.mark.skip(reason="transpose with numpy array elements seems not supported")
     def test_transpose(self):
         pass
+
+    # NOTE: this test is copied from pandas/tests/extension/base/reshaping.py
+    # because starting with pandas 3.0 the assert_frame_equal is strict regarding
+    # the exact missing value (None vs NaN)
+    # Our `result` uses None, but the way the `expected` is created results in
+    # NaNs (and specifying to use None as fill value in unstack also does not
+    # help)
+    # -> the only real change compared to the upstream test is marked
+    # Change from: https://github.com/geopandas/geopandas/pull/3234
+    @pytest.mark.parametrize(
+        "index",
+        [
+            # Two levels, uniform.
+            pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
+            # non-uniform
+            pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
+            # three levels, non-uniform
+            pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
+            pd.MultiIndex.from_tuples(
+                [
+                    ("A", "a", 1),
+                    ("A", "b", 0),
+                    ("A", "a", 0),
+                    ("B", "a", 0),
+                    ("B", "c", 1),
+                ]
+            ),
+        ],
+    )
+    @pytest.mark.parametrize("obj", ["series", "frame"])
+    def test_unstack(self, data, index, obj):
+        data = data[: len(index)]
+        if obj == "series":
+            ser = pd.Series(data, index=index)
+        else:
+            ser = pd.DataFrame({"A": data, "B": data}, index=index)
+
+        n = index.nlevels
+        levels = list(range(n))
+        # [0, 1, 2]
+        # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
+        combinations = itertools.chain.from_iterable(
+            itertools.permutations(levels, i) for i in range(1, n)
+        )
+
+        for level in combinations:
+            result = ser.unstack(level=level)
+            assert all(
+                isinstance(result[col].array, type(data)) for col in result.columns
+            )
+
+            if obj == "series":
+                # We should get the same result with to_frame+unstack+droplevel
+                df = ser.to_frame()
+
+                alt = df.unstack(level=level).droplevel(0, axis=1)
+                pd.testing.assert_frame_equal(result, alt)
+
+            obj_ser = ser.astype(object)
+
+            expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
+            if obj == "series":
+                assert (expected.dtypes == object).all()  # noqa: E721
+            # <------------ next line is added
+            expected[expected.isna()] = None
+            # ------------->
+
+            result = result.astype(object)
+            pd.testing.assert_frame_equal(result, expected)
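The workaround in patch 04 exists because object columns can hold two different missing-value sentinels. A standalone illustration of why the copied test normalizes `expected` before comparing (toy data, not part of the test suite):

```python
import numpy as np
import pandas as pd

# The unstacked spatialpandas result holds None in its empty slots, while
# the object-dtype expected frame ends up holding np.nan in the same slots.
result = pd.DataFrame({"a": [None]}, dtype=object)
expected = pd.DataFrame({"a": [np.nan]}, dtype=object)

# Both values count as missing...
assert result["a"].isna().all() and expected["a"].isna().all()

# ...but None is not NaN, which the stricter assert_frame_equal described
# in the NOTE comment distinguishes. Normalizing makes the frames match:
expected[expected.isna()] = None
pd.testing.assert_frame_equal(result, expected)
```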
From dfb8a5f39b1cd8dac643afc267cabe38e1bb136a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 18:10:21 +0200
Subject: [PATCH 05/15] Remove dask warning

---
 spatialpandas/dask.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/spatialpandas/dask.py b/spatialpandas/dask.py
index 53d50a0..9ce8d85 100644
--- a/spatialpandas/dask.py
+++ b/spatialpandas/dask.py
@@ -190,7 +190,11 @@ def pack_partitions(self, npartitions=None, p=15, shuffle='tasks'):
 
         # Set index to distance. This will trigger an expensive shuffle
         # sort operation
-        ddf = ddf.set_index('hilbert_distance', npartitions=npartitions, shuffle=shuffle)
+        try:
+            ddf = ddf.set_index('hilbert_distance', npartitions=npartitions, shuffle_method=shuffle)
+        except TypeError:
+            # Changed to shuffle_method in 2024.1, https://github.com/dask/dask/pull/10738
+            ddf = ddf.set_index('hilbert_distance', npartitions=npartitions, shuffle=shuffle)
 
         if ddf.npartitions != npartitions:
             # set_index doesn't change the number of partitions if the partitions
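Patch 05 papers over a keyword rename: dask 2024.1 renamed the `shuffle=` argument of `set_index` to `shuffle_method=` (dask PR #10738, linked in the diff), and the old spelling now emits a deprecation warning that the stricter test configuration turns into a failure. A sketch of the two call shapes, assuming dask >= 2024.1 is installed for the first one:

```python
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame({"hilbert_distance": [2, 0, 1], "value": [1.0, 2.0, 3.0]}),
    npartitions=2,
)

# dask >= 2024.1 spelling; older releases raise TypeError on this keyword,
# which is the fallback path the try/except in the patch relies on:
ddf = ddf.set_index("hilbert_distance", shuffle_method="tasks")

# pre-2024.1 spelling:
#   ddf = ddf.set_index("hilbert_distance", shuffle="tasks")
```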
From 30d0972a3fece0781cc4cb46c1ccff5066276626 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 18:22:31 +0200
Subject: [PATCH 06/15] Add HYPOTHESIS_MAX_EXAMPLES environment variable

---
 spatialpandas/tests/geometry/strategies.py     |  3 ++-
 .../tests/spatialindex/test_hilbert_curve.py   |  3 ++-
 spatialpandas/tests/spatialindex/test_rtree.py |  3 ++-
 spatialpandas/tests/test_parquet.py            | 12 +++++++-----
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/spatialpandas/tests/geometry/strategies.py b/spatialpandas/tests/geometry/strategies.py
index ff52061..a6ffb88 100644
--- a/spatialpandas/tests/geometry/strategies.py
+++ b/spatialpandas/tests/geometry/strategies.py
@@ -1,3 +1,4 @@
+import os
 import numpy as np
 from geopandas import GeoSeries
 from geopandas.array import from_shapely
@@ -11,7 +12,7 @@
 
 hyp_settings = settings(
     deadline=None,
-    max_examples=100,
+    max_examples=int(os.environ.get('HYPOTHESIS_MAX_EXAMPLES', 100)),
     suppress_health_check=[HealthCheck.too_slow],
 )
diff --git a/spatialpandas/tests/spatialindex/test_hilbert_curve.py b/spatialpandas/tests/spatialindex/test_hilbert_curve.py
index 421cd03..e493700 100644
--- a/spatialpandas/tests/spatialindex/test_hilbert_curve.py
+++ b/spatialpandas/tests/spatialindex/test_hilbert_curve.py
@@ -1,3 +1,4 @@
+import os
 from itertools import product
 
 import hypothesis.strategies as st
@@ -18,7 +19,7 @@
     distances_from_coordinates,
 )
 
-hyp_settings = settings(deadline=None)
+hyp_settings = settings(deadline=None, max_examples=int(os.environ.get('HYPOTHESIS_MAX_EXAMPLES', 100)))
 
 # ### strategies ###
 st_p = st.integers(min_value=1, max_value=5)
diff --git a/spatialpandas/tests/spatialindex/test_rtree.py b/spatialpandas/tests/spatialindex/test_rtree.py
index 78a6b2c..3ec03d2 100644
--- a/spatialpandas/tests/spatialindex/test_rtree.py
+++ b/spatialpandas/tests/spatialindex/test_rtree.py
@@ -1,3 +1,4 @@
+import os
 import pickle
 
 import hypothesis.strategies as st
@@ -9,7 +10,7 @@
 from spatialpandas.spatialindex import HilbertRtree
 
 # ### hypothesis settings ###
-hyp_settings = settings(deadline=None)
+hyp_settings = settings(deadline=None, max_examples=int(os.environ.get('HYPOTHESIS_MAX_EXAMPLES', 100)))
 
 # ### Custom strategies ###
diff --git a/spatialpandas/tests/test_parquet.py b/spatialpandas/tests/test_parquet.py
index 0f556ea..92d8414 100644
--- a/spatialpandas/tests/test_parquet.py
+++ b/spatialpandas/tests/test_parquet.py
@@ -1,3 +1,5 @@
+import os
+
 import dask
 import dask.dataframe as dd
 import hypothesis.strategies as hs
@@ -21,7 +23,7 @@
 
 hyp_settings = settings(
     deadline=None,
-    max_examples=100,
+    max_examples=int(os.environ.get('HYPOTHESIS_MAX_EXAMPLES', 100)),
     suppress_health_check=[HealthCheck.too_slow],
 )
@@ -133,7 +135,7 @@ def test_parquet_dask(gp_multipoint, gp_multiline, tmp_path_factory):
     gp_multipoint=st_multipoint_array(min_size=10, max_size=40, geoseries=True),
     gp_multiline=st_multiline_array(min_size=10, max_size=40, geoseries=True),
 )
-@settings(deadline=None, max_examples=30)
+@settings(deadline=None, max_examples=int(os.environ.get('HYPOTHESIS_MAX_EXAMPLES', 30)))
 def test_pack_partitions(gp_multipoint, gp_multiline):
     # Build dataframe
     n = min(len(gp_multipoint), len(gp_multiline))
@@ -172,7 +174,7 @@ def test_pack_partitions(gp_multipoint, gp_multiline):
 )
 @settings(
     deadline=None,
-    max_examples=30,
+    max_examples=int(os.environ.get('HYPOTHESIS_MAX_EXAMPLES', 30)),
     suppress_health_check=[HealthCheck.too_slow],
     phases=[
         Phase.explicit,
@@ -246,7 +248,7 @@ def test_pack_partitions_to_parquet(gp_multipoint, gp_multiline,
     gp_multipoint2=st_multipoint_array(min_size=10, max_size=40, geoseries=True),
     gp_multiline2=st_multiline_array(min_size=10, max_size=40, geoseries=True),
 )
-@settings(deadline=None, max_examples=30, suppress_health_check=[HealthCheck.too_slow])
+@settings(deadline=None, max_examples=int(os.environ.get('HYPOTHESIS_MAX_EXAMPLES', 30)), suppress_health_check=[HealthCheck.too_slow])
 def test_pack_partitions_to_parquet_glob(gp_multipoint1, gp_multiline1,
                                          gp_multipoint2, gp_multiline2,
                                          tmp_path_factory):
@@ -316,7 +318,7 @@ def test_pack_partitions_to_parquet_glob(gp_multipoint1, gp_multiline1,
     gp_multiline2=st_multiline_array(min_size=10, max_size=40, geoseries=True),
     bounds=st_bounds(),
 )
-@settings(deadline=None, max_examples=30, suppress_health_check=[HealthCheck.too_slow])
+@settings(deadline=None, max_examples=int(os.environ.get('HYPOTHESIS_MAX_EXAMPLES', 30)), suppress_health_check=[HealthCheck.too_slow])
 def test_pack_partitions_to_parquet_list_bounds(
         gp_multipoint1, gp_multiline1,
         gp_multipoint2, gp_multiline2,
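Patch 06 threads a single knob through every hypothesis profile: the example budget now comes from the `HYPOTHESIS_MAX_EXAMPLES` environment variable, with the old hard-coded values as defaults. Consolidated, the pattern each test module repeats looks like this (sketch only); a quick local run can then use something like `HYPOTHESIS_MAX_EXAMPLES=10 pytest spatialpandas/tests` while CI keeps the defaults:

```python
import os

from hypothesis import HealthCheck, settings

# Default stays at the previous constant; the environment variable
# overrides it without touching the test code.
hyp_settings = settings(
    deadline=None,
    max_examples=int(os.environ.get("HYPOTHESIS_MAX_EXAMPLES", 100)),
    suppress_health_check=[HealthCheck.too_slow],
)
```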
From 6128bf8c0836eec1df6483fb51f9139a2cbb7fbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 18:28:45 +0200
Subject: [PATCH 07/15] Set filterwarnings to error

---
 pyproject.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index dd6b5cc..bf15e9b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,7 +61,9 @@ addopts = [
 minversion = "7"
 xfail_strict = true
 log_cli_level = "INFO"
-filterwarnings = []
+filterwarnings = [
+    "error",
+]
 
 [tool.ruff]
 fix = true

From 245466a665c0f98bf3d1ecdb97fec5d363398fc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 18:37:52 +0200
Subject: [PATCH 08/15] Add packaging to required dependencies

---
 pixi.toml      |  1 +
 pyproject.toml | 14 ++++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/pixi.toml b/pixi.toml
index c8b0d20..b38a4b0 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -24,6 +24,7 @@ lint = ["py311", "lint"]
 dask-core = "*"
 fsspec = "*"
 numba = "*"
+packaging = "*"
 pandas = "*"
 pip = "*"
 pyarrow = ">=10,<15" # FIX: Temporary upper pin
diff --git a/pyproject.toml b/pyproject.toml
index bf15e9b..4a78079 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,15 @@ classifiers = [
     "Topic :: Scientific/Engineering",
     "Topic :: Software Development :: Libraries",
 ]
-dependencies = ['dask', 'fsspec >=2022.8', 'numba', 'pandas', 'pyarrow >=10', 'retrying']
+dependencies = [
+    'dask',
+    'fsspec >=2022.8',
+    'numba',
+    'packaging',
+    'pandas',
+    'pyarrow >=10',
+    'retrying',
+]
 
 [project.urls]
 Homepage = "https://github.com/holoviz/spatialpandas"
@@ -61,9 +69,7 @@ addopts = [
 minversion = "7"
 xfail_strict = true
 log_cli_level = "INFO"
-filterwarnings = [
-    "error",
-]
+filterwarnings = ["error"]
 
 [tool.ruff]
 fix = true
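Patches 07, 08, and 11 tighten the pytest configuration: `filterwarnings = ["error"]` promotes every warning raised during the test run to an exception (patch 11 below adds back a targeted ignore for a known botocore deprecation). A standalone illustration of the effect:

```python
import warnings

import pytest

def test_warning_becomes_error():
    # With filterwarnings = ["error"] active, warnings.warn() raises the
    # warning class as an exception instead of printing it, so a stray
    # DeprecationWarning fails the suite immediately.
    with pytest.raises(DeprecationWarning):
        warnings.warn("old API", DeprecationWarning)
```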
From 7d3e00c6343bcd91dbad42667c0bcea5e54d89a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 18:47:35 +0200
Subject: [PATCH 09/15] Unpin pyarrow

---
 pixi.toml                   |  2 +-
 spatialpandas/io/parquet.py | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/pixi.toml b/pixi.toml
index b38a4b0..e0fd681 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -27,7 +27,7 @@ numba = "*"
 packaging = "*"
 pandas = "*"
 pip = "*"
-pyarrow = ">=10,<15" # FIX: Temporary upper pin
+pyarrow = ">=10"
 retrying = "*"
 
 [feature.py39.dependencies]
diff --git a/spatialpandas/io/parquet.py b/spatialpandas/io/parquet.py
index 2c5e2d7..3706622 100644
--- a/spatialpandas/io/parquet.py
+++ b/spatialpandas/io/parquet.py
@@ -30,7 +30,9 @@ PANDAS_GE_12 = Version(pd.__version__) >= Version("1.2.0")
 
 # When we drop support for pyarrow < 5 all code related to this can be removed.
-LEGACY_PYARROW = Version(pa.__version__) < Version("5.0.0")
+_pa_version = Version(pa.__version__)
+PYARROW_LT_5 = _pa_version < Version("5.0.0")
+PYARROW_GE_15 = _pa_version >= Version("15.0.0")
 
 
 def _load_parquet_pandas_metadata(
     path,
@@ -45,8 +47,10 @@ def _load_parquet_pandas_metadata(
         raise ValueError("Path not found: " + path)
 
     if filesystem.isdir(path):
-        if LEGACY_PYARROW:
+        if PYARROW_LT_5:
             basic_kwargs = dict(validate_schema=False)
+        elif PYARROW_GE_15:
+            basic_kwargs = {}
         else:
             basic_kwargs = dict(use_legacy_dataset=False)
 
@@ -57,7 +61,7 @@ def _load_parquet_pandas_metadata(
             **engine_kwargs,
         )
 
-        if LEGACY_PYARROW:
+        if PYARROW_LT_5:
             common_metadata = pqds.common_metadata
             if common_metadata is None:
                 # Get metadata for first piece
@@ -128,8 +132,10 @@ def read_parquet(
     engine_kwargs = engine_kwargs or {}
     filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
 
-    if LEGACY_PYARROW:
+    if PYARROW_LT_5:
         basic_kwargs = dict(validate_schema=False)
+    elif PYARROW_GE_15:
+        basic_kwargs = {}
     else:
         basic_kwargs = dict(use_legacy_dataset=False)
 
@@ -142,7 +148,7 @@ def read_parquet(
         **kwargs,
     )
 
-    if LEGACY_PYARROW:
+    if PYARROW_LT_5:
         metadata = _load_parquet_pandas_metadata(
             path,
             filesystem=filesystem,
@@ -338,8 +344,10 @@ def _perform_read_parquet_dask(
         filesystem,
         storage_options,
     )
-    if LEGACY_PYARROW:
+    if PYARROW_LT_5:
         basic_kwargs = dict(validate_schema=False)
+    elif PYARROW_GE_15:
+        basic_kwargs = {}
     else:
         basic_kwargs = dict(use_legacy_dataset=False)
 
@@ -420,7 +428,7 @@ def _perform_read_parquet_dask(
     else:
         cols_no_index = None
 
-    if LEGACY_PYARROW:
+    if PYARROW_LT_5:
         files = paths
     else:
         files = getattr(datasets[0], "files", paths)
@@ -524,7 +532,7 @@ def _read_metadata(filename, filesystem):
 def _load_partition_bounds(pqds, filesystem=None):
     partition_bounds = None
 
-    if LEGACY_PYARROW:
+    if PYARROW_LT_5:
         common_metadata = pqds.common_metadata
     else:
         filename = "/".join([_get_parent_path(pqds.files[0]), "_common_metadata"])
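Patch 09 replaces the pyarrow upper pin with version gates: `use_legacy_dataset=False` was only meaningful on the old 5-14 series, and pyarrow 15 deprecated the keyword, so on new versions `ParquetDataset` is simply called without it. A sketch of the modern call path that the `PYARROW_GE_15` branch relies on (assumes some existing `data.parquet`):

```python
import pyarrow.parquet as pq

# On pyarrow >= 15, ParquetDataset always uses the new Dataset API, and
# the pandas metadata comes straight from the unified schema.
dataset = pq.ParquetDataset("data.parquet")
pandas_meta = dataset.schema.pandas_metadata  # dict, or None if absent
```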
From 8eddfd8a706d772380ae175b6726ba98b9cf31a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 18:50:39 +0200
Subject: [PATCH 10/15] Use version for dask warnings

---
 spatialpandas/dask.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/spatialpandas/dask.py b/spatialpandas/dask.py
index 9ce8d85..117b6b4 100644
--- a/spatialpandas/dask.py
+++ b/spatialpandas/dask.py
@@ -9,6 +9,7 @@
 import pyarrow as pa
 import pyarrow.parquet as pq
 from retrying import retry
+from packaging.version import Version
 
 import dask
 import dask.dataframe as dd
@@ -190,11 +191,11 @@ def pack_partitions(self, npartitions=None, p=15, shuffle='tasks'):
 
         # Set index to distance. This will trigger an expensive shuffle
         # sort operation
-        try:
-            ddf = ddf.set_index('hilbert_distance', npartitions=npartitions, shuffle_method=shuffle)
-        except TypeError:
-            # Changed to shuffle_method in 2024.1, https://github.com/dask/dask/pull/10738
-            ddf = ddf.set_index('hilbert_distance', npartitions=npartitions, shuffle=shuffle)
+        if Version(dask.__version__) >= Version('2024.1'):
+            shuffle_kwargs = {'shuffle_method': shuffle}
+        else:
+            shuffle_kwargs = {'shuffle': shuffle}
+        ddf = ddf.set_index('hilbert_distance', npartitions=npartitions, **shuffle_kwargs)
 
         if ddf.npartitions != npartitions:
             # set_index doesn't change the number of partitions if the partitions

From c5143cee4d0ba728b6fb84f133df98037af6dcbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Wed, 7 Aug 2024 18:58:55 +0200
Subject: [PATCH 11/15] Add filterwarning

---
 pyproject.toml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4a78079..e0ed52e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -69,7 +69,10 @@ addopts = [
 minversion = "7"
 xfail_strict = true
 log_cli_level = "INFO"
-filterwarnings = ["error"]
+filterwarnings = [
+    "error",
+    "ignore:datetime.datetime.utcnow():DeprecationWarning:botocore", # https://github.com/boto/boto3/issues/3889
+]
 
 [tool.ruff]
 fix = true

From fcd7aa343c09341b1771d8eb3562eba45ecea724 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Thu, 8 Aug 2024 12:08:20 +0200
Subject: [PATCH 12/15] Set geometry

---
 spatialpandas/geodataframe.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spatialpandas/geodataframe.py b/spatialpandas/geodataframe.py
index 304091f..bce0d60 100644
--- a/spatialpandas/geodataframe.py
+++ b/spatialpandas/geodataframe.py
@@ -67,7 +67,10 @@ def _constructor_from_mgr(self, mgr, axes):
         if not any(isinstance(block.dtype, GeometryDtype) for block in mgr.blocks):
             return pd.DataFrame._from_mgr(mgr, axes)
 
-        return GeoDataFrame._from_mgr(mgr, axes)
+        gdf = GeoDataFrame._from_mgr(mgr, axes)
+        if (gdf.columns == "geometry").sum() == 1:  # only if "geometry" is single col
+            gdf._geometry = "geometry"
+        return gdf
 
     @property
     def _constructor_sliced(self):

From 7c2e2beb262eb65b32dab9af069b91c5dc9058df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Thu, 8 Aug 2024 12:22:02 +0200
Subject: [PATCH 13/15] Add _constructor_expanddim_from_mgr

---
 spatialpandas/geoseries.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/spatialpandas/geoseries.py b/spatialpandas/geoseries.py
index 0d60856..aeeb768 100644
--- a/spatialpandas/geoseries.py
+++ b/spatialpandas/geoseries.py
@@ -52,6 +52,22 @@ def _constructor_expanddim(self):
         from .geodataframe import GeoDataFrame
         return GeoDataFrame
 
+    def _constructor_expanddim_from_mgr(self, mgr, axes):
+        from .geodataframe import GeoDataFrame
+        df = pd.DataFrame._from_mgr(mgr, axes)
+        geo_cols = [col for col in df.columns if isinstance(df[col].dtype, GeometryDtype)]
+        if geo_cols:
+            if len(geo_cols) == 1:
+                geo_col_name = geo_cols
+            else:
+                geo_col_name = None
+
+            if geo_col_name is None or not isinstance(getattr(df, "dtype", None), GeometryDtype):
+                df = GeoDataFrame(df)
+            else:
+                df = df.set_geometry(geo_col_name)
+        return df
+
     @property
     def bounds(self):
         return pd.DataFrame(
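Patches 12 and 13 round out the constructor hooks: `_constructor_from_mgr` on GeoDataFrame now re-registers a lone "geometry" column as the active geometry, and `_constructor_expanddim_from_mgr` decides whether expanding a Series into a DataFrame should produce a GeoDataFrame. A behavioral sketch (assumes a working install; not part of the patch series):

```python
from spatialpandas import GeoDataFrame, GeoSeries
from spatialpandas.geometry import PointArray

s = GeoSeries(PointArray([[0, 0], [1, 1]]), name="geometry")

# to_frame() expands through _constructor_expanddim_from_mgr; since the
# resulting frame contains a geometry-typed column, the hook hands back a
# GeoDataFrame rather than a plain pandas DataFrame.
df = s.to_frame()
assert isinstance(df, GeoDataFrame)
```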
From f1c8a7f98e1b9ee8b769694b57b85b136c8cd1d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Thu, 8 Aug 2024 12:32:10 +0200
Subject: [PATCH 14/15] Remove legacy pyarrow support

---
 spatialpandas/io/parquet.py | 83 +++++++------------------------------
 1 file changed, 15 insertions(+), 68 deletions(-)

diff --git a/spatialpandas/io/parquet.py b/spatialpandas/io/parquet.py
index 3706622..3ff4578 100644
--- a/spatialpandas/io/parquet.py
+++ b/spatialpandas/io/parquet.py
@@ -8,7 +8,6 @@
 
 import fsspec
 import pandas as pd
-import pyarrow as pa
 from dask import delayed
 from dask.dataframe import from_delayed, from_pandas
 from dask.dataframe import read_parquet as dd_read_parquet
@@ -29,11 +28,6 @@
 # improve pandas compatibility, based on geopandas _compat.py
 PANDAS_GE_12 = Version(pd.__version__) >= Version("1.2.0")
 
-# When we drop support for pyarrow < 5 all code related to this can be removed.
-_pa_version = Version(pa.__version__)
-PYARROW_LT_5 = _pa_version < Version("5.0.0")
-PYARROW_GE_15 = _pa_version >= Version("15.0.0")
-
 
 def _load_parquet_pandas_metadata(
     path,
@@ -45,37 +41,20 @@ def _load_parquet_pandas_metadata(
         raise ValueError("Path not found: " + path)
 
     if filesystem.isdir(path):
-        if PYARROW_LT_5:
-            basic_kwargs = dict(validate_schema=False)
-        elif PYARROW_GE_15:
-            basic_kwargs = {}
-        else:
-            basic_kwargs = dict(use_legacy_dataset=False)
-
         pqds = ParquetDataset(
             path,
            filesystem=filesystem,
-            **basic_kwargs,
             **engine_kwargs,
         )
 
-        if PYARROW_LT_5:
-            common_metadata = pqds.common_metadata
-            if common_metadata is None:
-                # Get metadata for first piece
-                piece = pqds.pieces[0]
-                metadata = piece.get_metadata().metadata
-            else:
-                metadata = pqds.common_metadata.metadata
-        else:
-            filename = "/".join([_get_parent_path(pqds.files[0]), "_common_metadata"])
-            try:
-                common_metadata = _read_metadata(filename, filesystem=filesystem)
-            except FileNotFoundError:
-                # Common metadata doesn't exist, so get metadata for first piece instead
-                filename = pqds.files[0]
-                common_metadata = _read_metadata(filename, filesystem=filesystem)
-            metadata = common_metadata.metadata
+        filename = "/".join([_get_parent_path(pqds.files[0]), "_common_metadata"])
+        try:
+            common_metadata = _read_metadata(filename, filesystem=filesystem)
+        except FileNotFoundError:
+            # Common metadata doesn't exist, so get metadata for first piece instead
+            filename = pqds.files[0]
+            common_metadata = _read_metadata(filename, filesystem=filesystem)
+        metadata = common_metadata.metadata
     else:
         with filesystem.open(path) as f:
             pf = ParquetFile(f)
@@ -132,31 +109,15 @@ def read_parquet(
     engine_kwargs = engine_kwargs or {}
     filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
 
-    if PYARROW_LT_5:
-        basic_kwargs = dict(validate_schema=False)
-    elif PYARROW_GE_15:
-        basic_kwargs = {}
-    else:
-        basic_kwargs = dict(use_legacy_dataset=False)
-
     # Load using pyarrow to handle parquet files and directories across filesystems
     dataset = ParquetDataset(
         path,
         filesystem=filesystem,
-        **basic_kwargs,
         **engine_kwargs,
         **kwargs,
     )
 
-    if PYARROW_LT_5:
-        metadata = _load_parquet_pandas_metadata(
-            path,
-            filesystem=filesystem,
-            storage_options=storage_options,
-            engine_kwargs=engine_kwargs,
-        )
-    else:
-        metadata = dataset.schema.pandas_metadata
+    metadata = dataset.schema.pandas_metadata
 
     # If columns specified, prepend index columns to it
     if columns is not None:
@@ -344,13 +305,6 @@ def _perform_read_parquet_dask(
         filesystem,
         storage_options,
     )
-    if PYARROW_LT_5:
-        basic_kwargs = dict(validate_schema=False)
-    elif PYARROW_GE_15:
-        basic_kwargs = {}
-    else:
-        basic_kwargs = dict(use_legacy_dataset=False)
-
     datasets = []
     for path in paths:
        if filesystem.isdir(path):
@@ -361,7 +315,6 @@ def _perform_read_parquet_dask(
             d = ParquetDataset(
                 path,
                 filesystem=filesystem,
-                **basic_kwargs,
                 **engine_kwargs,
             )
             datasets.append(d)
@@ -428,10 +381,7 @@ def _perform_read_parquet_dask(
     else:
         cols_no_index = None
 
-    if PYARROW_LT_5:
-        files = paths
-    else:
-        files = getattr(datasets[0], "files", paths)
+    files = getattr(datasets[0], "files", paths)
 
     meta = dd_read_parquet(
         files[0],
@@ -524,14 +482,11 @@ def _read_metadata(filename, filesystem):
 def _load_partition_bounds(pqds, filesystem=None):
     partition_bounds = None
 
-    if PYARROW_LT_5:
-        common_metadata = pqds.common_metadata
-    else:
-        filename = "/".join([_get_parent_path(pqds.files[0]), "_common_metadata"])
-        try:
-            common_metadata = _read_metadata(filename, filesystem=filesystem)
-        except FileNotFoundError:
-            common_metadata = None
+    filename = "/".join([_get_parent_path(pqds.files[0]), "_common_metadata"])
+    try:
+        common_metadata = _read_metadata(filename, filesystem=filesystem)
+    except FileNotFoundError:
+        common_metadata = None
 
     if common_metadata is not None and b'spatialpandas' in common_metadata.metadata:
         spatial_metadata = json.loads(

From 6b4353c5ef4692f9a80a8192bd8d06d00e79b9ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?=
Date: Thu, 8 Aug 2024 22:07:05 +0200
Subject: [PATCH 15/15] Apply suggestions from code review

---
 spatialpandas/geoseries.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/spatialpandas/geoseries.py b/spatialpandas/geoseries.py
index aeeb768..8354ea1 100644
--- a/spatialpandas/geoseries.py
+++ b/spatialpandas/geoseries.py
@@ -57,12 +57,9 @@ def _constructor_expanddim_from_mgr(self, mgr, axes):
         df = pd.DataFrame._from_mgr(mgr, axes)
         geo_cols = [col for col in df.columns if isinstance(df[col].dtype, GeometryDtype)]
         if geo_cols:
-            if len(geo_cols) == 1:
-                geo_col_name = geo_cols
-            else:
-                geo_col_name = None
+            geo_col_name = geo_cols if len(geo_cols) == 1 else None
 
-            if geo_col_name is None or not isinstance(getattr(df, "dtype", None), GeometryDtype):
+            if geo_col_name is None or not isinstance(getattr(df[geo_col_name], "dtype", None), GeometryDtype):
                 df = GeoDataFrame(df)
             else:
                 df = df.set_geometry(geo_col_name)