diff --git a/earthaccess/dmrpp_zarr.py b/earthaccess/dmrpp_zarr.py index 79902868..ee336bb5 100644 --- a/earthaccess/dmrpp_zarr.py +++ b/earthaccess/dmrpp_zarr.py @@ -92,8 +92,8 @@ def open_virtual_mfdataset( import xarray as xr if access == "direct": - fs = earthaccess.get_s3_filesystem(results=granules[0]) - fs.storage_options["anon"] = False # type: ignore + fs = earthaccess.get_s3_filesystem(results=granules) # type: ignore + fs.storage_options["anon"] = False else: fs = earthaccess.get_fsspec_https_session() if parallel: @@ -114,7 +114,7 @@ def open_virtual_mfdataset( filetype="dmrpp", # type: ignore group=group, indexes={}, - reader_options={"storage_options": fs.storage_options}, # type: ignore + reader_options={"storage_options": fs.storage_options}, ) ) if preprocess is not None: @@ -127,6 +127,7 @@ def open_virtual_mfdataset( vds = xr.combine_nested(vdatasets, **xr_combine_nested_kwargs) if load: refs = vds.virtualize.to_kerchunk(filepath=None, format="dict") + protocol = "s3" if "s3" in fs.protocol else fs.protocol return xr.open_dataset( "reference://", engine="zarr", @@ -135,8 +136,8 @@ def open_virtual_mfdataset( "consolidated": False, "storage_options": { "fo": refs, # codespell:ignore - "remote_protocol": fs.protocol, - "remote_options": fs.storage_options, # type: ignore + "remote_protocol": protocol, + "remote_options": fs.storage_options, }, }, ) diff --git a/earthaccess/kerchunk.py b/earthaccess/kerchunk.py index 9ee40dec..cd570f89 100644 --- a/earthaccess/kerchunk.py +++ b/earthaccess/kerchunk.py @@ -6,6 +6,7 @@ import fsspec.utils import s3fs +# import ipdb import earthaccess @@ -15,12 +16,19 @@ def _get_chunk_metadata( ) -> list[dict]: from kerchunk.hdf import SingleHdf5ToZarr + if not isinstance(granule, earthaccess.DataGranule) and isinstance(granule, dict): + # WHY: dask serialization is doing something weird, it serializes the granule as a simple dict + # we need to add cast it back to a datagranule to get the nice methods for parsing the data links + # TODO: ask James what is going on + granule = earthaccess.DataGranule(granule) + metadata = [] access = "direct" if isinstance(fs, s3fs.S3FileSystem) else "indirect" + # ipdb.set_trace() for url in granule.data_links(access=access): with fs.open(url) as inf: - h5chunks = SingleHdf5ToZarr(inf, url) + h5chunks = SingleHdf5ToZarr(inf, url) # type: ignore m = h5chunks.translate() metadata.append(m) @@ -50,6 +58,8 @@ def consolidate_metadata( # Get metadata for each granule get_chunk_metadata = dask.delayed(_get_chunk_metadata) # type: ignore + + # ipdb.set_trace() chunks = dask.compute(*[get_chunk_metadata(g, fs) for g in granules]) # type: ignore chunks = sum(chunks, start=[]) diff --git a/pyproject.toml b/pyproject.toml index 6e227789..11ec34a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,15 +57,19 @@ Changelog = "https://github.com/nsidc/earthaccess/blob/main/CHANGELOG.md" [project.optional-dependencies] kerchunk = [ - "numpy >=1.26.4", "kerchunk", "dask", "h5py >=3.6.0", "h5netcdf", "xarray", + "zarr >=2.12.0, <3.0.0a", ] virtualizarr = [ - "virtualizarr >=1.2.0" + "numpy >=1.26.4", + "zarr >=2.12.0, <3.0.0a", + "virtualizarr >=1.2.0", + "dask", + "h5py >=3.6.0", ] dev = [ "bump-my-version >=0.10.0", @@ -75,6 +79,8 @@ dev = [ "uv >=0.4.7", ] test = [ + "zarr >=2.12.0, <3.0.0a", + "numpy >=1.26.4", "mypy >=1.11.2", "pytest >=8.3", "pytest-cov >=5.0", diff --git a/tests/integration/test_virtualizarr.py b/tests/integration/test_virtualizarr.py index 95128a76..166263c4 100644 --- a/tests/integration/test_virtualizarr.py +++ b/tests/integration/test_virtualizarr.py @@ -15,7 +15,14 @@ logger.info(f"earthaccess version: {earthaccess.__version__}") -@pytest.fixture(scope="module", params=["MUR25-JPL-L4-GLOB-v04.2"]) +@pytest.fixture( + scope="module", + params=[ + "MUR25-JPL-L4-GLOB-v04.2", + "AVHRR_OI-NCEI-L4-GLOB-v2.1", + "M2T1NXSLV", + ], +) def granule(request): granules = earthaccess.search_data( count=1, temporal=("2024"), short_name=request.param @@ -23,33 +30,9 @@ def granule(request): return granules[0] -def test_dmrpp(granule): - from virtualizarr import open_virtual_dataset # type: ignore - - fs = earthaccess.get_fsspec_https_session() - data_path = granule.data_links(access="indirect")[0] - dmrpp_path = data_path + ".dmrpp" - - result = open_virtual_dataset( - dmrpp_path, - filetype="dmrpp", # type: ignore - indexes={}, - reader_options={"storage_options": fs.storage_options}, # type: ignore - ) - - expected = open_virtual_dataset( - data_path, - indexes={}, - reader_options={"storage_options": fs.storage_options}, # type: ignore - ) - - # TODO: replace with xr.testing when virtualizarr fill_val is fixed (https://github.com/zarr-developers/VirtualiZarr/issues/287) - # and dmrpp deflateLevel (zlib compression level) is always present (https://github.com/OPENDAP/bes/issues/954) - for var in result.variables: - assert var in expected.variables - assert result[var].dims == expected[var].dims - assert result[var].shape == expected[var].shape - assert result[var].dtype == expected[var].dtype - assert result[var].data.manifest == expected[var].data.manifest - assert set(result.coords) == set(expected.coords) - assert result.attrs == expected.attrs +def test_open_virtual_dataset(granule): + # Simply check that the dmrpp can be found, parsed, and loaded. Actual parser result is checked in virtualizarr + vds = earthaccess.open_virtual_dataset(granule, load=False) + assert vds is not None + vds_load = earthaccess.open_virtual_dataset(granule, load=True) + assert vds_load is not None diff --git a/uv.lock b/uv.lock index 6e6acce3..351229a8 100644 --- a/uv.lock +++ b/uv.lock @@ -309,7 +309,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, - { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "platform_python_implementation == 'PyPy' or python_full_version >= '3.12'" }, + { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or platform_python_implementation == 'PyPy'" }, { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12' and platform_python_implementation != 'PyPy'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/8d/4f/11d2d314f0bdbe7ff975737d125e1a5357115afe28fcc64f13e68b05ba61/botocore-1.35.36.tar.gz", hash = "sha256:354ec1b766f0029b5d6ff0c45d1a0f9e5007b7d2f3ec89bcdd755b208c5bc797", size = 12808757 } @@ -538,7 +538,7 @@ name = "click" version = "8.1.7" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "platform_system == 'Windows'" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/96/d3/f04c7bfcf5c1862a2a5b845c6b2b360488cf47af55dfa79c98f6a6bf98b5/click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de", size = 336121 } wheels = [ @@ -849,8 +849,8 @@ kerchunk = [ { name = "h5netcdf" }, { name = "h5py" }, { name = "kerchunk" }, - { name = "numpy" }, { name = "xarray" }, + { name = "zarr" }, ] test = [ { name = "dask" }, @@ -864,15 +864,20 @@ test = [ { name = "pytest-watch" }, { name = "python-magic" }, { name = "responses" }, - { name = "types-requests", version = "2.31.0.6", source = { registry = "https://pypi.org/simple" }, marker = "platform_python_implementation == 'PyPy' or python_full_version >= '3.12'" }, + { name = "types-requests", version = "2.31.0.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or platform_python_implementation == 'PyPy'" }, { name = "types-requests", version = "2.32.0.20241016", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12' and platform_python_implementation != 'PyPy'" }, { name = "types-setuptools" }, { name = "vcrpy" }, { name = "virtualizarr" }, { name = "xarray" }, + { name = "zarr" }, ] virtualizarr = [ + { name = "dask" }, + { name = "h5py" }, + { name = "numpy" }, { name = "virtualizarr" }, + { name = "zarr" }, ] [package.metadata] @@ -881,12 +886,14 @@ requires-dist = [ { name = "cftime", marker = "extra == 'docs'", specifier = ">=1.6.4" }, { name = "dask", marker = "extra == 'docs'", specifier = ">=2024.8.0" }, { name = "dask", marker = "extra == 'kerchunk'" }, + { name = "dask", marker = "extra == 'virtualizarr'" }, { name = "earthaccess", extras = ["kerchunk"], marker = "extra == 'test'" }, { name = "earthaccess", extras = ["virtualizarr"], marker = "extra == 'test'" }, { name = "fsspec", specifier = ">=2022.11" }, { name = "h5netcdf", marker = "extra == 'docs'", specifier = ">=0.11" }, { name = "h5netcdf", marker = "extra == 'kerchunk'" }, { name = "h5py", marker = "extra == 'kerchunk'", specifier = ">=3.6.0" }, + { name = "h5py", marker = "extra == 'virtualizarr'", specifier = ">=3.6.0" }, { name = "importlib-resources", specifier = ">=6.3.2" }, { name = "ipywidgets", marker = "extra == 'docs'", specifier = ">=7.7.0" }, { name = "jupyterlab", marker = "extra == 'docs'", specifier = ">=3" }, @@ -903,7 +910,8 @@ requires-dist = [ { name = "multimethod", specifier = ">=1.8" }, { name = "mypy", marker = "extra == 'test'", specifier = ">=1.11.2" }, { name = "nox", marker = "extra == 'dev'" }, - { name = "numpy", marker = "extra == 'kerchunk'", specifier = ">=1.26.4" }, + { name = "numpy", marker = "extra == 'test'", specifier = ">=1.26.4" }, + { name = "numpy", marker = "extra == 'virtualizarr'", specifier = ">=1.26.4" }, { name = "pqdm", specifier = ">=0.1" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=2.4" }, { name = "pygments", marker = "extra == 'docs'", specifier = ">=2.11.1" }, @@ -929,6 +937,9 @@ requires-dist = [ { name = "widgetsnbextension", marker = "extra == 'docs'", specifier = ">=3.6.0" }, { name = "xarray", marker = "extra == 'docs'", specifier = ">=2023.1" }, { name = "xarray", marker = "extra == 'kerchunk'" }, + { name = "zarr", marker = "extra == 'kerchunk'", specifier = ">=2.12.0,<3.0.0a0" }, + { name = "zarr", marker = "extra == 'test'", specifier = ">=2.12.0,<3.0.0a0" }, + { name = "zarr", marker = "extra == 'virtualizarr'", specifier = ">=2.12.0,<3.0.0a0" }, ] [[package]] @@ -1261,7 +1272,7 @@ name = "ipykernel" version = "6.29.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "appnope", marker = "platform_system == 'Darwin'" }, + { name = "appnope", marker = "sys_platform == 'darwin'" }, { name = "comm" }, { name = "debugpy" }, { name = "ipython" }, @@ -2005,7 +2016,7 @@ version = "1.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, - { name = "colorama", marker = "platform_system == 'Windows'" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, { name = "ghp-import" }, { name = "jinja2" }, { name = "markdown" }, @@ -3368,7 +3379,7 @@ dependencies = [ { name = "certifi" }, { name = "charset-normalizer" }, { name = "idna" }, - { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "platform_python_implementation == 'PyPy' or python_full_version >= '3.12'" }, + { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or platform_python_implementation == 'PyPy'" }, { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12' and platform_python_implementation != 'PyPy'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } @@ -3383,7 +3394,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pyyaml" }, { name = "requests" }, - { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "platform_python_implementation == 'PyPy' or python_full_version >= '3.12'" }, + { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or platform_python_implementation == 'PyPy'" }, { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12' and platform_python_implementation != 'PyPy'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/67/24/1d67c8974daa502e860b4a5b57ad6de0d7dbc0b1160ef7148189a24a40e1/responses-0.25.3.tar.gz", hash = "sha256:617b9247abd9ae28313d57a75880422d55ec63c29d33d629697590a034358dba", size = 77798 } @@ -3695,7 +3706,7 @@ name = "tqdm" version = "4.67.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "platform_system == 'Windows'" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/e8/4f/0153c21dc5779a49a0598c445b1978126b1344bab9ee71e53e44877e14e0/tqdm-4.67.0.tar.gz", hash = "sha256:fe5a6f95e6fe0b9755e9469b77b9c3cf850048224ecaa8293d7d2d31f97d869a", size = 169739 } wheels = [ @@ -3733,7 +3744,7 @@ resolution-markers = [ "python_full_version >= '3.13' and platform_python_implementation != 'PyPy'", ] dependencies = [ - { name = "types-urllib3", marker = "platform_python_implementation == 'PyPy' or python_full_version >= '3.12'" }, + { name = "types-urllib3", marker = "python_full_version >= '3.12' or platform_python_implementation == 'PyPy'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f9/b8/c1e8d39996b4929b918aba10dba5de07a8b3f4c8487bb61bb79882544e69/types-requests-2.31.0.6.tar.gz", hash = "sha256:cd74ce3b53c461f1228a9b783929ac73a666658f223e28ed29753771477b3bd0", size = 15535 } wheels = [ @@ -3928,7 +3939,7 @@ version = "6.0.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pyyaml" }, - { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "platform_python_implementation == 'PyPy' or python_full_version >= '3.12'" }, + { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or platform_python_implementation == 'PyPy'" }, { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12' and platform_python_implementation != 'PyPy'" }, { name = "wrapt" }, { name = "yarl" },