Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix small open_virtual_dataset bugs #923

Merged
merged 6 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions earthaccess/dmrpp_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ def open_virtual_mfdataset(
import xarray as xr

if access == "direct":
fs = earthaccess.get_s3_filesystem(results=granules[0])
fs.storage_options["anon"] = False # type: ignore
fs = earthaccess.get_s3_filesystem(results=granules) # type: ignore
fs.storage_options["anon"] = False
else:
fs = earthaccess.get_fsspec_https_session()
if parallel:
Expand All @@ -114,7 +114,7 @@ def open_virtual_mfdataset(
filetype="dmrpp", # type: ignore
group=group,
indexes={},
reader_options={"storage_options": fs.storage_options}, # type: ignore
reader_options={"storage_options": fs.storage_options},
)
)
if preprocess is not None:
Expand All @@ -127,6 +127,7 @@ def open_virtual_mfdataset(
vds = xr.combine_nested(vdatasets, **xr_combine_nested_kwargs)
if load:
refs = vds.virtualize.to_kerchunk(filepath=None, format="dict")
protocol = "s3" if "s3" in fs.protocol else fs.protocol
return xr.open_dataset(
"reference://",
engine="zarr",
Expand All @@ -135,8 +136,8 @@ def open_virtual_mfdataset(
"consolidated": False,
"storage_options": {
"fo": refs, # codespell:ignore
"remote_protocol": fs.protocol,
"remote_options": fs.storage_options, # type: ignore
"remote_protocol": protocol,
"remote_options": fs.storage_options,
},
},
)
Expand Down
12 changes: 11 additions & 1 deletion earthaccess/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import fsspec.utils
import s3fs

# import ipdb
import earthaccess


Expand All @@ -15,12 +16,19 @@ def _get_chunk_metadata(
) -> list[dict]:
from kerchunk.hdf import SingleHdf5ToZarr

if not isinstance(granule, earthaccess.DataGranule) and isinstance(granule, dict):
# WHY: dask serialization is doing something weird, it serializes the granule as a simple dict
# we need to add cast it back to a datagranule to get the nice methods for parsing the data links
# TODO: ask James what is going on
granule = earthaccess.DataGranule(granule)

metadata = []
access = "direct" if isinstance(fs, s3fs.S3FileSystem) else "indirect"
# ipdb.set_trace()

for url in granule.data_links(access=access):
with fs.open(url) as inf:
h5chunks = SingleHdf5ToZarr(inf, url)
h5chunks = SingleHdf5ToZarr(inf, url) # type: ignore
m = h5chunks.translate()
metadata.append(m)

Expand Down Expand Up @@ -50,6 +58,8 @@ def consolidate_metadata(

# Get metadata for each granule
get_chunk_metadata = dask.delayed(_get_chunk_metadata) # type: ignore

# ipdb.set_trace()
chunks = dask.compute(*[get_chunk_metadata(g, fs) for g in granules]) # type: ignore
chunks = sum(chunks, start=[])

Expand Down
10 changes: 8 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,19 @@ Changelog = "https://github.com/nsidc/earthaccess/blob/main/CHANGELOG.md"

[project.optional-dependencies]
kerchunk = [
"numpy >=1.26.4",
"kerchunk",
"dask",
"h5py >=3.6.0",
"h5netcdf",
"xarray",
"zarr >=2.12.0, <3.0.0a",
]
virtualizarr = [
"virtualizarr >=1.2.0"
"numpy >=1.26.4",
"zarr >=2.12.0, <3.0.0a",
"virtualizarr >=1.2.0",
"dask",
"h5py >=3.6.0",
]
dev = [
"bump-my-version >=0.10.0",
Expand All @@ -75,6 +79,8 @@ dev = [
"uv >=0.4.7",
]
test = [
"zarr >=2.12.0, <3.0.0a",
"numpy >=1.26.4",
"mypy >=1.11.2",
"pytest >=8.3",
"pytest-cov >=5.0",
Expand Down
45 changes: 14 additions & 31 deletions tests/integration/test_virtualizarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,41 +15,24 @@
logger.info(f"earthaccess version: {earthaccess.__version__}")


@pytest.fixture(scope="module", params=["MUR25-JPL-L4-GLOB-v04.2"])
@pytest.fixture(
scope="module",
params=[
"MUR25-JPL-L4-GLOB-v04.2",
"AVHRR_OI-NCEI-L4-GLOB-v2.1",
"M2T1NXSLV",
],
)
def granule(request):
granules = earthaccess.search_data(
count=1, temporal=("2024"), short_name=request.param
)
return granules[0]


def test_dmrpp(granule):
from virtualizarr import open_virtual_dataset # type: ignore

fs = earthaccess.get_fsspec_https_session()
data_path = granule.data_links(access="indirect")[0]
dmrpp_path = data_path + ".dmrpp"

result = open_virtual_dataset(
dmrpp_path,
filetype="dmrpp", # type: ignore
indexes={},
reader_options={"storage_options": fs.storage_options}, # type: ignore
)

expected = open_virtual_dataset(
data_path,
indexes={},
reader_options={"storage_options": fs.storage_options}, # type: ignore
)

# TODO: replace with xr.testing when virtualizarr fill_val is fixed (https://github.com/zarr-developers/VirtualiZarr/issues/287)
# and dmrpp deflateLevel (zlib compression level) is always present (https://github.com/OPENDAP/bes/issues/954)
for var in result.variables:
assert var in expected.variables
assert result[var].dims == expected[var].dims
assert result[var].shape == expected[var].shape
assert result[var].dtype == expected[var].dtype
assert result[var].data.manifest == expected[var].data.manifest
assert set(result.coords) == set(expected.coords)
assert result.attrs == expected.attrs
def test_open_virtual_dataset(granule):
# Simply check that the dmrpp can be found, parsed, and loaded. Actual parser result is checked in virtualizarr
vds = earthaccess.open_virtual_dataset(granule, load=False)
assert vds is not None
vds_load = earthaccess.open_virtual_dataset(granule, load=True)
assert vds_load is not None
35 changes: 23 additions & 12 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading