Merge pull request #152 from podaac/release/2.4.0
Release/2.4.0
jamesfwood authored Mar 3, 2023
2 parents 88b6f7a + 58335a5 commit 978b8e2
Showing 11 changed files with 148 additions and 54 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -13,6 +13,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Security


## [2.4.0]
### Added
### Changed
- [issue/142](https://github.com/podaac/l2ss-py/issues/142): Changed handling of duplicate dimensions as part of integration with new TEMPO ozone profile data.
### Deprecated
### Removed
### Fixed
- [issue/149](https://github.com/podaac/l2ss-py/issues/149): Fixed creation of netCDF4 string variables from object-dtype data by disabling compression for those variables. Will need to revisit once the upstream netcdf4-python issue is resolved: https://github.com/Unidata/netcdf4-python/issues/1236
- [issue/143](https://github.com/podaac/l2ss-py/issues/143): Fixed bug when not specifying any variable subsetting for grouped datasets.
### Security

## [2.3.0]
### Added
- [issue/126](https://github.com/podaac/l2ss-py/issues/126): Added flexibility to variable subsetting
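Editor's note on the issue/142 entry above: netCDF4 allows a variable to list the same dimension twice, but xarray cannot read such a file (this is stated in the dimension_cleanup.py docstring further down), which is why the duplicate-dimension handling was reworked. A minimal sketch, not part of this commit; the file and variable names are made up for illustration:

```python
import netCDF4 as nc
import numpy as np
import xarray as xr

# Build a small file containing a variable dimensioned (layer, layer),
# similar in spirit to an averaging-kernel matrix in the TEMPO ozone profiles.
ds = nc.Dataset('dup_dim_example.nc', mode='w')
ds.createDimension('layer', 3)
var = ds.createVariable('averaging_kernel', 'f4', ('layer', 'layer'))
var[:] = np.eye(3, dtype='f4')
ds.close()

# Per the module docstring, xarray cannot read a variable whose dimensions repeat,
# so l2ss-py renames the second occurrence before handing the dataset to xarray.
try:
    xr.open_dataset('dup_dim_example.nc')
    print("opened without error")
except Exception as err:
    print(f"xarray could not open the file: {err}")
```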
28 changes: 23 additions & 5 deletions cmr/ops_associations.txt
@@ -1,8 +1,3 @@
C1684065153-PODAAC
C1684065156-PODAAC
C1684065158-PODAAC
C1693233348-PODAAC
C1693233387-PODAAC
C1940473819-POCLOUD
C1940475563-POCLOUD
C1940466008-POCLOUD
@@ -55,3 +50,26 @@ C2247621105-POCLOUD
C2152045877-POCLOUD
C1940471193-POCLOUD
C2205121315-POCLOUD
C2075141524-POCLOUD
C2499940513-POCLOUD
C2491735309-POCLOUD
C2036881016-POCLOUD
C2499940523-POCLOUD
C2499940517-POCLOUD
C2499940520-POCLOUD
C1940470304-POCLOUD
C1940472420-POCLOUD
C2596983413-POCLOUD
C2596986276-POCLOUD
C2075141684-POCLOUD
C1996881752-POCLOUD
C2075141638-POCLOUD
C2036882055-POCLOUD
C2036880640-POCLOUD
C2251464495-POCLOUD
C2036882482-POCLOUD
C2036882048-POCLOUD
C2068529568-POCLOUD
C2036882397-POCLOUD
C2036882499-POCLOUD
C2036882492-POCLOUD
2 changes: 2 additions & 0 deletions cmr/uat_associations.txt
@@ -39,3 +39,5 @@ C1240739713-POCLOUD
C1244459498-POCLOUD
C1242387621-POCLOUD
C1238658389-POCLOUD
C1244810554-POCLOUD
C1256420925-POCLOUD
78 changes: 43 additions & 35 deletions podaac/subsetter/dimension_cleanup.py
@@ -12,13 +12,11 @@
Functions which improve upon existing netCDF4 library functions
"""
import collections
from typing import List, Tuple

import netCDF4 as nc
import xarray as xr


def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]]:
def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
"""
xarray cannot read netCDF4 datasets with duplicate dimensions.
Function goes through a dataset to catch any variables with duplicate dimensions.
@@ -28,55 +26,65 @@ def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]
"""
dup_vars = {}
dup_new_varnames = []

for var_name, var in nc_dataset.variables.items():
dim_list = list(var.dimensions)
if len(set(dim_list)) != len(dim_list):  # true if var.dimensions contains a duplicate
dup_vars[var_name] = var  # collect variables that have duplicated dimensions

for dup_var_name, dup_var in dup_vars.items():
dim_list = list(dup_var.dimensions) # list of original dimensions of variable with dup dims
# get the dimensions that are duplicated
dim_list = list(dup_var.dimensions) # original dimensions of the variable with duplicated dims

# Dimension(s) that are duplicated are retrieved.
# Note: this is not yet tested for more than one duplicated dimension.
dim_dup = [item for item, count in collections.Counter(dim_list).items() if count > 1][0]
dim_dup_length = dup_var.shape[dup_var.dimensions.index(dim_dup)] # length of the duplicated dimension

# New dimension and variable names are created.
dim_dup_new = dim_dup+'_1'
var_name_new = dup_var_name+'_1'
dup_new_varnames.append(var_name_new)

# create new dimension by copying from the duplicated dimension

data = {}
fill_value = dup_var._FillValue # pylint: disable=W0212
nc_dataset.createDimension(dim_dup_new, nc_dataset.variables[dim_dup].size)
data[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
(dim_dup_new,), fill_value=fill_value)

for ncattr in nc_dataset.variables[dim_dup].ncattrs():
if ncattr != '_FillValue':
data[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
data[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]

# The last dimension for the variable is replaced with the new name in a temporary list.
new_dim_list = dim_list[:-1]
new_dim_list.extend([dim_dup_new])

# createVariable with new dimensions

data[var_name_new] = nc_dataset.createVariable(var_name_new, str(dup_var[:].dtype), tuple(new_dim_list), fill_value=fill_value)
new_dup_var = {}
attrs_contents = {}

# Attributes for the original variable are retrieved.
for attrname in dup_var.ncattrs():
if attrname != '_FillValue':
data[var_name_new].setncattr(attrname, nc_dataset.variables[dup_var_name].getncattr(attrname))
data[var_name_new][:] = nc_dataset.variables[dup_var_name][:]
del nc_dataset.variables[dup_var_name]
attrs_contents[attrname] = nc_dataset.variables[dup_var_name].getncattr(attrname)

# return the variables that will need to be renamed: Rename method is still an issue per https://github.com/Unidata/netcdf-c/issues/1672
return nc_dataset, dup_new_varnames
fill_value = dup_var._FillValue # pylint: disable=W0212

# Only create a new *Dimension* if it doesn't already exist.
if dim_dup_new not in nc_dataset.dimensions.keys():

def rename_dup_vars(dataset: xr.Dataset, rename_vars: List[str]) -> xr.Dataset:
"""
NetCDF4 rename function raises an HDF error for variables in S5P files with duplicate dimensions
This method will use xarray to rename the variables
"""
for i in rename_vars:
original_name = i[:-2]
dataset = dataset.rename({i: original_name})
# New dimension is created by copying from the duplicated dimension.
nc_dataset.createDimension(dim_dup_new, dim_dup_length)

# Only create a new dimension *Variable* if it existed originally in the NetCDF structure.
if dim_dup in nc_dataset.variables.keys():

# New variable object is created for the renamed, previously duplicated dimension.
new_dup_var[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
(dim_dup_new,), fill_value=fill_value)
# New variable's attributes are set to the original ones.
for ncattr in nc_dataset.variables[dim_dup].ncattrs():
if ncattr != '_FillValue':
new_dup_var[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
new_dup_var[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]

# Delete existing Variable
del nc_dataset.variables[dup_var_name]

# Replace original *Variable* with new variable with no duplicated dimensions.
new_dup_var[dup_var_name] = nc_dataset.createVariable(dup_var_name, str(dup_var[:].dtype),
tuple(new_dim_list), fill_value=fill_value)
for attr_name, contents in attrs_contents.items():
new_dup_var[dup_var_name].setncattr(attr_name, contents)
new_dup_var[dup_var_name][:] = dup_var[:]

return dataset
return nc_dataset
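A minimal, self-contained sketch of the detection step used above, for readers skimming the diff. It only illustrates how collections.Counter flags a repeated dimension and what the renamed dimension looks like; the real remove_duplicate_dims also recreates the dimension, copies attributes, and rebuilds the variable, and, as noted in the code, it is not yet tested for more than one duplicated dimension. Names here are illustrative.

```python
import collections

import netCDF4 as nc
import numpy as np

# In-memory (diskless) example dataset with one variable that repeats a dimension.
ds = nc.Dataset('detect_example.nc', mode='w', diskless=True)
ds.createDimension('layer', 4)
kernel = ds.createVariable('kernel', 'f4', ('layer', 'layer'))
kernel[:] = np.zeros((4, 4), dtype='f4')

for var_name, var in ds.variables.items():
    dim_list = list(var.dimensions)
    if len(set(dim_list)) != len(dim_list):
        # Same Counter idiom as remove_duplicate_dims: keep dims seen more than once.
        dim_dup = [d for d, count in collections.Counter(dim_list).items() if count > 1][0]
        dim_dup_new = dim_dup + '_1'
        print(f"{var_name}: dimension '{dim_dup}' is duplicated, "
              f"second occurrence would become '{dim_dup_new}'")

ds.close()
```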
7 changes: 5 additions & 2 deletions podaac/subsetter/group_handling.py
@@ -175,8 +175,11 @@ def _rename_variables(dataset: xr.Dataset, base_dataset: nc.Dataset, start_date)
var_attrs.pop('_FillValue', None)
comp_args = {"zlib": True, "complevel": 1}

var_data = variable.data
if variable.dtype == object:
var_group.createVariable(new_var_name, 'S1', var_dims, fill_value=fill_value, **comp_args)
comp_args = {"zlib": False, "complevel": 1}
var_group.createVariable(new_var_name, 'S4', var_dims, fill_value=fill_value, **comp_args)
var_data = np.array(variable.data)
elif variable.dtype == 'timedelta64[ns]':
var_group.createVariable(new_var_name, 'i4', var_dims, fill_value=fill_value, **comp_args)
else:
@@ -187,7 +190,7 @@ def _rename_variables(dataset: xr.Dataset, base_dataset: nc.Dataset, start_date)

# Copy data
var_group.variables[new_var_name].set_auto_maskandscale(False)
var_group.variables[new_var_name][:] = variable.data
var_group.variables[new_var_name][:] = var_data


def h5file_transform(finput: str) -> Tuple[nc.Dataset, bool]:
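On the object-dtype branch added above (issue/149): object arrays such as time_utc strings currently cannot be written with zlib compression (see the linked netcdf4-python issue), so the commit writes them as fixed-width string variables with compression turned off. Below is a minimal sketch of the same idea using the documented 'S1' + stringtochar pattern rather than the 'S4' shorthand the module uses; the dataset, dimension, and variable names are made up for the example.

```python
import netCDF4 as nc
import numpy as np

ds = nc.Dataset('string_var_example.nc', mode='w', diskless=True)
ds.createDimension('obs', 3)
ds.createDimension('strlen', 20)

# Object-dtype data (e.g. ISO timestamps) converted to a fixed-width byte array first.
values = np.array(['2023-03-03T00:00:00Z', '2023-03-03T00:00:01Z',
                   '2023-03-03T00:00:02Z'], dtype='S20')

comp_args = {"zlib": False, "complevel": 1}   # compression disabled for character data
label = ds.createVariable('time_utc', 'S1', ('obs', 'strlen'), **comp_args)
label.set_auto_maskandscale(False)
label[:] = nc.stringtochar(values)

print(nc.chartostring(ds.variables['time_utc'][:]))
ds.close()
```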
15 changes: 8 additions & 7 deletions podaac/subsetter/subset.py
@@ -970,7 +970,7 @@ def convert_to_datetime(dataset: xr.Dataset, time_vars: list) -> Tuple[xr.Datase
return dataset, start_date


def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]:
def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, bool]:
"""Open netcdf file, and flatten groups if they exist."""
file_extension = filepath.split('.')[-1]

@@ -985,9 +985,9 @@ def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]:
if has_groups:
nc_dataset = transform_grouped_dataset(nc_dataset, filepath)

nc_dataset, rename_vars = dc.remove_duplicate_dims(nc_dataset)
nc_dataset = dc.remove_duplicate_dims(nc_dataset)

return nc_dataset, rename_vars, has_groups
return nc_dataset, has_groups


def override_decode_cf_datetime() -> None:
@@ -1071,18 +1071,20 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
than one value in the case where there are multiple groups and
different coordinate variables for each group.
"""
nc_dataset, rename_vars, has_groups = open_as_nc_dataset(file_to_subset)
nc_dataset, has_groups = open_as_nc_dataset(file_to_subset)

override_decode_cf_datetime()

if has_groups:
# Make sure all variables start with '/'
variables = ['/' + var if not var.startswith('/') else var for var in variables]
if variables:
variables = ['/' + var if not var.startswith('/') else var for var in variables]
lat_var_names = ['/' + var if not var.startswith('/') else var for var in lat_var_names]
lon_var_names = ['/' + var if not var.startswith('/') else var for var in lon_var_names]
time_var_names = ['/' + var if not var.startswith('/') else var for var in time_var_names]
# Replace all '/' with GROUP_DELIM
variables = [var.replace('/', GROUP_DELIM) for var in variables]
if variables:
variables = [var.replace('/', GROUP_DELIM) for var in variables]
lat_var_names = [var.replace('/', GROUP_DELIM) for var in lat_var_names]
lon_var_names = [var.replace('/', GROUP_DELIM) for var in lon_var_names]
time_var_names = [var.replace('/', GROUP_DELIM) for var in time_var_names]
@@ -1099,7 +1101,6 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
xr.backends.NetCDF4DataStore(nc_dataset),
**args
) as dataset:
dataset = dc.rename_dup_vars(dataset, rename_vars)
lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names(
dataset=dataset,
lat_var_names=lat_var_names,
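The subset.py change for issue/143 only runs the name-normalisation comprehensions when a variable list was actually supplied, so calling subset() on a grouped dataset without any variable subsetting no longer fails. A stand-alone sketch of that guard; GROUP_DELIM is assumed to be '__' here purely for illustration, and the helper name is not part of the package:

```python
GROUP_DELIM = '__'   # assumed value for this sketch

def normalize_variable_names(variables):
    """Prefix each name with '/' and swap '/' for GROUP_DELIM, but only if names were given."""
    if variables:
        variables = ['/' + var if not var.startswith('/') else var for var in variables]
        variables = [var.replace('/', GROUP_DELIM) for var in variables]
    return variables

print(normalize_variable_names(['PRODUCT/time_utc']))  # ['__PRODUCT__time_utc']
print(normalize_variable_names([]))                    # []   -- no-op instead of an error
print(normalize_variable_names(None))                  # None -- previously this path could raise a TypeError
```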
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@

[tool.poetry]
name = "l2ss-py"
version = "2.3.0"
version = "2.4.0-rc.1"
description = "L2 Subsetter Service"
authors = ["podaac-tva <[email protected]>"]
license = "Apache-2.0"
Binary file not shown.
Binary file not shown.
Binary file not shown.
59 changes: 55 additions & 4 deletions tests/test_subset.py
@@ -183,7 +183,7 @@ def test_subset_bbox(test_file, data_dir, subset_output_dir, request):
output_file=subset_output_file
)

out_ds, rename_vars, _ = subset.open_as_nc_dataset(subset_output_file)
out_ds, _ = subset.open_as_nc_dataset(subset_output_file)
out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds),
decode_times=False,
decode_coords=False,
@@ -549,7 +549,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request):
bbox = np.array(((-180, 180), (-90, 90)))
output_file = "{}_{}".format(request.node.name, test_file)

in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(data_dir, test_file))
in_ds, _ = subset.open_as_nc_dataset(join(data_dir, test_file))
in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds),
decode_times=False,
decode_coords=False)
@@ -575,7 +575,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request):
variables=[var.replace(GROUP_DELIM, '/') for var in included_variables]
)

out_ds, rename_vars, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file))
out_ds, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file))
out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds),
decode_times=False,
decode_coords=False)
@@ -1226,7 +1226,7 @@ def test_get_time_variable_name(test_file, data_dir, subset_output_dir):
'mask_and_scale': False,
'decode_times': True
}
ds, rename_vars, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file))
ds, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file))
ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), **args)

lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0]
@@ -1330,6 +1330,34 @@ def test_duplicate_dims_tropomi(data_dir, subset_output_dir, request):
assert variable.shape == \
out_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables[var_name].shape

def test_duplicate_dims_tempo_ozone(data_dir, subset_output_dir, request):
"""
Check if TEMPO Ozone files run successfully even though
these files have variables with duplicate dimensions
"""
TEMPO_dir = join(data_dir, 'TEMPO')
tempo_ozone_file = 'TEMPO_O3PROF-PROXY_L2_V01_20130831T222959Z_S014G06.nc'

bbox = np.array(((-180, 180), (-90, 90)))
output_file = "{}_{}".format(request.node.name, tempo_ozone_file)
shutil.copyfile(
os.path.join(TEMPO_dir, tempo_ozone_file),
os.path.join(subset_output_dir, tempo_ozone_file)
)
box_test = subset.subset(
file_to_subset=join(subset_output_dir, tempo_ozone_file),
bbox=bbox,
output_file=join(subset_output_dir, output_file)
)
# box_test holds the spatial bounds returned by subset.subset

in_nc = nc.Dataset(join(TEMPO_dir, tempo_ozone_file))
out_nc = nc.Dataset(join(subset_output_dir, output_file))

for var_name, variable in in_nc.groups['support_data'].variables.items():
assert variable.shape == \
out_nc.groups['support_data'].variables[var_name].shape


def test_omi_novars_subset(data_dir, subset_output_dir, request):
"""
@@ -2012,6 +2040,29 @@ def test_var_subsetting_tropomi(data_dir, subset_output_dir, request):

assert list(slash_dataset.groups['PRODUCT'].variables) == list(noslash_dataset.groups['PRODUCT'].variables)

def test_tropomi_utc_time(data_dir, subset_output_dir, request):
"""Verify that the time UTC values are conserved in S5P files"""
trop_dir = join(data_dir, 'tropomi')
trop_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4'
variable = ['/PRODUCT/time_utc']
bbox = np.array(((-180, 180), (-90, 90)))
output_file = "{}_{}".format(request.node.name, trop_file)
shutil.copyfile(
os.path.join(trop_dir, trop_file),
os.path.join(subset_output_dir, trop_file)
)
subset.subset(
file_to_subset=join(subset_output_dir, trop_file),
bbox=bbox,
output_file=join(subset_output_dir, output_file),
variables=variable
)

in_nc_dataset = nc.Dataset(join(trop_dir, trop_file))
out_nc_dataset = nc.Dataset(join(subset_output_dir, output_file))

assert in_nc_dataset.groups['PRODUCT'].variables['time_utc'][:].squeeze()[0] ==\
out_nc_dataset.groups['PRODUCT'].variables['time_utc'][:].squeeze()[0]

def test_bad_time_unit(subset_output_dir):
fill_val = -99999.0
