diff --git a/CHANGELOG.md b/CHANGELOG.md
index dcbe725d..97f801be 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Security
 
+## [2.4.0]
+### Added
+### Changed
+- [issue/142](https://github.com/podaac/l2ss-py/issues/142): Changed handling of duplicate dimensions as part of integration with new TEMPO ozone profile data.
+### Deprecated
+### Removed
+### Fixed
+- [issue/149](https://github.com/podaac/l2ss-py/issues/149): Fixed creation of netCDF4 object (string) variables by writing them as string data with compression disabled. Will need to be revisited once the netCDF4 library addresses https://github.com/Unidata/netcdf4-python/issues/1236
+- [issue/143](https://github.com/podaac/l2ss-py/issues/143): Fixed a bug that occurred when no variable subsetting was specified for grouped datasets.
+### Security
+
 ## [2.3.0]
 ### Added
 - [issue/126](https://github.com/podaac/l2ss-py/issues/126): Added flexibility to variable subsetting
diff --git a/cmr/ops_associations.txt b/cmr/ops_associations.txt
index ccd53765..64b81b11 100644
--- a/cmr/ops_associations.txt
+++ b/cmr/ops_associations.txt
@@ -1,8 +1,3 @@
-C1684065153-PODAAC
-C1684065156-PODAAC
-C1684065158-PODAAC
-C1693233348-PODAAC
-C1693233387-PODAAC
 C1940473819-POCLOUD
 C1940475563-POCLOUD
 C1940466008-POCLOUD
@@ -55,3 +50,26 @@ C2247621105-POCLOUD
 C2152045877-POCLOUD
 C1940471193-POCLOUD
 C2205121315-POCLOUD
+C2075141524-POCLOUD
+C2499940513-POCLOUD
+C2491735309-POCLOUD
+C2036881016-POCLOUD
+C2499940523-POCLOUD
+C2499940517-POCLOUD
+C2499940520-POCLOUD
+C1940470304-POCLOUD
+C1940472420-POCLOUD
+C2596983413-POCLOUD
+C2596986276-POCLOUD
+C2075141684-POCLOUD
+C1996881752-POCLOUD
+C2075141638-POCLOUD
+C2036882055-POCLOUD
+C2036880640-POCLOUD
+C2251464495-POCLOUD
+C2036882482-POCLOUD
+C2036882048-POCLOUD
+C2068529568-POCLOUD
+C2036882397-POCLOUD
+C2036882499-POCLOUD
+C2036882492-POCLOUD
diff --git a/cmr/uat_associations.txt b/cmr/uat_associations.txt
index dfb704f4..f267df81 100644
--- a/cmr/uat_associations.txt
+++ b/cmr/uat_associations.txt
@@ -39,3 +39,5 @@ C1240739713-POCLOUD
 C1244459498-POCLOUD
 C1242387621-POCLOUD
 C1238658389-POCLOUD
+C1244810554-POCLOUD
+C1256420925-POCLOUD
diff --git a/podaac/subsetter/dimension_cleanup.py b/podaac/subsetter/dimension_cleanup.py
index 9391dacc..9ea3ce5e 100644
--- a/podaac/subsetter/dimension_cleanup.py
+++ b/podaac/subsetter/dimension_cleanup.py
@@ -12,13 +12,11 @@
 Functions which improve upon existing netCDF4 library existing functions
 """
 import collections
-from typing import List, Tuple
 
 import netCDF4 as nc
-import xarray as xr
 
 
-def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]]:
+def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
     """
     xarray cannot read netCDF4 datasets with duplicate dimensions.
     Function goes through a dataset to catch any variables with duplicate dimensions.
@@ -28,55 +26,65 @@ def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]
     """
     dup_vars = {}
    dup_new_varnames = []
+
    for var_name, var in nc_dataset.variables.items():
        dim_list = list(var.dimensions)
        if len(set(dim_list)) != len(dim_list):  # get true if var.dimensions has a duplicate
            dup_vars[var_name] = var  # populate dictionary with variables with vars with dup dims
+
    for dup_var_name, dup_var in dup_vars.items():
-        dim_list = list(dup_var.dimensions)  # list of original dimensions of variable with dup dims
-        # get the dimensions that are duplicated
+        dim_list = list(dup_var.dimensions)  # original dimensions of the variable with duplicated dims
+
+        # Dimension(s) that are duplicated are retrieved.
+        # Note: this is not yet tested for more than one duplicated dimension.
        dim_dup = [item for item, count in collections.Counter(dim_list).items() if count > 1][0]
+        dim_dup_length = dup_var.shape[dup_var.dimensions.index(dim_dup)]  # length of the duplicated dimension
+
+        # New dimension and variable names are created.
        dim_dup_new = dim_dup+'_1'
        var_name_new = dup_var_name+'_1'
        dup_new_varnames.append(var_name_new)
-        # create new dimension by copying from the duplicated dimension
-
-        data = {}
-        fill_value = dup_var._FillValue  # pylint: disable=W0212
-        nc_dataset.createDimension(dim_dup_new, nc_dataset.variables[dim_dup].size)
-        data[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
-                                                      (dim_dup_new,), fill_value=fill_value)
-
-        for ncattr in nc_dataset.variables[dim_dup].ncattrs():
-            if ncattr != '_FillValue':
-                data[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
-        data[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]
-
+        # The last dimension for the variable is replaced with the new name in a temporary list.
        new_dim_list = dim_list[:-1]
        new_dim_list.extend([dim_dup_new])
-        # createVariable with new dimensions
-
-        data[var_name_new] = nc_dataset.createVariable(var_name_new, str(dup_var[:].dtype), tuple(new_dim_list), fill_value=fill_value)
+        new_dup_var = {}
+        attrs_contents = {}
+        # Attributes for the original variable are retrieved.
        for attrname in dup_var.ncattrs():
            if attrname != '_FillValue':
-                data[var_name_new].setncattr(attrname, nc_dataset.variables[dup_var_name].getncattr(attrname))
-        data[var_name_new][:] = nc_dataset.variables[dup_var_name][:]
-        del nc_dataset.variables[dup_var_name]
+                attrs_contents[attrname] = nc_dataset.variables[dup_var_name].getncattr(attrname)
 
-    # return the variables that will need to be renamed: Rename method is still an issue per https://github.com/Unidata/netcdf-c/issues/1672
-    return nc_dataset, dup_new_varnames
+        fill_value = dup_var._FillValue  # pylint: disable=W0212
 
+        # Only create a new *Dimension* if it doesn't already exist.
+        if dim_dup_new not in nc_dataset.dimensions.keys():
 
-def rename_dup_vars(dataset: xr.Dataset, rename_vars: List[str]) -> xr.Dataset:
-    """
-    NetCDF4 rename function raises and HDF error for variable in S5P files with duplicate dimensions
-    This method will use xarray to rename the variables
-    """
-    for i in rename_vars:
-        original_name = i[:-2]
-        dataset = dataset.rename({i: original_name})
+            # New dimension is created by copying from the duplicated dimension.
+            nc_dataset.createDimension(dim_dup_new, dim_dup_length)
+
+            # Only create a new dimension *Variable* if it existed originally in the NetCDF structure.
+            if dim_dup in nc_dataset.variables.keys():
+
+                # New variable object is created for the renamed, previously duplicated dimension.
+                new_dup_var[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
+                                                                     (dim_dup_new,), fill_value=fill_value)
+                # New variable's attributes are set to the original ones.
+                for ncattr in nc_dataset.variables[dim_dup].ncattrs():
+                    if ncattr != '_FillValue':
+                        new_dup_var[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
+                new_dup_var[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]
+
+        # Delete existing Variable
+        del nc_dataset.variables[dup_var_name]
+
+        # Replace original *Variable* with new variable with no duplicated dimensions.
+        new_dup_var[dup_var_name] = nc_dataset.createVariable(dup_var_name, str(dup_var[:].dtype),
+                                                              tuple(new_dim_list), fill_value=fill_value)
+        for attr_name, contents in attrs_contents.items():
+            new_dup_var[dup_var_name].setncattr(attr_name, contents)
+        new_dup_var[dup_var_name][:] = dup_var[:]
 
-    return dataset
+    return nc_dataset
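The duplicate-dimension detection that drives this rewrite is small enough to exercise on its own. Below is a minimal sketch, not part of the change set: the scratch file, 'layer' dimension, and 'covariance' variable are hypothetical, and the dataset is held in memory via diskless=True.

import collections

import netCDF4 as nc
import numpy as np

# Build a small dataset whose variable repeats a dimension, as in the
# TEMPO ozone profile products that motivated this change.
ds = nc.Dataset('scratch.nc', 'w', diskless=True)
ds.createDimension('layer', 3)
cov = ds.createVariable('covariance', 'f4', ('layer', 'layer'), fill_value=-9999.0)
cov[:] = np.eye(3, dtype='f4')

# Same check remove_duplicate_dims uses: a duplicate exists when the set of
# dimension names is smaller than the list of dimension names.
for name, var in ds.variables.items():
    dims = list(var.dimensions)
    if len(set(dims)) != len(dims):
        dup = [d for d, count in collections.Counter(dims).items() if count > 1][0]
        print(f"{name!r} repeats dimension {dup!r}")  # 'covariance' repeats 'layer'
ds.close()

After remove_duplicate_dims runs on such a dataset, the second occurrence of the repeated dimension becomes 'layer_1', which is what lets xarray open the file.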
diff --git a/podaac/subsetter/group_handling.py b/podaac/subsetter/group_handling.py
index 429e558a..20270384 100644
--- a/podaac/subsetter/group_handling.py
+++ b/podaac/subsetter/group_handling.py
@@ -175,8 +175,11 @@ def _rename_variables(dataset: xr.Dataset, base_dataset: nc.Dataset, start_date)
             var_attrs.pop('_FillValue', None)
             comp_args = {"zlib": True, "complevel": 1}
 
+            var_data = variable.data
             if variable.dtype == object:
-                var_group.createVariable(new_var_name, 'S1', var_dims, fill_value=fill_value, **comp_args)
+                comp_args = {"zlib": False, "complevel": 1}
+                var_group.createVariable(new_var_name, 'S4', var_dims, fill_value=fill_value, **comp_args)
+                var_data = np.array(variable.data)
             elif variable.dtype == 'timedelta64[ns]':
                 var_group.createVariable(new_var_name, 'i4', var_dims, fill_value=fill_value, **comp_args)
             else:
@@ -187,7 +190,7 @@ def _rename_variables(dataset: xr.Dataset, base_dataset: nc.Dataset, start_date)
 
         # Copy data
         var_group.variables[new_var_name].set_auto_maskandscale(False)
-        var_group.variables[new_var_name][:] = variable.data
+        var_group.variables[new_var_name][:] = var_data
 
 
 def h5file_transform(finput: str) -> Tuple[nc.Dataset, bool]:
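The group_handling.py change above is the workaround for issue/149: netCDF4 currently cannot apply zlib compression when creating variables from object (string) data (https://github.com/Unidata/netcdf4-python/issues/1236), so compression is disabled on that branch. A minimal sketch of the same dispatch follows; it assumes an in-memory file and a hypothetical 'time_utc' variable, and it uses the str (NC_STRING) datatype for simplicity where the production code uses 'S4'.

import netCDF4 as nc
import numpy as np

ds = nc.Dataset('scratch.nc', 'w', diskless=True)
ds.createDimension('obs', 2)

data = np.array(['2019-03-19T11:08:35Z', '2019-03-19T11:08:36Z'], dtype=object)
comp_args = {"zlib": True, "complevel": 1}  # default for numeric variables
if data.dtype == object:
    comp_args = {"zlib": False, "complevel": 1}  # no compression for string data

var = ds.createVariable('time_utc', str, ('obs',), **comp_args)
var[:] = np.array(data)  # same np.array(...) conversion as var_data above
ds.close()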
""" - nc_dataset, rename_vars, has_groups = open_as_nc_dataset(file_to_subset) + nc_dataset, has_groups = open_as_nc_dataset(file_to_subset) override_decode_cf_datetime() if has_groups: # Make sure all variables start with '/' - variables = ['/' + var if not var.startswith('/') else var for var in variables] + if variables: + variables = ['/' + var if not var.startswith('/') else var for var in variables] lat_var_names = ['/' + var if not var.startswith('/') else var for var in lat_var_names] lon_var_names = ['/' + var if not var.startswith('/') else var for var in lon_var_names] time_var_names = ['/' + var if not var.startswith('/') else var for var in time_var_names] # Replace all '/' with GROUP_DELIM - variables = [var.replace('/', GROUP_DELIM) for var in variables] + if variables: + variables = [var.replace('/', GROUP_DELIM) for var in variables] lat_var_names = [var.replace('/', GROUP_DELIM) for var in lat_var_names] lon_var_names = [var.replace('/', GROUP_DELIM) for var in lon_var_names] time_var_names = [var.replace('/', GROUP_DELIM) for var in time_var_names] @@ -1099,7 +1101,6 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str, xr.backends.NetCDF4DataStore(nc_dataset), **args ) as dataset: - dataset = dc.rename_dup_vars(dataset, rename_vars) lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names( dataset=dataset, lat_var_names=lat_var_names, diff --git a/pyproject.toml b/pyproject.toml index 4ac69f66..809e8f3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ [tool.poetry] name = "l2ss-py" -version = "2.3.0" +version = "2.4.0-rc.1" description = "L2 Subsetter Service" authors = ["podaac-tva "] license = "Apache-2.0" diff --git a/tests/data/TEMPO/TEMPO_NO2-PROXY_L2_V01_20130731T232959Z_S015G06_partial.nc b/tests/data/TEMPO/TEMPO_NO2-PROXY_L2_V01_20130731T232959Z_S015G06_partial.nc new file mode 100644 index 00000000..99c20829 Binary files /dev/null and b/tests/data/TEMPO/TEMPO_NO2-PROXY_L2_V01_20130731T232959Z_S015G06_partial.nc differ diff --git a/tests/data/TEMPO/TEMPO_O3PROF-PROXY_L2_V01_20130831T222959Z_S014G06.nc b/tests/data/TEMPO/TEMPO_O3PROF-PROXY_L2_V01_20130831T222959Z_S014G06.nc new file mode 100644 index 00000000..f99dad6f Binary files /dev/null and b/tests/data/TEMPO/TEMPO_O3PROF-PROXY_L2_V01_20130831T222959Z_S014G06.nc differ diff --git a/tests/data/tropomi/S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4 b/tests/data/tropomi/S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4 index b48656b8..f9fee799 100644 Binary files a/tests/data/tropomi/S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4 and b/tests/data/tropomi/S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4 differ diff --git a/tests/test_subset.py b/tests/test_subset.py index 6b1e726f..7a0c1605 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -183,7 +183,7 @@ def test_subset_bbox(test_file, data_dir, subset_output_dir, request): output_file=subset_output_file ) - out_ds, rename_vars, _ = subset.open_as_nc_dataset(subset_output_file) + out_ds, _ = subset.open_as_nc_dataset(subset_output_file) out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds), decode_times=False, decode_coords=False, @@ -549,7 +549,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request): bbox = np.array(((-180, 180), (-90, 90))) output_file = 
"{}_{}".format(request.node.name, test_file) - in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(data_dir, test_file)) + in_ds, _ = subset.open_as_nc_dataset(join(data_dir, test_file)) in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds), decode_times=False, decode_coords=False) @@ -575,7 +575,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request): variables=[var.replace(GROUP_DELIM, '/') for var in included_variables] ) - out_ds, rename_vars, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file)) + out_ds, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file)) out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds), decode_times=False, decode_coords=False) @@ -1226,7 +1226,7 @@ def test_get_time_variable_name(test_file, data_dir, subset_output_dir): 'mask_and_scale': False, 'decode_times': True } - ds, rename_vars, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file)) + ds, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file)) ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), **args) lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0] @@ -1330,6 +1330,34 @@ def test_duplicate_dims_tropomi(data_dir, subset_output_dir, request): assert variable.shape == \ out_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables[var_name].shape +def test_duplicate_dims_tempo_ozone(data_dir, subset_output_dir, request): + """ + Check if TEMPO Ozone files run successfully even though + these files have variables with duplicate dimensions + """ + TEMPO_dir = join(data_dir, 'TEMPO') + tempo_ozone_file = 'TEMPO_O3PROF-PROXY_L2_V01_20130831T222959Z_S014G06.nc' + + bbox = np.array(((-180, 180), (-90, 90))) + output_file = "{}_{}".format(request.node.name, tempo_ozone_file) + shutil.copyfile( + os.path.join(TEMPO_dir, tempo_ozone_file), + os.path.join(subset_output_dir, tempo_ozone_file) + ) + box_test = subset.subset( + file_to_subset=join(subset_output_dir, tempo_ozone_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file) + ) + # check if the box_test is + + in_nc = nc.Dataset(join(TEMPO_dir, tempo_ozone_file)) + out_nc = nc.Dataset(join(subset_output_dir, output_file)) + + for var_name, variable in in_nc.groups['support_data'].variables.items(): + assert variable.shape == \ + out_nc.groups['support_data'].variables[var_name].shape + def test_omi_novars_subset(data_dir, subset_output_dir, request): """ @@ -2012,6 +2040,29 @@ def test_var_subsetting_tropomi(data_dir, subset_output_dir, request): assert list(slash_dataset.groups['PRODUCT'].variables) == list(noslash_dataset.groups['PRODUCT'].variables) +def test_tropomi_utc_time(data_dir, subset_output_dir, request): + """Verify that the time UTC values are conserved in S5P files""" + trop_dir = join(data_dir, 'tropomi') + trop_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4' + variable = ['/PRODUCT/time_utc'] + bbox = np.array(((-180, 180), (-90, 90))) + output_file = "{}_{}".format(request.node.name, trop_file) + shutil.copyfile( + os.path.join(trop_dir, trop_file), + os.path.join(subset_output_dir, trop_file) + ) + subset.subset( + file_to_subset=join(subset_output_dir, trop_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file), + variables=variable + ) + + in_nc_dataset = nc.Dataset(join(trop_dir, trop_file)) + out_nc_dataset = nc.Dataset(join(subset_output_dir, output_file)) + + assert 
@@ -2012,6 +2040,29 @@ def test_var_subsetting_tropomi(data_dir, subset_output_dir, request):
     assert list(slash_dataset.groups['PRODUCT'].variables) == list(noslash_dataset.groups['PRODUCT'].variables)
 
 
+def test_tropomi_utc_time(data_dir, subset_output_dir, request):
+    """Verify that the UTC time values are preserved in S5P files"""
+    trop_dir = join(data_dir, 'tropomi')
+    trop_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4'
+    variable = ['/PRODUCT/time_utc']
+    bbox = np.array(((-180, 180), (-90, 90)))
+    output_file = "{}_{}".format(request.node.name, trop_file)
+    shutil.copyfile(
+        os.path.join(trop_dir, trop_file),
+        os.path.join(subset_output_dir, trop_file)
+    )
+    subset.subset(
+        file_to_subset=join(subset_output_dir, trop_file),
+        bbox=bbox,
+        output_file=join(subset_output_dir, output_file),
+        variables=variable
+    )
+
+    in_nc_dataset = nc.Dataset(join(trop_dir, trop_file))
+    out_nc_dataset = nc.Dataset(join(subset_output_dir, output_file))
+
+    assert in_nc_dataset.groups['PRODUCT'].variables['time_utc'][:].squeeze()[0] ==\
+        out_nc_dataset.groups['PRODUCT'].variables['time_utc'][:].squeeze()[0]
 
 def test_bad_time_unit(subset_output_dir):
     fill_val = -99999.0
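A note for downstream callers: open_as_nc_dataset no longer returns the list of variables to rename, since remove_duplicate_dims now rewrites duplicated dimensions in place rather than deferring an xarray rename. Updating a call site is a one-line change (sketch only; the filename is hypothetical):

from podaac.subsetter import subset

# 2.3.0: nc_dataset, rename_vars, has_groups = subset.open_as_nc_dataset('granule.nc')
nc_dataset, has_groups = subset.open_as_nc_dataset('granule.nc')  # 2.4.0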