Merge pull request #152 from podaac/release/2.4.0
Release/2.4.0
jamesfwood authored Mar 3, 2023
2 parents 88b6f7a + 58335a5 commit 978b8e2
Showing 11 changed files with 148 additions and 54 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -13,6 +13,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Security


## [2.4.0]
### Added
### Changed
- [issue/142](https://github.com/podaac/l2ss-py/issues/142): Changed handling of duplicate dimensions as part of integration with new TEMPO ozone profile data.
### Deprecated
### Removed
### Fixed
- [issue/149](https://github.com/podaac/l2ss-py/issues/149): Fixed creation of netCDF4 string variables from object-dtype data by disabling compression for those variables. Will need to revisit once the upstream netcdf4-python issue is resolved: https://github.com/Unidata/netcdf4-python/issues/1236
- [issue/143](https://github.com/podaac/l2ss-py/issues/143): Fixed bug when not specifying any variable subsetting for grouped datasets.
### Security

## [2.3.0]
### Added
- [issue/126](https://github.com/podaac/l2ss-py/issues/126): Added flexibility to variable subsetting
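Editor's note on the issue/142 entry above: netCDF4 allows a variable to list the same dimension twice, but xarray cannot read such a file (this is stated in the dimension_cleanup.py docstring further down), which is why the duplicate-dimension handling was reworked. A minimal sketch, not part of this commit; the file and variable names are made up for illustration:

```python
import netCDF4 as nc
import numpy as np
import xarray as xr

# Build a small file containing a variable dimensioned (layer, layer),
# similar in spirit to an averaging-kernel matrix in the TEMPO ozone profiles.
ds = nc.Dataset('dup_dim_example.nc', mode='w')
ds.createDimension('layer', 3)
var = ds.createVariable('averaging_kernel', 'f4', ('layer', 'layer'))
var[:] = np.eye(3, dtype='f4')
ds.close()

# Per the module docstring, xarray cannot read a variable whose dimensions repeat,
# so l2ss-py renames the second occurrence before handing the dataset to xarray.
try:
    xr.open_dataset('dup_dim_example.nc')
    print("opened without error")
except Exception as err:
    print(f"xarray could not open the file: {err}")
```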
28 changes: 23 additions & 5 deletions cmr/ops_associations.txt
@@ -1,8 +1,3 @@
C1684065153-PODAAC
C1684065156-PODAAC
C1684065158-PODAAC
C1693233348-PODAAC
C1693233387-PODAAC
C1940473819-POCLOUD
C1940475563-POCLOUD
C1940466008-POCLOUD
@@ -55,3 +50,26 @@ C2247621105-POCLOUD
C2152045877-POCLOUD
C1940471193-POCLOUD
C2205121315-POCLOUD
C2075141524-POCLOUD
C2499940513-POCLOUD
C2491735309-POCLOUD
C2036881016-POCLOUD
C2499940523-POCLOUD
C2499940517-POCLOUD
C2499940520-POCLOUD
C1940470304-POCLOUD
C1940472420-POCLOUD
C2596983413-POCLOUD
C2596986276-POCLOUD
C2075141684-POCLOUD
C1996881752-POCLOUD
C2075141638-POCLOUD
C2036882055-POCLOUD
C2036880640-POCLOUD
C2251464495-POCLOUD
C2036882482-POCLOUD
C2036882048-POCLOUD
C2068529568-POCLOUD
C2036882397-POCLOUD
C2036882499-POCLOUD
C2036882492-POCLOUD
2 changes: 2 additions & 0 deletions cmr/uat_associations.txt
@@ -39,3 +39,5 @@ C1240739713-POCLOUD
C1244459498-POCLOUD
C1242387621-POCLOUD
C1238658389-POCLOUD
C1244810554-POCLOUD
C1256420925-POCLOUD
78 changes: 43 additions & 35 deletions podaac/subsetter/dimension_cleanup.py
@@ -12,13 +12,11 @@
Functions which improve upon existing netCDF4 library functions
"""
import collections
from typing import List, Tuple

import netCDF4 as nc
import xarray as xr


def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]]:
def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
"""
xarray cannot read netCDF4 datasets with duplicate dimensions.
Function goes through a dataset to catch any variables with duplicate dimensions.
@@ -28,55 +26,65 @@ def remove_duplicate_dims(nc_dataset: nc.Dataset) -> Tuple[nc.Dataset, List[str]
"""
dup_vars = {}
dup_new_varnames = []

for var_name, var in nc_dataset.variables.items():
dim_list = list(var.dimensions)
if len(set(dim_list)) != len(dim_list):  # true if var.dimensions contains a duplicate
dup_vars[var_name] = var  # collect variables that have duplicated dimensions

for dup_var_name, dup_var in dup_vars.items():
dim_list = list(dup_var.dimensions) # list of original dimensions of variable with dup dims
# get the dimensions that are duplicated
dim_list = list(dup_var.dimensions) # original dimensions of the variable with duplicated dims

# Dimension(s) that are duplicated are retrieved.
# Note: this is not yet tested for more than one duplicated dimension.
dim_dup = [item for item, count in collections.Counter(dim_list).items() if count > 1][0]
dim_dup_length = dup_var.shape[dup_var.dimensions.index(dim_dup)] # length of the duplicated dimension

# New dimension and variable names are created.
dim_dup_new = dim_dup+'_1'
var_name_new = dup_var_name+'_1'
dup_new_varnames.append(var_name_new)

# create new dimension by copying from the duplicated dimension

data = {}
fill_value = dup_var._FillValue # pylint: disable=W0212
nc_dataset.createDimension(dim_dup_new, nc_dataset.variables[dim_dup].size)
data[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
(dim_dup_new,), fill_value=fill_value)

for ncattr in nc_dataset.variables[dim_dup].ncattrs():
if ncattr != '_FillValue':
data[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
data[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]

# The last dimension for the variable is replaced with the new name in a temporary list.
new_dim_list = dim_list[:-1]
new_dim_list.extend([dim_dup_new])

# createVariable with new dimensions

data[var_name_new] = nc_dataset.createVariable(var_name_new, str(dup_var[:].dtype), tuple(new_dim_list), fill_value=fill_value)
new_dup_var = {}
attrs_contents = {}

# Attributes for the original variable are retrieved.
for attrname in dup_var.ncattrs():
if attrname != '_FillValue':
data[var_name_new].setncattr(attrname, nc_dataset.variables[dup_var_name].getncattr(attrname))
data[var_name_new][:] = nc_dataset.variables[dup_var_name][:]
del nc_dataset.variables[dup_var_name]
attrs_contents[attrname] = nc_dataset.variables[dup_var_name].getncattr(attrname)

# return the variables that will need to be renamed: Rename method is still an issue per https://github.com/Unidata/netcdf-c/issues/1672
return nc_dataset, dup_new_varnames
fill_value = dup_var._FillValue # pylint: disable=W0212

# Only create a new *Dimension* if it doesn't already exist.
if dim_dup_new not in nc_dataset.dimensions.keys():

def rename_dup_vars(dataset: xr.Dataset, rename_vars: List[str]) -> xr.Dataset:
"""
NetCDF4 rename function raises an HDF error for variables in S5P files with duplicate dimensions
This method will use xarray to rename the variables
"""
for i in rename_vars:
original_name = i[:-2]
dataset = dataset.rename({i: original_name})
# New dimension is created by copying from the duplicated dimension.
nc_dataset.createDimension(dim_dup_new, dim_dup_length)

# Only create a new dimension *Variable* if it existed originally in the NetCDF structure.
if dim_dup in nc_dataset.variables.keys():

# New variable object is created for the renamed, previously duplicated dimension.
new_dup_var[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
(dim_dup_new,), fill_value=fill_value)
# New variable's attributes are set to the original ones.
for ncattr in nc_dataset.variables[dim_dup].ncattrs():
if ncattr != '_FillValue':
new_dup_var[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
new_dup_var[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]

# Delete existing Variable
del nc_dataset.variables[dup_var_name]

# Replace original *Variable* with new variable with no duplicated dimensions.
new_dup_var[dup_var_name] = nc_dataset.createVariable(dup_var_name, str(dup_var[:].dtype),
tuple(new_dim_list), fill_value=fill_value)
for attr_name, contents in attrs_contents.items():
new_dup_var[dup_var_name].setncattr(attr_name, contents)
new_dup_var[dup_var_name][:] = dup_var[:]

return dataset
return nc_dataset
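A minimal, self-contained sketch of the detection step used above, for readers skimming the diff. It only illustrates how collections.Counter flags a repeated dimension and what the renamed dimension looks like; the real remove_duplicate_dims also recreates the dimension, copies attributes, and rebuilds the variable, and, as noted in the code, it is not yet tested for more than one duplicated dimension. Names here are illustrative.

```python
import collections

import netCDF4 as nc
import numpy as np

# In-memory (diskless) example dataset with one variable that repeats a dimension.
ds = nc.Dataset('detect_example.nc', mode='w', diskless=True)
ds.createDimension('layer', 4)
kernel = ds.createVariable('kernel', 'f4', ('layer', 'layer'))
kernel[:] = np.zeros((4, 4), dtype='f4')

for var_name, var in ds.variables.items():
    dim_list = list(var.dimensions)
    if len(set(dim_list)) != len(dim_list):
        # Same Counter idiom as remove_duplicate_dims: keep dims seen more than once.
        dim_dup = [d for d, count in collections.Counter(dim_list).items() if count > 1][0]
        dim_dup_new = dim_dup + '_1'
        print(f"{var_name}: dimension '{dim_dup}' is duplicated, "
              f"second occurrence would become '{dim_dup_new}'")

ds.close()
```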
7 changes: 5 additions & 2 deletions podaac/subsetter/group_handling.py
@@ -175,8 +175,11 @@ def _rename_variables(dataset: xr.Dataset, base_dataset: nc.Dataset, start_date)
var_attrs.pop('_FillValue', None)
comp_args = {"zlib": True, "complevel": 1}

var_data = variable.data
if variable.dtype == object:
var_group.createVariable(new_var_name, 'S1', var_dims, fill_value=fill_value, **comp_args)
comp_args = {"zlib": False, "complevel": 1}
var_group.createVariable(new_var_name, 'S4', var_dims, fill_value=fill_value, **comp_args)
var_data = np.array(variable.data)
elif variable.dtype == 'timedelta64[ns]':
var_group.createVariable(new_var_name, 'i4', var_dims, fill_value=fill_value, **comp_args)
else:
@@ -187,7 +190,7 @@ def _rename_variables(dataset: xr.Dataset, base_dataset: nc.Dataset, start_date)

# Copy data
var_group.variables[new_var_name].set_auto_maskandscale(False)
var_group.variables[new_var_name][:] = variable.data
var_group.variables[new_var_name][:] = var_data


def h5file_transform(finput: str) -> Tuple[nc.Dataset, bool]:
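On the object-dtype branch added above (issue/149): object arrays such as time_utc strings currently cannot be written with zlib compression (see the linked netcdf4-python issue), so the commit writes them as fixed-width string variables with compression turned off. Below is a minimal sketch of the same idea using the documented 'S1' + stringtochar pattern rather than the 'S4' shorthand the module uses; the dataset, dimension, and variable names are made up for the example.

```python
import netCDF4 as nc
import numpy as np

ds = nc.Dataset('string_var_example.nc', mode='w', diskless=True)
ds.createDimension('obs', 3)
ds.createDimension('strlen', 20)

# Object-dtype data (e.g. ISO timestamps) converted to a fixed-width byte array first.
values = np.array(['2023-03-03T00:00:00Z', '2023-03-03T00:00:01Z',
                   '2023-03-03T00:00:02Z'], dtype='S20')

comp_args = {"zlib": False, "complevel": 1}   # compression disabled for character data
label = ds.createVariable('time_utc', 'S1', ('obs', 'strlen'), **comp_args)
label.set_auto_maskandscale(False)
label[:] = nc.stringtochar(values)

print(nc.chartostring(ds.variables['time_utc'][:]))
ds.close()
```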
15 changes: 8 additions & 7 deletions podaac/subsetter/subset.py
@@ -970,7 +970,7 @@ def convert_to_datetime(dataset: xr.Dataset, time_vars: list) -> Tuple[xr.Datase
return dataset, start_date


def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]:
def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, bool]:
"""Open netcdf file, and flatten groups if they exist."""
file_extension = filepath.split('.')[-1]

@@ -985,9 +985,9 @@ def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]:
if has_groups:
nc_dataset = transform_grouped_dataset(nc_dataset, filepath)

nc_dataset, rename_vars = dc.remove_duplicate_dims(nc_dataset)
nc_dataset = dc.remove_duplicate_dims(nc_dataset)

return nc_dataset, rename_vars, has_groups
return nc_dataset, has_groups


def override_decode_cf_datetime() -> None:
@@ -1071,18 +1071,20 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
than one value in the case where there are multiple groups and
different coordinate variables for each group.
"""
nc_dataset, rename_vars, has_groups = open_as_nc_dataset(file_to_subset)
nc_dataset, has_groups = open_as_nc_dataset(file_to_subset)

override_decode_cf_datetime()

if has_groups:
# Make sure all variables start with '/'
variables = ['/' + var if not var.startswith('/') else var for var in variables]
if variables:
variables = ['/' + var if not var.startswith('/') else var for var in variables]
lat_var_names = ['/' + var if not var.startswith('/') else var for var in lat_var_names]
lon_var_names = ['/' + var if not var.startswith('/') else var for var in lon_var_names]
time_var_names = ['/' + var if not var.startswith('/') else var for var in time_var_names]
# Replace all '/' with GROUP_DELIM
variables = [var.replace('/', GROUP_DELIM) for var in variables]
if variables:
variables = [var.replace('/', GROUP_DELIM) for var in variables]
lat_var_names = [var.replace('/', GROUP_DELIM) for var in lat_var_names]
lon_var_names = [var.replace('/', GROUP_DELIM) for var in lon_var_names]
time_var_names = [var.replace('/', GROUP_DELIM) for var in time_var_names]
@@ -1099,7 +1101,6 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
xr.backends.NetCDF4DataStore(nc_dataset),
**args
) as dataset:
dataset = dc.rename_dup_vars(dataset, rename_vars)
lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names(
dataset=dataset,
lat_var_names=lat_var_names,
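The subset.py change for issue/143 only runs the name-normalisation comprehensions when a variable list was actually supplied, so calling subset() on a grouped dataset without any variable subsetting no longer fails. A stand-alone sketch of that guard; GROUP_DELIM is assumed to be '__' here purely for illustration, and the helper name is not part of the package:

```python
GROUP_DELIM = '__'   # assumed value for this sketch

def normalize_variable_names(variables):
    """Prefix each name with '/' and swap '/' for GROUP_DELIM, but only if names were given."""
    if variables:
        variables = ['/' + var if not var.startswith('/') else var for var in variables]
        variables = [var.replace('/', GROUP_DELIM) for var in variables]
    return variables

print(normalize_variable_names(['PRODUCT/time_utc']))  # ['__PRODUCT__time_utc']
print(normalize_variable_names([]))                    # []   -- no-op instead of an error
print(normalize_variable_names(None))                  # None -- previously this path could raise a TypeError
```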
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@

[tool.poetry]
name = "l2ss-py"
version = "2.3.0"
version = "2.4.0-rc.1"
description = "L2 Subsetter Service"
authors = ["podaac-tva <[email protected]>"]
license = "Apache-2.0"
Binary file not shown.
Binary file not shown.
Binary file not shown.
59 changes: 55 additions & 4 deletions tests/test_subset.py
@@ -183,7 +183,7 @@ def test_subset_bbox(test_file, data_dir, subset_output_dir, request):
output_file=subset_output_file
)

out_ds, rename_vars, _ = subset.open_as_nc_dataset(subset_output_file)
out_ds, _ = subset.open_as_nc_dataset(subset_output_file)
out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds),
decode_times=False,
decode_coords=False,
@@ -549,7 +549,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request):
bbox = np.array(((-180, 180), (-90, 90)))
output_file = "{}_{}".format(request.node.name, test_file)

in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(data_dir, test_file))
in_ds, _ = subset.open_as_nc_dataset(join(data_dir, test_file))
in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds),
decode_times=False,
decode_coords=False)
@@ -575,7 +575,7 @@ def test_specified_variables(test_file, data_dir, subset_output_dir, request):
variables=[var.replace(GROUP_DELIM, '/') for var in included_variables]
)

out_ds, rename_vars, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file))
out_ds, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file))
out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds),
decode_times=False,
decode_coords=False)
@@ -1226,7 +1226,7 @@ def test_get_time_variable_name(test_file, data_dir, subset_output_dir):
'mask_and_scale': False,
'decode_times': True
}
ds, rename_vars, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file))
ds, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file))
ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), **args)

lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0]
@@ -1330,6 +1330,34 @@ def test_duplicate_dims_tropomi(data_dir, subset_output_dir, request):
assert variable.shape == \
out_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables[var_name].shape

def test_duplicate_dims_tempo_ozone(data_dir, subset_output_dir, request):
"""
Check if TEMPO Ozone files run successfully even though
these files have variables with duplicate dimensions
"""
TEMPO_dir = join(data_dir, 'TEMPO')
tempo_ozone_file = 'TEMPO_O3PROF-PROXY_L2_V01_20130831T222959Z_S014G06.nc'

bbox = np.array(((-180, 180), (-90, 90)))
output_file = "{}_{}".format(request.node.name, tempo_ozone_file)
shutil.copyfile(
os.path.join(TEMPO_dir, tempo_ozone_file),
os.path.join(subset_output_dir, tempo_ozone_file)
)
box_test = subset.subset(
file_to_subset=join(subset_output_dir, tempo_ozone_file),
bbox=bbox,
output_file=join(subset_output_dir, output_file)
)
# box_test holds the spatial bounds returned by subset.subset

in_nc = nc.Dataset(join(TEMPO_dir, tempo_ozone_file))
out_nc = nc.Dataset(join(subset_output_dir, output_file))

for var_name, variable in in_nc.groups['support_data'].variables.items():
assert variable.shape == \
out_nc.groups['support_data'].variables[var_name].shape


def test_omi_novars_subset(data_dir, subset_output_dir, request):
"""
@@ -2012,6 +2040,29 @@ def test_var_subsetting_tropomi(data_dir, subset_output_dir, request):

assert list(slash_dataset.groups['PRODUCT'].variables) == list(noslash_dataset.groups['PRODUCT'].variables)

def test_tropomi_utc_time(data_dir, subset_output_dir, request):
"""Verify that the time UTC values are conserved in S5P files"""
trop_dir = join(data_dir, 'tropomi')
trop_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4'
variable = ['/PRODUCT/time_utc']
bbox = np.array(((-180, 180), (-90, 90)))
output_file = "{}_{}".format(request.node.name, trop_file)
shutil.copyfile(
os.path.join(trop_dir, trop_file),
os.path.join(subset_output_dir, trop_file)
)
subset.subset(
file_to_subset=join(subset_output_dir, trop_file),
bbox=bbox,
output_file=join(subset_output_dir, output_file),
variables=variable
)

in_nc_dataset = nc.Dataset(join(trop_dir, trop_file))
out_nc_dataset = nc.Dataset(join(subset_output_dir, output_file))

assert in_nc_dataset.groups['PRODUCT'].variables['time_utc'][:].squeeze()[0] ==\
out_nc_dataset.groups['PRODUCT'].variables['time_utc'][:].squeeze()[0]

def test_bad_time_unit(subset_output_dir):
fill_val = -99999.0
