Feature/issue-205 (#215)
* allow vlen variables to be subsetted for omipixcor

* add dimension name changing for OMIPIXCOR collections

* Lint code

* add blank line for linting

* add tests for OMI pixcor

* add omi pix cor test data file

* update changelog

* Check if time variables are being added a second time

* Add comment for compute time vars

* Linted code

* remove print statements and add extra space

* remove print statements and add extra space

---------

Co-authored-by: nlensse1 <[email protected]>
nlenssen2013 and nlensse1 authored Nov 29, 2023
1 parent 8575323 commit b558678
Showing 6 changed files with 97 additions and 30 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 - [issue/209](https://github.com/podaac/l2ss-py/issues/209): Update code so dims are created if they don't already exist.
 - Update the way we modify shapefiles from 180 to 360 lon lats.
+- [issue/205](https://github.com/podaac/l2ss-py/issues/205): Add handling for groups that have the same dimension names but different values; xarray's rename_dims is used to give the conflicting dimensions distinct names.
+- [issue/220](https://github.com/podaac/l2ss-py/issues/220): Check that time variables being found haven't already been found. Add time_vars as an extra argument to compute_time_variable_name.
 ### Security
 
 ## [2.6.0]
35 changes: 35 additions & 0 deletions podaac/subsetter/dimension_cleanup.py
@@ -107,3 +107,38 @@ def sync_dims_inplace(original_dataset: xr.Dataset, new_dataset: xr.Dataset) ->
         for new_dim in new_variable_dims:
             if new_dim not in original_variable_dims:
                 new_dataset[variable_name] = new_dataset[variable_name].isel({new_dim: 0})
+
+
+def recreate_pixcore_dimensions(datasets: list):
+    """
+    If dimensions have different sizes after subsetting,
+    they must be given different names.
+    """
+    dim_dict = {}
+    count = 0
+    for dataset in datasets:
+        dim_list_shape = list(dataset.dims.values())
+        current_dims = list(dataset.dims.keys())
+        rename_list = []
+        for current_dim, dim_value in zip(current_dims, dim_list_shape):
+            if current_dim not in dim_dict:
+                dim_dict[current_dim] = dim_value
+            else:
+                # this dim name was seen before; check for a conflicting size
+                if dim_dict[current_dim] != dim_value:
+                    # create a new name for the dim
+                    new_dim = current_dim + '_' + str(count)
+                    dim_tup = (current_dim, new_dim)
+                    # add the (old name, new name) tuple to the list
+                    rename_list.append(dim_tup)
+
+        if len(rename_list) > 0:
+            # xarray rename_dims takes a dict of old names (keys) to new names (values)
+            rename_dict = dict(rename_list)
+            datasets[count] = dataset.rename_dims(rename_dict)
+
+        count += 1
+
+    return datasets
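
For reference, a minimal sketch of what the new helper does, using two hypothetical single-variable datasets whose shared dimension name ends up with different sizes after subsetting (the variable and dimension names here are illustrative only, not taken from the collection):

import numpy as np
import xarray as xr

from podaac.subsetter import dimension_cleanup as dc

# Two groups subset to different extents: same dim name, different sizes.
ds_a = xr.Dataset({'var_a': (('phony_dim_0',), np.zeros(10))})
ds_b = xr.Dataset({'var_b': (('phony_dim_0',), np.zeros(7))})

cleaned = dc.recreate_pixcore_dimensions([ds_a, ds_b])
print(cleaned[0].dims)  # phony_dim_0 keeps its name, size 10
print(cleaned[1].dims)  # conflicting dim renamed to phony_dim_0_1, size 7

Because dim_dict records the first size seen for each name, only later datasets are renamed, and the suffix is the dataset's position in the list.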
6 changes: 3 additions & 3 deletions podaac/subsetter/group_handling.py
@@ -110,7 +110,6 @@ def recombine_grouped_datasets(datasets: List[xr.Dataset], output_file: str, sta
     """
 
     base_dataset = nc.Dataset(output_file, mode='w')
-
     for dataset in datasets:
         group_lst = []
         for var_name in dataset.variables.keys():  # need logic if there is data in the top level not in a group
@@ -181,7 +180,8 @@ def _rename_variables(dataset: xr.Dataset, base_dataset: nc.Dataset, start_date,
         comp_args = {"zlib": True, "complevel": 1}
 
         var_data = variable.data
-        if variable.dtype == object:
+
+        if variable.dtype in [object, '|S27']:
             comp_args = {"zlib": False, "complevel": 1}
             var_group.createVariable(new_var_name, 'S4', var_dims, fill_value=fill_value, **comp_args)
             var_data = np.array(variable.data)
@@ -197,7 +197,7 @@ def _rename_variables(dataset: xr.Dataset, base_dataset: nc.Dataset, start_date,
 
         # Copy data
         var_group.variables[new_var_name].set_auto_maskandscale(False)
-        if variable.dtype in ['|S1', '|S2']:
+        if variable.dtype in ['|S1', '|S2', '|S27']:
             var_group.variables[new_var_name][:] = variable.values
         else:
             var_group.variables[new_var_name][:] = var_data
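
The '|S27' additions above route fixed-width byte-string variables through the same branch as object-dtype (vlen) variables: compression is switched off and the values are copied as character arrays. A minimal standalone sketch of that pattern with the netCDF4 library, under the assumption of hypothetical file, dimension, and variable names:

import netCDF4 as nc
import numpy as np

ds = nc.Dataset('strings_demo.nc', mode='w')
ds.createDimension('obs', 3)
ds.createDimension('strlen', 4)

# Fixed-width numeric data compresses fine with zlib.
flt = ds.createVariable('floats', 'f4', ('obs',), zlib=True, complevel=1)
flt[:] = np.arange(3, dtype='f4')

# Fixed-width byte strings are stored as NC_CHAR arrays; zlib is disabled
# for them, mirroring the comp_args switch in _rename_variables above.
chars = ds.createVariable('labels', 'S1', ('obs', 'strlen'), zlib=False)
chars.set_auto_maskandscale(False)
chars[:] = nc.stringtochar(np.array(['abc', 'defg', 'hi'], dtype='S4'))

ds.close()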
27 changes: 14 additions & 13 deletions podaac/subsetter/subset.py
@@ -487,13 +487,14 @@ def get_spatial_bounds(dataset: xr.Dataset, lat_var_names: str, lon_var_names: s
     return np.array([[min_lon, max_lon], [min_lat, max_lat]])
 
 
-def compute_time_variable_name(dataset: xr.Dataset, lat_var: xr.Variable) -> str:
+def compute_time_variable_name(dataset: xr.Dataset, lat_var: xr.Variable, total_time_vars: list) -> str:
     """
     Try to determine the name of the 'time' variable. This is done as
     follows:
     - The variable name contains 'time'
     - The variable dimensions match the dimensions of the given lat var
+    - The variable has not already been found
 
     Parameters
     ----------
@@ -512,7 +513,6 @@ def compute_time_variable_name(dataset: xr.Dataset, lat_var: xr.Variable) -> str
     ValueError
         If the time variable could not be determined
     """
-
     time_vars = find_matching_coords(dataset, ['time'])
     if time_vars:
         # There should only be one time var match (this is called once
@@ -523,26 +523,26 @@ def compute_time_variable_name(dataset: xr.Dataset, lat_var: xr.Variable) -> str
         time_vars = list(filter(lambda var_name: 'time' in var_name, dataset.dims.keys()))
 
     for var_name in time_vars:
-        if "time" in var_name and dataset[var_name].squeeze().dims == lat_var.squeeze().dims:
+        if var_name not in total_time_vars and "time" in var_name and dataset[var_name].squeeze().dims == lat_var.squeeze().dims:
             return var_name
     for var_name in list(dataset.data_vars.keys()):
-        if "time" in var_name and dataset[var_name].squeeze().dims == lat_var.squeeze().dims:
+        if var_name not in total_time_vars and "time" in var_name and dataset[var_name].squeeze().dims == lat_var.squeeze().dims:
             return var_name
 
     # first check if any variables are named 'time'
     for var_name in list(dataset.data_vars.keys()):
         var_name_time = var_name.strip(GROUP_DELIM).split(GROUP_DELIM)[-1]
         if len(dataset[var_name].squeeze().dims) == 0:
             continue
-        if ('time' == var_name_time.lower() or 'timeMidScan' == var_name_time) and dataset[var_name].squeeze().dims[0] in lat_var.squeeze().dims:
+        if var_name not in total_time_vars and ('time' == var_name_time.lower() or 'timeMidScan' == var_name_time) and dataset[var_name].squeeze().dims[0] in lat_var.squeeze().dims:
             return var_name
 
     # then check if any variables have 'time' in the string if the above loop doesn't return anything
     for var_name in list(dataset.data_vars.keys()):
         var_name_time = var_name.strip(GROUP_DELIM).split(GROUP_DELIM)[-1]
         if len(dataset[var_name].squeeze().dims) == 0:
             continue
-        if 'time' in var_name_time.lower() and dataset[var_name].squeeze().dims[0] in lat_var.squeeze().dims:
+        if var_name not in total_time_vars and 'time' in var_name_time.lower() and dataset[var_name].squeeze().dims[0] in lat_var.squeeze().dims:
             return var_name
 
     raise ValueError('Unable to determine time variable')
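
The repeated "var_name not in total_time_vars" guard is the core of the issue/220 fix: each call excludes the names returned by earlier calls, so a file with several latitude variables is matched to distinct time variables instead of returning the first match twice. A stripped-down sketch of the pattern (function and variable names are hypothetical):

def pick_time_var(candidates, total_time_vars):
    # return the first 'time'-like name that has not already been claimed
    for name in candidates:
        if name not in total_time_vars and 'time' in name.lower():
            return name
    raise ValueError('Unable to determine time variable')

found = []
for _ in ['lat_a', 'lat_b']:  # one lookup per latitude variable
    found.append(pick_time_var(['scan_time', 'time_utc'], found))

print(found)  # ['scan_time', 'time_utc'], no duplicate picks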
@@ -962,8 +962,8 @@ def subset_with_bbox(dataset: xr.Dataset,  # pylint: disable=too-many-branches
             total_list.extend(group_vars)
     if diffs == -1:
         return datasets
-
-    return datasets
+    dim_cleaned_datasets = dc.recreate_pixcore_dimensions(datasets)
+    return dim_cleaned_datasets
 
 
 def subset_with_shapefile(dataset: xr.Dataset,
@@ -1060,11 +1060,12 @@ def get_coordinate_variable_names(dataset: xr.Dataset,
     if not lat_var_names or not lon_var_names:
         lat_var_names, lon_var_names = compute_coordinate_variable_names(dataset)
     if not time_var_names:
-        time_var_names = [
-            compute_time_variable_name(
-                dataset, dataset[lat_var_name]
-            ) for lat_var_name in lat_var_names
-        ]
+        time_var_names = []
+        for lat_var_name in lat_var_names:
+            time_var_names.append(compute_time_variable_name(dataset,
+                                                             dataset[lat_var_name],
+                                                             time_var_names))
+
         time_var_names.append(compute_utc_name(dataset))
     time_var_names = [x for x in time_var_names if x is not None]  # remove Nones and any duplicates
Binary file not shown.
57 changes: 43 additions & 14 deletions tests/test_subset.py
@@ -165,7 +165,7 @@ def test_subset_variables(test_file, data_dir, subset_output_dir, request):
     time_var_name = None
     try:
         lat_var_name = subset.compute_coordinate_variable_names(in_ds)[0][0]
-        time_var_name = subset.compute_time_variable_name(in_ds, in_ds[lat_var_name])
+        time_var_name = subset.compute_time_variable_name(in_ds, in_ds[lat_var_name], [])
     except ValueError:
         # unable to determine lon lat vars
         pass
@@ -1263,7 +1263,7 @@ def test_get_time_variable_name(test_file, data_dir):
     ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), **args)
 
     lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0]
-    time_var_name = subset.compute_time_variable_name(ds, ds[lat_var_name])
+    time_var_name = subset.compute_time_variable_name(ds, ds[lat_var_name], [])
 
     assert time_var_name is not None
     assert 'time' in time_var_name
@@ -1506,7 +1506,7 @@ def test_get_time_squeeze(data_dir, subset_output_dir):
             **args
     ) as dataset:
         lat_var_name = subset.compute_coordinate_variable_names(dataset)[0][0]
-        time_var_name = subset.compute_time_variable_name(dataset, dataset[lat_var_name])
+        time_var_name = subset.compute_time_variable_name(dataset, dataset[lat_var_name], [])
         lat_dims = dataset[lat_var_name].squeeze().dims
         time_dims = dataset[time_var_name].squeeze().dims
         assert lat_dims == time_dims
@@ -1534,7 +1534,7 @@ def test_get_indexers_nd(data_dir, subset_output_dir):
     ) as dataset:
         lat_var_name = subset.compute_coordinate_variable_names(dataset)[0][0]
         lon_var_name = subset.compute_coordinate_variable_names(dataset)[1][0]
-        time_var_name = subset.compute_time_variable_name(dataset, dataset[lat_var_name])
+        time_var_name = subset.compute_time_variable_name(dataset, dataset[lat_var_name], [])
         oper = operator.and_
 
         cond = oper(
@@ -1725,11 +1725,11 @@ def test_get_time_epoch_var(data_dir, subset_output_dir):
             **args
     ) as dataset:
         lat_var_names, _ = subset.compute_coordinate_variable_names(dataset)
-        time_var_names = [
-            subset.compute_time_variable_name(
-                dataset, dataset[lat_var_name]
-            ) for lat_var_name in lat_var_names
-        ]
+        time_var_names = []
+        for lat_var_name in lat_var_names:
+            time_var_names.append(subset.compute_time_variable_name(
+                dataset, dataset[lat_var_name], time_var_names
+            ))
         epoch_time_var = subset.get_time_epoch_var(dataset, time_var_names[0])
 
         assert epoch_time_var.split('__')[-1] == 'time'
@@ -1835,6 +1835,35 @@ def test_temporal_he5file_subset(data_dir, subset_output_dir):
     dataset, _ = tc.convert_to_datetime(dataset, time_var_names, hdf_type)
     assert dataset[time_var_names[0]].dtype == 'datetime64[ns]'
 
+
+def test_omi_pixcor(data_dir, subset_output_dir, request):
+    """
+    The OMI PIXCOR collection has the same shape across groups but covers a
+    different domain group to group. Dimension names had to be changed in
+    order to copy the data back into netCDF files. l2ss-py developers note
+    that this collection was particularly tricky.
+    """
+    omi_dir = join(data_dir, 'OMI')
+    omi_file = 'OMI-Aura_L2-OMPIXCOR_2020m0116t1207-o82471_v003-2020m0116t174929.he5'
+    omi_file_input = 'input' + omi_file
+    bbox = np.array(((-180, 180), (-30, 30)))
+    output_file = "{}_{}".format(request.node.name, omi_file)
+
+    shutil.copyfile(
+        os.path.join(omi_dir, omi_file),
+        os.path.join(subset_output_dir, omi_file)
+    )
+
+    _ = subset.subset(
+        file_to_subset=os.path.join(subset_output_dir, omi_file),
+        bbox=bbox,
+        output_file=os.path.join(subset_output_dir, output_file)
+    )
+
+    out_nc = nc.Dataset(join(subset_output_dir, output_file))
+
+    assert out_nc
+
+
 def test_MLS_levels(data_dir, subset_output_dir, request):
     """
     Test that the unique groups are determined before bounding box
@@ -1990,11 +2019,11 @@ def test_get_time_OMI(data_dir, subset_output_dir):
             **args
     ) as dataset:
         lat_var_names, _ = subset.compute_coordinate_variable_names(dataset)
-        time_var_names = [
-            subset.compute_time_variable_name(
-                dataset, dataset[lat_var_name]
-            ) for lat_var_name in lat_var_names
-        ]
+        time_var_names = []
+        for lat_var_name in lat_var_names:
+            time_var_names.append(subset.compute_time_variable_name(
+                dataset, dataset[lat_var_name], time_var_names
+            ))
         assert "Time" in time_var_names[0]
         assert "Latitude" in lat_var_names[0]
