From 86c0c1cf2f70b01afd245b47fa739f1c6fe0109c Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Fri, 1 Dec 2023 15:56:50 -0500 Subject: [PATCH 1/6] make attribute flattening work with leading slashes --- concatenator/attribute_handling.py | 30 +++++++-------- tests/test_group_handling.py | 60 +++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 29 deletions(-) diff --git a/concatenator/attribute_handling.py b/concatenator/attribute_handling.py index b99fcab..2f4ee7f 100644 --- a/concatenator/attribute_handling.py +++ b/concatenator/attribute_handling.py @@ -29,33 +29,32 @@ def regroup_coordinate_attribute(attribute_string: str) -> str: """ # Use the separator that's in the attribute string only if all separators in the string are the same. # Otherwise, we will use our own default separator. - whitespaces = re.findall(r'\s+', attribute_string) + whitespaces = re.findall(r"\s+", attribute_string) if len(set(whitespaces)) <= 1: new_sep = whitespaces[0] else: new_sep = COORD_DELIM return new_sep.join( - '/'.join(c.split(GROUP_DELIM))[1:] - for c - in attribute_string.split() # split on any whitespace + "/".join(c.split(GROUP_DELIM))[1:] + for c in attribute_string.split() # split on any whitespace ) -def flatten_coordinate_attribute_paths(dataset: netCDF4.Dataset, - var: netCDF4.Variable, - variable_name: str) -> None: +def flatten_coordinate_attribute_paths( + dataset: netCDF4.Dataset, var: netCDF4.Variable, variable_name: str +) -> None: """Flatten the paths of variables referenced in the coordinates attribute.""" - if 'coordinates' in var.ncattrs(): - coord_att = var.getncattr('coordinates') + if "coordinates" in var.ncattrs(): + coord_att = var.getncattr("coordinates") new_coord_att = _flatten_coordinate_attribute(coord_att) - dataset.variables[variable_name].setncattr('coordinates', new_coord_att) + dataset.variables[variable_name].setncattr("coordinates", new_coord_att) def _flatten_coordinate_attribute(attribute_string: str) -> str: - """Converts attributes that specify group membership via "/" to use new group delimiter, even for the root level. + """Converts attributes with "/" delimiters to use new group delimiter, even for the root level. Examples -------- @@ -73,15 +72,14 @@ def _flatten_coordinate_attribute(attribute_string: str) -> str: """ # Use the separator that's in the attribute string only if all separators in the string are the same. # Otherwise, we will use our own default separator. - whitespaces = re.findall(r'\s+', attribute_string) - if len(set(whitespaces)) <= 1: + whitespaces = re.findall(r"\s+", attribute_string) + if len(set(whitespaces)) == 1: new_sep = whitespaces[0] else: new_sep = COORD_DELIM # A new string is constructed. return new_sep.join( - f'{GROUP_DELIM}{c.replace("/", GROUP_DELIM)}' - for c - in attribute_string.split() # split on any whitespace + f"{GROUP_DELIM}{item}" if not item.startswith(GROUP_DELIM) else item + for item in attribute_string.replace("/", GROUP_DELIM).split() ) diff --git a/tests/test_group_handling.py b/tests/test_group_handling.py index 0515910..bd2c5c8 100644 --- a/tests/test_group_handling.py +++ b/tests/test_group_handling.py @@ -2,27 +2,61 @@ # pylint: disable=C0116, C0301 -from concatenator.attribute_handling import (_flatten_coordinate_attribute, - regroup_coordinate_attribute) +from concatenator.attribute_handling import ( + _flatten_coordinate_attribute, + regroup_coordinate_attribute, +) -def test_coordinate_attribute_flattening(): +def test_coordinate_attribute_flattening_with_no_leading_slash(): # Case with groups present and double spaces. - assert _flatten_coordinate_attribute( - "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude" - ) == '__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude' + assert ( + _flatten_coordinate_attribute( + "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude" + ) + == "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude" + ) # Case with NO groups present and single spaces. - assert _flatten_coordinate_attribute( - "time longitude latitude ozone_profile_pressure ozone_profile_altitude" - ) == "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude" + assert ( + _flatten_coordinate_attribute( + "time longitude latitude ozone_profile_pressure ozone_profile_altitude" + ) + == "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude" + ) + + +def test_coordinate_attribute_flattening_with_a_leading_slash(): + # Case with groups present and double spaces. + assert ( + _flatten_coordinate_attribute( + "/Time_and_Position/time /Time_and_Position/instrument_fov_latitude /Time_and_Position/instrument_fov_longitude" + ) + == "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude" + ) + + # Case with NO groups present and single spaces. + assert ( + _flatten_coordinate_attribute( + "/time /longitude /latitude /ozone_profile_pressure /ozone_profile_altitude" + ) + == "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude" + ) def test_coordinate_attribute_regrouping(): # Case with groups present and double spaces. - assert regroup_coordinate_attribute( - '__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude') == "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude" + assert ( + regroup_coordinate_attribute( + "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude" + ) + == "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude" + ) # Case with NO groups present and single spaces. - assert regroup_coordinate_attribute( - "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude") == "time longitude latitude ozone_profile_pressure ozone_profile_altitude" + assert ( + regroup_coordinate_attribute( + "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude" + ) + == "time longitude latitude ozone_profile_pressure ozone_profile_altitude" + ) From ae5bfbcc9afc1f826d4f007a365e7623922ff8ad Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Fri, 1 Dec 2023 17:03:07 -0500 Subject: [PATCH 2/6] extract function for the most basic flattening of a path string --- concatenator/attribute_handling.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/concatenator/attribute_handling.py b/concatenator/attribute_handling.py index 2f4ee7f..b98343d 100644 --- a/concatenator/attribute_handling.py +++ b/concatenator/attribute_handling.py @@ -79,7 +79,11 @@ def _flatten_coordinate_attribute(attribute_string: str) -> str: new_sep = COORD_DELIM # A new string is constructed. - return new_sep.join( - f"{GROUP_DELIM}{item}" if not item.startswith(GROUP_DELIM) else item - for item in attribute_string.replace("/", GROUP_DELIM).split() - ) + return new_sep.join(flatten_variable_path_str(item) for item in attribute_string.split()) + + +def flatten_variable_path_str(path_str: str) -> str: + """Converts a path with "/" delimiters to use new group delimiter, even for the root level.""" + new_path = path_str.replace("/", GROUP_DELIM) + + return f"{GROUP_DELIM}{new_path}" if not new_path.startswith(GROUP_DELIM) else new_path From ed35a7c6c00108808b33329dac9d259e59eb685e Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Fri, 1 Dec 2023 17:05:20 -0500 Subject: [PATCH 3/6] add option for list of variables with which to restrict the concat results --- concatenator/stitchee.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/concatenator/stitchee.py b/concatenator/stitchee.py index d1f2c8d..f8b4397 100644 --- a/concatenator/stitchee.py +++ b/concatenator/stitchee.py @@ -9,6 +9,7 @@ import xarray as xr from concatenator import GROUP_DELIM +from concatenator.attribute_handling import flatten_variable_path_str from concatenator.dimension_cleanup import remove_duplicate_dims from concatenator.file_ops import add_label_to_path from concatenator.group_handling import ( @@ -27,6 +28,7 @@ def stitchee( concat_method: str = "xarray-concat", concat_dim: str = "", concat_kwargs: dict | None = None, + variables_to_include: list[str] | None = None, logger: Logger = default_logger, ) -> str: """Concatenate netCDF data files along an existing dimension. @@ -35,8 +37,16 @@ def stitchee( ---------- files_to_concat : list[str] output_file : str - keep_tmp_files : bool + write_tmp_flat_concatenated : bool, optional + keep_tmp_files : bool, optional + concat_method : str, optional + Either 'xarray-concat' or 'xarray-combine' concat_dim : str, optional + concat_kwargs : dict, optional + Keyword arguments to pass through to the xarray concatenation method + variables_to_include : list[str], optional + Names of variables to include. All other variables are excluded from the result + logger : logging.Logger Returns @@ -59,6 +69,14 @@ def stitchee( "'concat_dim' was specified, but will not be used because xarray-combine method was selected." ) + # Convert variable names inputted to flattened versions + if variables_to_include is not None: + variables_to_include_flattened = [ + flatten_variable_path_str(v) for v in variables_to_include + ] + else: + variables_to_include_flattened = None + logger.info("Flattening all input files...") xrdataset_list = [] @@ -67,10 +85,21 @@ def stitchee( # The group structure is flattened. start_time = time.time() logger.info(" ..file %03d/%03d <%s>..", i + 1, num_input_files, filepath) - flat_dataset, coord_vars, _ = flatten_grouped_dataset( + flat_dataset, coord_vars, string_vars = flatten_grouped_dataset( nc.Dataset(filepath, "r"), filepath, ensure_all_dims_are_coords=True ) + if variables_to_include_flattened is not None: + variables_to_delete = [ + var_name + for var_name, _ in flat_dataset.variables.items() + if (var_name not in variables_to_include_flattened) + and (var_name not in coord_vars) + ] + + for var_name in variables_to_delete: + del flat_dataset.variables[var_name] + logger.info("Removing duplicate dimensions") flat_dataset = remove_duplicate_dims(flat_dataset) From 0f333a7ea36b167fabb4c2f516075e66d8a9a800 Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Fri, 1 Dec 2023 17:06:00 -0500 Subject: [PATCH 4/6] establish default xarray concatenation arguments for dictionary usage --- concatenator/stitchee.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/concatenator/stitchee.py b/concatenator/stitchee.py index f8b4397..a99aeb7 100644 --- a/concatenator/stitchee.py +++ b/concatenator/stitchee.py @@ -130,22 +130,24 @@ def stitchee( # coords='minimal', # compat='override') + # Establish default concatenation keyword arguments if not supplied as input. if concat_kwargs is None: concat_kwargs = {} + if "data_vars" not in concat_kwargs: + concat_kwargs["data_vars"] = "minimal" + if "coords" not in concat_kwargs: + concat_kwargs["coords"] = "minimal" + # Perform concatenation operation. if concat_method == "xarray-concat": combined_ds = xr.concat( xrdataset_list, dim=GROUP_DELIM + concat_dim, - data_vars="minimal", - coords="minimal", **concat_kwargs, ) elif concat_method == "xarray-combine": combined_ds = xr.combine_by_coords( xrdataset_list, - data_vars="minimal", - coords="minimal", **concat_kwargs, ) else: From 10c024dc499cb40b075fc00568210ecb87febbd8 Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Fri, 1 Dec 2023 17:09:09 -0500 Subject: [PATCH 5/6] perform verification for test function with dimension OR variable --- tests/test_concat.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/test_concat.py b/tests/test_concat.py index 2ec0f82..b0f8e2a 100644 --- a/tests/test_concat.py +++ b/tests/test_concat.py @@ -12,6 +12,7 @@ import pytest from concatenator import concat_with_nco +from concatenator.attribute_handling import flatten_variable_path_str from concatenator.stitchee import stitchee @@ -61,14 +62,24 @@ def run_verification_with_stitchee( concat_kwargs=concat_kwargs, ) - merged_dataset = nc.Dataset(output_path) - # Verify that the length of the record dimension in the concatenated file equals # the sum of the lengths across the input files length_sum = 0 for file in input_files: - length_sum += len(nc.Dataset(file).variables[record_dim_name]) - assert length_sum == len(merged_dataset.variables[record_dim_name]) + with nc.Dataset(file) as ds: + length_sum += ds.dimensions[flatten_variable_path_str(record_dim_name)].size + + with nc.Dataset(output_path) as merged_dataset: + if record_dim_name in merged_dataset.variables: + # Primary dimension is a root level variable + assert length_sum == len(merged_dataset.variables[record_dim_name]) + elif record_dim_name in merged_dataset.dimensions: + # Primary dimension is a root level dimension, but not a variable + assert length_sum == merged_dataset.dimensions[record_dim_name].size + else: + raise AttributeError( + "Unexpected condition, where primary record dimension is not at the root level." + ) def run_verification_with_nco(self, data_dir, output_name, record_dim_name="mirror_step"): output_path = str(self.__output_path.joinpath(output_name)) From 1780d9239e55ed228b7ca0fb2e73ad5b59d93051 Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Fri, 1 Dec 2023 17:09:51 -0500 Subject: [PATCH 6/6] add variables_to_include argument to test function --- tests/test_concat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_concat.py b/tests/test_concat.py index b0f8e2a..71bbb48 100644 --- a/tests/test_concat.py +++ b/tests/test_concat.py @@ -38,13 +38,14 @@ def run_verification_with_stitchee( concat_method: str = "xarray-concat", record_dim_name: str = "mirror_step", concat_kwargs: dict | None = None, + variables_to_include: list[str] | None = None, ): output_path = str(self.__output_path.joinpath(output_name)) # type: ignore data_path = self.__test_data_path.joinpath(data_dir) # type: ignore input_files = [] for filepath in data_path.iterdir(): - if Path(filepath).suffix.lower() in (".nc", ".h5", ".hdf"): + if Path(filepath).suffix.lower() in (".nc", ".nc4", ".h5", ".hdf"): copied_input_new_path = self.__output_path / Path(filepath).name # type: ignore shutil.copyfile(filepath, copied_input_new_path) input_files.append(str(copied_input_new_path)) @@ -60,6 +61,7 @@ def run_verification_with_stitchee( concat_method=concat_method, concat_dim=record_dim_name, concat_kwargs=concat_kwargs, + variables_to_include=variables_to_include, ) # Verify that the length of the record dimension in the concatenated file equals