From 825c560657a570dc056a53d7dfdc2ffafdd789fe Mon Sep 17 00:00:00 2001 From: Montek Thind Date: Tue, 12 Nov 2024 09:05:36 -0800 Subject: [PATCH] Added netcdf (#273) * netcdf * fixed comments * one file * added unit tests * document * added example * edit readme * lowercase * added additional logic: * changing how output works, adding tests for output (#274) * added backticks * missed this comment --------- Co-authored-by: Kyle Shores --- README.md | 32 +++- src/acom_music_box/__init__.py | 1 + src/acom_music_box/data_output.py | 156 ++++++++++++++++++ src/acom_music_box/main.py | 17 +- src/acom_music_box/music_box.py | 26 +-- .../test_executable_data_output.py | 50 ++++++ tests/unit/test_data_output.py | 80 +++++++++ 7 files changed, 331 insertions(+), 31 deletions(-) create mode 100644 src/acom_music_box/data_output.py create mode 100644 tests/integration/test_executable_data_output.py create mode 100644 tests/unit/test_data_output.py diff --git a/README.md b/README.md index 4a9db369..4e1cf4a5 100644 --- a/README.md +++ b/README.md @@ -32,16 +32,40 @@ Run an example. 
Notice that the output, in csv format, is printed to the termina music_box -e Chapman ``` -You can also run your own configuration +Output can be saved to a csv file when no `--output-format` is passed ``` -music_box -c my_config.json +music_box -e Chapman -o output.csv ``` -Output can be saved to a file +Output can be saved to a csv file when `--output-format csv` is passed ``` -music_box -e Chapman -o output.csv +music_box --output-format csv -e Chapman -o output.csv +``` + +Output can be saved to a netcdf file when `--output-format netcdf` is passed + +``` +music_box --output-format netcdf -e Chapman -o output.nc +``` + +Output is saved to a timestamped csv file (`music_box_<YYYYMMDD_HHMMSS>.csv`) in the current directory when no output path is given but `--output-format` is csv + +``` +music_box --output-format csv -e Chapman +``` + +Output is saved to a timestamped netcdf file (`music_box_<YYYYMMDD_HHMMSS>.nc`) in the current directory when no output path is given but `--output-format` is netcdf + +``` +music_box --output-format netcdf -e Chapman +``` + +You can also run your own configuration + +``` +music_box -c my_config.json ``` And, if you have gnuplot installed, some basic plots can be made to show some resulting concentrations diff --git a/src/acom_music_box/__init__.py b/src/acom_music_box/__init__.py index 898a338d..25aba61e 100644 --- a/src/acom_music_box/__init__.py +++ b/src/acom_music_box/__init__.py @@ -13,3 +13,4 @@ from .evolving_conditions import EvolvingConditions from .music_box import MusicBox from .examples import Examples +from .data_output import DataOutput diff --git a/src/acom_music_box/data_output.py b/src/acom_music_box/data_output.py new file mode 100644 index 00000000..9bb61c5e --- /dev/null +++ b/src/acom_music_box/data_output.py @@ -0,0 +1,156 @@ +import os +import datetime +import logging + +logger = logging.getLogger(__name__) + +class DataOutput: + """ + A class to handle data output operations for a DataFrame, including converting to CSV + or NetCDF formats with appended units for columns. 
Designed for environmental data + with specific units and formats. + + This class manages file paths, unit mappings, and data output formats based on + the provided arguments, ensuring valid paths and creating necessary directories. + + Parameters + ---------- + df : pandas.DataFrame + The DataFrame containing the data to output. + args : argparse.Namespace + Arguments specifying output path, format, and additional options. + + Attributes + ---------- + df : pandas.DataFrame + The DataFrame to be output. + args : argparse.Namespace + Command-line arguments or configurations specifying output options. + unit_mapping : dict + A dictionary mapping specific columns to their respective units. + + Examples + -------- + >>> import pandas as pd + >>> from argparse import Namespace + >>> df = pd.DataFrame({ + ... 'ENV.temperature': [290, 295, 300], + ... 'ENV.pressure': [101325, 100000, 98500], + ... 'ENV.number_density_air': [102, 5096, 850960], + ... 'time': [0, 1, 2] + ... }) + >>> args = Namespace(output='output.nc', output_format='netcdf') + >>> data_output = DataOutput(df, args) + >>> data_output.output() + """ + + def __init__(self, df, args): + """ + Initialize the DataOutput class with a DataFrame and configuration arguments. + + Parameters + ---------- + df : pandas.DataFrame + The DataFrame containing the data to be output. + args : argparse.Namespace + Arguments specifying the output configuration, such as file path and format. + + Notes + ----- + The `args` argument should have the following attributes: + - output : str + The path to save the output file. + - output_format : str, optional + Format of the output file, either 'csv' or 'netcdf'. Defaults to 'csv'. 
+ """ + self.df = df + self.args = args + self.unit_mapping = { + 'ENV.temperature': 'K', + 'ENV.pressure': 'Pa', + 'ENV.number_density_air': 'kg -m3', + 'time': 's' + } + + def _get_default_filename(self): + """Generate a default filename based on the current datetime and output format.""" + now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + extension = 'csv' if self.args.output_format == 'csv' else 'nc' + return f"music_box_{now}.{extension}" + + def _ensure_output_path(self): + """Ensure the output path is valid and create directories if needed.""" + if not self.args.output: + self.args.output = self._get_default_filename() + + if os.path.isdir(self.args.output): + self.args.output = os.path.join( + self.args.output, self._get_default_filename()) + + dir_path = os.path.dirname(self.args.output) + if dir_path and not os.path.exists(dir_path): + os.makedirs(dir_path, exist_ok=True) + logger.info(f"Created directory: {dir_path}") + + def _append_units_to_columns(self): + """Append units to DataFrame column names based on unit mapping.""" + self.df.columns = [ + f"{col}.{self.unit_mapping[col]}" if col in self.unit_mapping else + f"{col}.mol m-3" if col.startswith('CONC.') else col + for col in self.df.columns + ] + + def _convert_to_netcdf(self): + """Convert DataFrame to xarray Dataset and save as NetCDF with attributes.""" + ds = self.df.set_index(['time']).to_xarray() + for var in ds.data_vars: + if var.startswith('CONC.'): + ds[var].attrs = {'units': 'mol m-3'} + + ds['ENV.temperature'].attrs = {'units': 'K'} + ds['ENV.pressure'].attrs = {'units': 'Pa'} + ds['ENV.number_density_air'].attrs = {'units': 'kg -m3'} + ds['time'].attrs = {'units': 's'} + + ds.to_netcdf(self.args.output) + + def _output_csv(self): + """Handles CSV output.""" + self._append_units_to_columns() + if self.args.output: + self._ensure_output_path() + self.df.to_csv(self.args.output, index=False) + logger.info(f"CSV output written to: {self.args.output}") + else: + 
print(self.df.to_csv(index=False)) + + def _output_netcdf(self): + """Handles NetCDF output.""" + if self.args.output: + self._ensure_output_path() + self._convert_to_netcdf() + logger.info(f"NetCDF output written to: {self.args.output}") + + def _output_terminal(self): + """Handles output to terminal.""" + self._append_units_to_columns() + print(self.df.to_csv(index=False)) + + def output(self): + """Main method to handle output based on the provided arguments.""" + # Default output paths based on format + if self.args.output is None: + self.args.output = self._get_default_filename() + + # Determine output type and call the respective method + if self.args.output_format is None or self.args.output_format == 'terminal': + self._output_terminal() + elif self.args.output_format is None or self.args.output_format == 'csv': + self._output_csv() + elif self.args.output_format == 'netcdf': + self._output_netcdf() + else: + error = f"Unsupported output format: {self.args.output_format}" + logger.error(error) + raise ValueError(error) + diff --git a/src/acom_music_box/main.py b/src/acom_music_box/main.py index 6108dafb..49936060 100644 --- a/src/acom_music_box/main.py +++ b/src/acom_music_box/main.py @@ -8,7 +8,7 @@ import tempfile import matplotlib.pyplot as plt import mplcursors -from acom_music_box import MusicBox, Examples, __version__ +from acom_music_box import MusicBox, Examples, __version__, DataOutput def format_examples_help(examples): @@ -36,6 +36,12 @@ def parse_arguments(): type=str, help='Path to save the output file, including the file name. If not provided, result will be printed to the console.' ) + parser.add_argument( + '--output-format', + choices=['csv', 'netcdf', 'terminal'], + default='terminal', + help="Specify output format: 'terminal' (default), 'csv', or 'netcdf'." 
+ ) parser.add_argument( '-v', '--verbose', action='count', @@ -154,7 +160,6 @@ def on_add(sel): plt.show() - def main(): start = datetime.datetime.now() @@ -176,6 +181,7 @@ def main(): musicBoxConfigFile = args.config musicBoxOutputPath = args.output + plot_species_list = args.plot.split(',') if args.plot else None if not musicBoxConfigFile: @@ -189,10 +195,11 @@ def main(): logger.debug(f"Configuration file = {musicBoxConfigFile}") myBox.loadJson(musicBoxConfigFile) - result = myBox.solve(musicBoxOutputPath) + result = myBox.solve(callback=None) - if musicBoxOutputPath is None: - print(result.to_csv(index=False)) + # Create an instance of DataOutput + dataOutput = DataOutput(result, args) + dataOutput.output() if plot_species_list: if args.plot_tool == 'gnuplot': diff --git a/src/acom_music_box/music_box.py b/src/acom_music_box/music_box.py index 8ce74186..784fabea 100644 --- a/src/acom_music_box/music_box.py +++ b/src/acom_music_box/music_box.py @@ -59,7 +59,7 @@ def add_evolving_condition(self, time_point, conditions): time=[time_point], conditions=[conditions]) self.evolvingConditions.append(evolving_condition) - def solve(self, output_path=None, callback=None): + def solve(self, callback=None): """ Solves the box model simulation and optionally writes the output to a file. @@ -68,8 +68,8 @@ def solve(self, output_path=None, callback=None): the specified file. Args: - output_path (str, optional): The path to the file where the output will be written. If None, no output file is created. Defaults to None. - callback (function, optional): A callback function that is called after each time step. Defaults to None. The callback will take the most recent results, the current time, conditions, and the total simulation time as arguments. + callback (function, optional): A callback function that is called after each time step. Defaults to None. + The callback will take the most recent results, the current time, conditions, and the total simulation time as arguments. 
Returns: list: A 2D list where each inner list represents the results of the simulation @@ -194,25 +194,7 @@ def solve(self, output_path=None, callback=None): # increments time curr_time += time_step pbar.update(time_step) - df = pd.DataFrame(output_array[1:], columns=output_array[0]) - # outputs to file if output is present - if output_path is not None: - - # Check if the output_path is a full path or just a file name - if os.path.dirname(output_path) == '': - # If output_path is just a filename, use the current directory - output_path = os.path.join(os.getcwd(), output_path) - elif not os.path.basename(output_path): - raise ValueError(f"Invalid output path: '{output_path}' does not contain a filename.") - - # Ensure the directory exists - dir_path = os.path.dirname(output_path) - if dir_path and not os.path.exists(dir_path): - os.makedirs(dir_path, exist_ok=True) - - df.to_csv(output_path, index=False) - - return df + return pd.DataFrame(output_array[1:], columns=output_array[0]) def loadJson(self, path_to_json): """ diff --git a/tests/integration/test_executable_data_output.py b/tests/integration/test_executable_data_output.py new file mode 100644 index 00000000..7a41779e --- /dev/null +++ b/tests/integration/test_executable_data_output.py @@ -0,0 +1,50 @@ +import subprocess +import os +import glob +import pytest +import tempfile + +@pytest.fixture +def temp_dir(): + with tempfile.TemporaryDirectory() as tmpdirname: + yield tmpdirname + +def test_print_results_to_terminal(temp_dir): + result = subprocess.run(['music_box', '-e', 'Analytical'], capture_output=True, text=True, cwd=temp_dir) + assert len(result.stdout) > 0 + +def test_create_netcdf_with_timestamp(temp_dir): + subprocess.run(['music_box', '-e', 'Analytical', '--output-format', 'netcdf'], cwd=temp_dir) + assert glob.glob(os.path.join(temp_dir, "music_box_*.nc")) + +def test_create_csv_with_timestamp(temp_dir): + subprocess.run(['music_box', '-e', 'Analytical', '--output-format', 'csv'], cwd=temp_dir) 
+ assert glob.glob(os.path.join(temp_dir, "music_box_*.csv")) + +def test_create_named_csv(temp_dir): + subprocess.run(['music_box', '-e', 'Analytical', '--output-format', 'csv', '-o', 'out.csv'], cwd=temp_dir) + assert os.path.exists(os.path.join(temp_dir, "out.csv")) + +def test_create_named_netcdf(temp_dir): + subprocess.run(['music_box', '-e', 'Analytical', '--output-format', 'netcdf', '-o', 'out.nc'], cwd=temp_dir) + assert os.path.exists(os.path.join(temp_dir, "out.nc")) + +def test_create_directory_and_named_netcdf(temp_dir): + os.makedirs(os.path.join(temp_dir, "results"), exist_ok=True) + subprocess.run(['music_box', '-e', 'Analytical', '--output-format', 'netcdf', '-o', 'results/out.nc'], cwd=temp_dir) + assert os.path.exists(os.path.join(temp_dir, "results/out.nc")) + +def test_create_directory_and_named_csv(temp_dir): + os.makedirs(os.path.join(temp_dir, "results"), exist_ok=True) + subprocess.run(['music_box', '-e', 'Analytical', '--output-format', 'csv', '-o', 'results/out.csv'], cwd=temp_dir) + assert os.path.exists(os.path.join(temp_dir, "results/out.csv")) + +def test_create_directory_and_timestamped_csv(temp_dir): + os.makedirs(os.path.join(temp_dir, "results"), exist_ok=True) + subprocess.run(['music_box', '-e', 'Analytical', '--output-format', 'csv', '-o', 'results/'], cwd=temp_dir) + assert glob.glob(os.path.join(temp_dir, "results/music_box_*.csv")) + +def test_create_directory_and_timestamped_netcdf(temp_dir): + os.makedirs(os.path.join(temp_dir, "results"), exist_ok=True) + subprocess.run(['music_box', '-e', 'Analytical', '--output-format', 'netcdf', '-o', 'results/'], cwd=temp_dir) + assert glob.glob(os.path.join(temp_dir, "results/music_box_*.nc")) diff --git a/tests/unit/test_data_output.py b/tests/unit/test_data_output.py new file mode 100644 index 00000000..7ee60d75 --- /dev/null +++ b/tests/unit/test_data_output.py @@ -0,0 +1,80 @@ +import unittest +import pandas as pd +import xarray as xr +import os +import tempfile +from argparse 
import Namespace +from acom_music_box import DataOutput + +class TestDataOutput(unittest.TestCase): + + def setUp(self): + # Set up a sample DataFrame and arguments for testing + self.df = pd.DataFrame({ + 'ENV.temperature': [290, 295, 300], + 'ENV.pressure': [101325, 100000, 98500], + 'ENV.number_density_air': [102, 5096, 850960], + 'time': [0, 1, 2] + }) + self.temp_dir = tempfile.TemporaryDirectory() + self.csv_path = os.path.join(self.temp_dir.name, 'output.csv') + self.netcdf_path = os.path.join(self.temp_dir.name, 'output.nc') + + def tearDown(self): + # Clean up temporary directory + self.temp_dir.cleanup() + + def test_ensure_output_path_creates_directories(self): + args = Namespace(output=self.csv_path) + data_output = DataOutput(self.df, args) + data_output._ensure_output_path() + self.assertTrue(os.path.exists(os.path.dirname(args.output))) + + def test_append_units_to_columns(self): + args = Namespace(output=None) + data_output = DataOutput(self.df, args) + data_output._append_units_to_columns() + expected_columns = ['ENV.temperature.K', 'ENV.pressure.Pa', 'ENV.number_density_air.kg -m3', 'time.s'] + self.assertEqual(list(data_output.df.columns), expected_columns) + + def test_convert_to_netcdf(self): + args = Namespace(output=self.netcdf_path) + data_output = DataOutput(self.df, args) + data_output._convert_to_netcdf() + self.assertTrue(os.path.exists(self.netcdf_path)) + + # Load the NetCDF file to check the attributes + ds = xr.open_dataset(self.netcdf_path) + self.assertEqual(ds['ENV.temperature'].attrs['units'], 'K') + self.assertEqual(ds['ENV.pressure'].attrs['units'], 'Pa') + self.assertEqual(ds['ENV.number_density_air'].attrs['units'], 'kg -m3') + self.assertEqual(ds['time'].attrs['units'], 's') + ds.close() + + def test_output_csv(self): + args = Namespace(output=self.csv_path, output_format='csv') + data_output = DataOutput(self.df, args) + data_output.output() + self.assertTrue(os.path.exists(self.csv_path)) + + # Check the contents of the 
CSV file + output_df = pd.read_csv(self.csv_path) + expected_columns = ['ENV.temperature.K', 'ENV.pressure.Pa', 'ENV.number_density_air.kg -m3', 'time.s'] + self.assertEqual(list(output_df.columns), expected_columns) + + def test_output_netcdf(self): + args = Namespace(output=self.netcdf_path, output_format='netcdf') + data_output = DataOutput(self.df, args) + data_output.output() + self.assertTrue(os.path.exists(self.netcdf_path)) + + # Check the contents of the NetCDF file + ds = xr.open_dataset(self.netcdf_path) + self.assertEqual(ds['ENV.temperature'].attrs['units'], 'K') + self.assertEqual(ds['ENV.pressure'].attrs['units'], 'Pa') + self.assertEqual(ds['ENV.number_density_air'].attrs['units'], 'kg -m3') + self.assertEqual(ds['time'].attrs['units'], 's') + ds.close() + +if __name__ == '__main__': + unittest.main()