diff --git a/door/data_sources/__init__.py b/door/data_sources/__init__.py
index 03068e4..840e02b 100644
--- a/door/data_sources/__init__.py
+++ b/door/data_sources/__init__.py
@@ -5,4 +5,5 @@
 from .cds import *
 from .hsaf import *
 from .drops2 import *
-from .clms import *
\ No newline at end of file
+from .clms import *
+from .persiann import *
\ No newline at end of file
diff --git a/door/data_sources/chirps/chirps_downloader.py b/door/data_sources/chirps/chirps_downloader.py
index 8ab4dfb..21d132b 100644
--- a/door/data_sources/chirps/chirps_downloader.py
+++ b/door/data_sources/chirps/chirps_downloader.py
@@ -95,9 +95,9 @@ def get_last_published_ts(self, prelim = None, product = None, **kwargs) -> ts.TimeRange:
             response = requests.head(current_url)
 
             # if the request is successful, the last published timestep is the current timestep
-            if response.status_code == 200:
+            if response.status_code == requests.codes.ok:
                 return current_timestep
-
+
             # if the request is not successful, move to the previous timestep
             current_timestep -= 1
diff --git a/door/data_sources/clms/clms_downloader.py b/door/data_sources/clms/clms_downloader.py
index cb88697..b1fb6e3 100644
--- a/door/data_sources/clms/clms_downloader.py
+++ b/door/data_sources/clms/clms_downloader.py
@@ -4,11 +4,13 @@
 import xarray as xr
 import tempfile
 from typing import Optional, Iterable
+import datetime as dt
 
 from ...utils.space import BoundingBox, crop_to_bb
 from ...utils.io import download_http, handle_missing
 from ...tools import timestepping as ts
 from ...tools.timestepping.timestep import TimeStep
+from ...tools.timestepping.fixed_num_timestep import FixedNTimeStep
 from ...tools.data import Dataset
 from ...base_downloaders import URLDownloader
 
@@ -19,7 +21,7 @@ class CLMSDownloader(URLDownloader):
     clms_url = "https://globalland.vito.be/download/"
 
     default_options = {
-        'layers': None,
+        'variables': ['020'],
         'crop_to_bounds': True,
         'ts_per_year': 36
    }
@@ -27,13 +29,14 @@ class CLMSDownloader(URLDownloader):
     available_products = {
         'swi': {
             'versions': ["3.1.1", "3.2.1"],
-            'url': clms_url + 'geotiff/soil_water_index/swi_12.5km_v3_{ts_str}daily/{timestep.start:%Y}/{timestep.start:%Y%m%d}/c_gls_SWI{ts_str}-SWI{layer}_{timestep.start:%Y%m%d}1200_GLOBE_ASCAT_V{version}.tiff',
+            'url': clms_url + 'geotiff/soil_water_index/swi_12.5km_v3_{ts_str}daily/{timestep.start:%Y}/{timestep.start:%Y%m%d}/c_gls_SWI{ts_str}-SWI{var}_{timestep.start:%Y%m%d}1200_GLOBE_ASCAT_V{version}.tiff',
             'nodata': 255,
-            'scale_factor': 0.5,
-            'available_layers': ["001", "005", "010", "020", "040", "060", "100"],
+            'scale_factor': 0.5
         }
     }
 
+    available_variables = ["001", "005", "010", "020", "040", "060", "100"]
+
     def __init__(self, product: str) -> None:
         self.set_product(product)
 
@@ -44,14 +47,67 @@ def set_product(self, product: str) -> None:
         self.nodata = self.available_products[self.product]["nodata"]
         self.scale_factor = self.available_products[self.product]["scale_factor"]
         self.versions = self.available_products[self.product]["versions"]
-        self.available_layers = self.available_products[self.product]["available_layers"]
         self.url_blank = self.available_products[self.product]["url"]
 
+    def set_variables(self, variables: list) -> None:
+        self.variables = []
+        for var in variables:
+            this_var = var.lower()
+            if this_var not in self.available_variables:
+                raise ValueError(f'Variable {var} not available. Choose one of {self.available_variables}')
+            else:
+                self.variables.append(this_var)
+        if len(self.variables) == 0:
+            raise ValueError('No valid variables selected')
+
+    def get_last_published_ts(self, **kwargs) -> ts.TimeRange:
+        """
+        Get the last published timestep for the dataset.
+        """
+        ts_per_year = self.ts_per_year
+
+        # Set ts_str based on the ts_per_year
+        if ts_per_year == 36:
+            ts_str = '10'
+            TimeStep = FixedNTimeStep.get_subclass(ts_per_year)
+        elif ts_per_year == 365:
+            ts_str = ''
+            TimeStep = ts.Day
+        else:
+            raise ValueError(f"ts_per_year {self.ts_per_year} not supported")
+
+        current_timestep = TimeStep.from_date(dt.datetime.now())
+
+        # move backwards from the current timestep until a published file is found
+        while True:
+            for v in self.versions:
+                current_url_v = self.url_blank.format(
+                    ts_str=ts_str,
+                    timestep=current_timestep,
+                    var=self.variables[0],
+                    version=v
+                )
+
+                # send a request to the url
+                response = requests.head(current_url_v)
+
+                # if the request is successful, the last published timestep is the current timestep
+                if response.status_code == requests.codes.ok:
+                    return current_timestep
+
+            # if no version is available, move to the previous timestep
+            current_timestep -= 1
+
+    def get_last_published_date(self, **kwargs) -> dt.datetime:
+        return self.get_last_published_ts(**kwargs).end
+
     def _get_data_ts(self,
                      time_range: TimeStep,
                      space_bounds: BoundingBox,
                      tmp_path: str,
-                     layer: str,
+                     variable: str,
                      missing_action: str = 'warning',
                      **kwargs) -> Iterable[tuple[xr.DataArray, dict]]:
         '''
         Get the data for a specific timestep.
         '''
@@ -60,39 +116,45 @@
         url_blank = self.url_blank.format(
             ts_str=self.ts_str,
             timestep=time_range,
-            layer=layer,
+            var=variable,
             version="{version}"
         )
 
         # Download the file
         ts_end = time_range.end
-        tmp_filename_raw = f'temp_{self.product}{layer}_{ts_end:%Y%m%d}.tif'
+        tmp_filename_raw = f'temp_{self.product}{variable}_{ts_end:%Y%m%d}.tif'
         tmp_destination = os.path.join(tmp_path, tmp_filename_raw)
 
         # try to download the file in both versions
+        success = False
         for version in self.versions:
-            url = url_blank.format(version=version)
-            try:
-                download_http(url, tmp_destination)
-
-                # Crop the data
-                cropped = crop_to_bb(tmp_destination, space_bounds)
-
-                # Change the nodata value to np.nan and return the data
-                cropped = cropped.where(~np.isclose(cropped, self.nodata, equal_nan=True), np.nan)
-                cropped.rio.no_data = np.nan
-
-                # Apply the scale factor
-                cropped *= self.scale_factor
-
-                yield cropped, {'layer': layer}
-                break
-
-            except Exception as e:
-                continue
-
-        # If the loop ends without breaking, the data is missing
-        handle_missing(missing_action, {'timestep': time_range, 'layer': layer})
+            url_v = url_blank.format(version=version)
+
+            # check that the file exists before downloading it
+            response = requests.head(url_v)
+
+            if response.status_code == requests.codes.ok:
+                url = url_v
+                success = True
+                break
+
+        if success:
+            download_http(url, tmp_destination)
+
+            # Crop the data
+            cropped = crop_to_bb(tmp_destination, space_bounds)
+
+            # Change the nodata value to np.nan and return the data
+            cropped = cropped.where(~np.isclose(cropped, self.nodata, equal_nan=True), np.nan)
+            cropped.rio.no_data = np.nan
+
+            # Apply the scale factor
+            cropped *= self.scale_factor
+
+            yield cropped, {'variable': variable}
+
+        else:
+            # no version of the file is available: the data is missing
+            handle_missing(missing_action, {'timestep': time_range, 'variable': variable})
 
     def get_data(self,
                  time_range: ts.TimeRange,
@@ -106,14 +168,6 @@ def get_data(self,
 
         if options is not None:
             self.set_options(options)
 
-        # Check if the layers are available
-        if self.layers is None:
-            self.layers = self.available_layers
-        else:
-            for layer in self.layers:
-                if layer not in self.available_layers:
-                    raise ValueError(f'Layer {layer} not available. Choose one of {self.available_layers}')
-
         # Set ts_str based on the ts_per_year
         if self.ts_per_year == 36:
             self.ts_str = '10'
@@ -142,8 +196,8 @@ def get_data(self,
         for timestep in timesteps:
             with tempfile.TemporaryDirectory() as tmp_path:
-                for layer in self.layers:
-                    data_struct = self._get_data_ts(timestep, space_bounds, tmp_path, layer)
+                for variable in self.variables:
+                    data_struct = self._get_data_ts(timestep, space_bounds, tmp_path, variable)
                     if not data_struct:
                         self.log.warning(f'No data found for timestep {timestep}')
                         continue
diff --git a/door/data_sources/hsaf/hsaf_downloader.py b/door/data_sources/hsaf/hsaf_downloader.py
index e2780f8..2f58690 100644
--- a/door/data_sources/hsaf/hsaf_downloader.py
+++ b/door/data_sources/hsaf/hsaf_downloader.py
@@ -156,7 +156,7 @@ def get_last_published_ts(self, product = None, **kwargs) -> ts.TimeRange:
         response = requests.head(current_url)
 
         # if the request is successful, the last published timestep is the current timestep
-        if response.status_code == 200:
+        if response.status_code == requests.codes.ok:
             return current_timestep
 
         # if the request is not successful, move to the previous timestep
diff --git a/door/data_sources/jaxa/__init__.py b/door/data_sources/jaxa/__init__.py
new file mode 100644
index 0000000..ace1f4e
--- /dev/null
+++ b/door/data_sources/jaxa/__init__.py
@@ -0,0 +1 @@
+from .gsmap_downloader import GSMAPDownloader
\ No newline at end of file
diff --git a/door/data_sources/jaxa/gsmap_downloader.py b/door/data_sources/jaxa/gsmap_downloader.py
new file mode 100644
index 0000000..5d77774
--- /dev/null
+++ b/door/data_sources/jaxa/gsmap_downloader.py
@@ -0,0 +1,111 @@
+import os
+from typing import Generator
+import numpy as np
+import xarray as xr
+import datetime as dt
+import requests
+import rioxarray
+
+from ...tools import timestepping as ts
+from ...tools.timestepping.timestep import TimeStep
+from ...tools.timestepping.fixed_num_timestep import FixedNTimeStep
+from ...base_downloaders import URLDownloader
+
+from ...utils.space import BoundingBox, crop_to_bb
+from ...utils.io import decompress_gz
+
+class GSMAPDownloader(URLDownloader):
+    source = "JAXA"
+    name = "GSMAPDownloader"
+
+    default_options = {
+        'get_prelim': True, # if True, will also download preliminary data if available
+    }
+
+    home = "sftp://rainmap@hokusai.eorc.jaxa.jp/realtime/hourly_G/"
+    prelim_home = home + "prelim/"
+    available_products: dict = {
+        "PDIRNow-1hourly": {
+            "ts_per_year": 8760,
+            "url": home + 'PDIRNow/PDIRNow1hourly/{timestep.start:%Y}/pdirnow1h{timestep.start:%y%m%d%H}.bin.gz',
+            "nodata": -9999,
+            "rows_cols": (3000, 9000),
+            "lat_lon_box": [59.98, -59.98, .02, 359.98],
+            "lat_lon_steps": [-0.04, 0.04],
+            "scale_factor": 100,
+            "dtype": '<i2',  # little-endian 16-bit integers
+        },
+    }
+
+    def __init__(self, product: str) -> None:
+        self.set_product(product)
+        super().__init__(self.url_blank, protocol='http')
+
+    def set_product(self, product: str) -> None:
+        self.product = product
+        if product not in self.available_products:
+            raise ValueError(f'Product {product} not available. Choose one of {self.available_products.keys()}')
+        self.ts_per_year = self.available_products[product]["ts_per_year"]
+        self.url_blank = self.available_products[product]["url"]
+        self.nodata = self.available_products[product]["nodata"]
+        self.rows_cols = self.available_products[product]["rows_cols"]
+        self.lat_lon_box = self.available_products[product]["lat_lon_box"]
+        self.lat_lon_steps = self.available_products[product]["lat_lon_steps"]
+        self.dtype = self.available_products[product]["dtype"]
+        self.scale_factor = self.available_products[product]["scale_factor"]
+
+    def _get_data_ts(self,
+                     timestep: TimeStep,
+                     space_bounds: BoundingBox,
+                     tmp_path: str) -> Generator[tuple[xr.DataArray, dict], None, None]:
+
+        ts_end = timestep.end
+        tmp_filename_raw = f'temp_{self.product}{ts_end:%Y%m%d}'
+        tmp_filename = f'{tmp_filename_raw}.bin.gz'
+        tmp_destination = os.path.join(tmp_path, tmp_filename)
+        self.log.info(' --> Download ' + str(timestep))
+        success = self.download(tmp_destination, min_size=200, missing_action='ignore', timestep=timestep)
+        if success:
+            # Unzip the data
+            unzipped = decompress_gz(tmp_destination)
+
+            # Read the flat binary grid and convert it to mm/hr
+            data = np.fromfile(unzipped, dtype=np.dtype(self.dtype))
+            data = data.reshape(self.rows_cols)
+            data_in_mm_hr = data.astype(np.float32) / self.scale_factor
+            data_in_mm_hr[data == self.nodata] = np.nan
+
+            # Create latitude and longitude arrays
+            lons = np.arange(self.lat_lon_box[2], self.lat_lon_box[3] + self.lat_lon_steps[1]/2, self.lat_lon_steps[1])
+            lats = np.arange(self.lat_lon_box[0], self.lat_lon_box[1] + self.lat_lon_steps[0]/2, self.lat_lon_steps[0])
+
+            # Create the xarray DataArray compliant with the rio extension
+            data_array = xr.DataArray(
+                data_in_mm_hr,
+                dims=["y", "x"],
+                coords={
+                    "y": lats,
+                    "x": lons
+                },
+                name="precipitation_rate",
+                attrs={"units": "mm/hr"}
+            )
+
+            # Set the CRS to WGS84 (EPSG:4326) and the nodata value (np.nan after the conversion above)
+            data_array = data_array.rio.write_crs("EPSG:4326")
+            data_array = data_array.rio.write_nodata(np.nan)
+
+            # crop the data
+            cropped = crop_to_bb(data_array, space_bounds)
+
+            yield cropped, {}
+
+    def format_url(self, prelim=False, **kwargs):
+        """
+        Format the url for the download
+        """
+        if prelim:
+            # NOTE: url_prelim_blank must be defined before preliminary data can be requested
+            url = self.url_prelim_blank.format(**kwargs)
+        else:
+            url = self.url_blank.format(**kwargs)
+        return url
diff --git a/door/data_sources/persiann/__init__.py b/door/data_sources/persiann/__init__.py
new file mode 100644
index 0000000..7d9a8fd
--- /dev/null
+++ b/door/data_sources/persiann/__init__.py
@@ -0,0 +1 @@
+from .chrs_downloader import CHRSDownloader
\ No newline at end of file
diff --git a/door/data_sources/persiann/chrs_downloader.py b/door/data_sources/persiann/chrs_downloader.py
new file mode 100644
index 0000000..ddc27b4
--- /dev/null
+++ b/door/data_sources/persiann/chrs_downloader.py
@@ -0,0 +1,109 @@
+import os
+from typing import Generator
+import numpy as np
+import xarray as xr
+import datetime as dt
+import requests
+import rioxarray
+
+from ...tools import timestepping as ts
+from ...tools.timestepping.timestep import TimeStep
+from ...tools.timestepping.fixed_num_timestep import FixedNTimeStep
+from ...base_downloaders import URLDownloader
+
+from ...utils.space import BoundingBox, crop_to_bb
+from ...utils.io import decompress_gz
+
+class CHRSDownloader(URLDownloader):
+    source = "CHRS"
+    name = "CHRSDownloader"
+
+    default_options = {
+    }
+
+    home = "https://persiann.eng.uci.edu/CHRSdata/"
+    available_products: dict = {
+        "PDIRNow-1hourly": {
+            "ts_per_year": 8760,
+            "url": home + 'PDIRNow/PDIRNow1hourly/{timestep.start:%Y}/pdirnow1h{timestep.start:%y%m%d%H}.bin.gz',
+            "nodata": -9999,
+            "rows_cols": (3000, 9000),
+            "lat_lon_box": [59.98, -59.98, .02, 359.98],
+            "lat_lon_steps": [-0.04, 0.04],
+            "scale_factor": 100,
+            "dtype": '<i2',  # little-endian 16-bit integers
+        },
+    }
+
+    def __init__(self, product: str) -> None:
+        self.set_product(product)
+        super().__init__(self.url_blank, protocol='http')
+
+    def set_product(self, product: str) -> None:
+        self.product = product
+        if product not in self.available_products:
+            raise ValueError(f'Product {product} not available. Choose one of {self.available_products.keys()}')
+        self.ts_per_year = self.available_products[product]["ts_per_year"]
+        self.url_blank = self.available_products[product]["url"]
+        self.nodata = self.available_products[product]["nodata"]
+        self.rows_cols = self.available_products[product]["rows_cols"]
+        self.lat_lon_box = self.available_products[product]["lat_lon_box"]
+        self.lat_lon_steps = self.available_products[product]["lat_lon_steps"]
+        self.dtype = self.available_products[product]["dtype"]
+        self.scale_factor = self.available_products[product]["scale_factor"]
+
+    def _get_data_ts(self,
+                     timestep: TimeStep,
+                     space_bounds: BoundingBox,
+                     tmp_path: str) -> Generator[tuple[xr.DataArray, dict], None, None]:
+
+        ts_end = timestep.end
+        tmp_filename_raw = f'temp_{self.product}{ts_end:%Y%m%d}'
+        tmp_filename = f'{tmp_filename_raw}.bin.gz'
+        tmp_destination = os.path.join(tmp_path, tmp_filename)
+        self.log.info(' --> Download ' + str(timestep))
+        success = self.download(tmp_destination, min_size=200, missing_action='ignore', timestep=timestep)
+        if success:
+            # Unzip the data
+            unzipped = decompress_gz(tmp_destination)
+
+            # Read the flat binary grid and convert it to mm/hr
+            data = np.fromfile(unzipped, dtype=np.dtype(self.dtype))
+            data = data.reshape(self.rows_cols)
+            data_in_mm_hr = data.astype(np.float32) / self.scale_factor
+            data_in_mm_hr[data == self.nodata] = np.nan
+
+            # Create latitude and longitude arrays
+            lons = np.arange(self.lat_lon_box[2], self.lat_lon_box[3] + self.lat_lon_steps[1]/2, self.lat_lon_steps[1])
+            lats = np.arange(self.lat_lon_box[0], self.lat_lon_box[1] + self.lat_lon_steps[0]/2, self.lat_lon_steps[0])
+
+            # Create the xarray DataArray compliant with the rio extension
+            data_array = xr.DataArray(
+                data_in_mm_hr,
+                dims=["y", "x"],
+                coords={
+                    "y": lats,
+                    "x": lons
+                },
+                name="precipitation_rate",
+                attrs={"units": "mm/hr"}
+            )
+
+            # Set the CRS to WGS84 (EPSG:4326) and the nodata value (np.nan after the conversion above)
+            data_array = data_array.rio.write_crs("EPSG:4326")
+            data_array = data_array.rio.write_nodata(np.nan)
+
+            # crop the data
+            cropped = crop_to_bb(data_array, space_bounds)
+
+            yield cropped, {}
+
+    def format_url(self, **kwargs):
+        """
+        Format the url for the download
+        """
+        return self.url_blank.format(**kwargs)
diff --git a/door/utils/io.py b/door/utils/io.py
index ab0c3b3..f2c2bc6 100644
--- a/door/utils/io.py
+++ b/door/utils/io.py
@@ -67,6 +67,8 @@ def download_http(url, destination, auth=None):
     if the argument auth is passed as (user, pass), it will be used for authentication
     """
     r = requests.get(url, auth)
+    if r.status_code != 200:
+        raise FileNotFoundError(r.text)
     os.makedirs(os.path.dirname(destination), exist_ok=True)
     with open(destination, 'wb') as f:
         f.write(r.content)
diff --git a/setup.py b/setup.py
index ea67829..a779c1e 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 setup(
     name='door',
-    version='2.2.0',
+    version='2.2.3',
     packages=find_packages(),
     description='A package for operational retrieval of raster data from different sources',
     author='Luca Trotter',
diff --git a/workflow_examples/clms_example.py b/workflow_examples/clms_example.py
index a2fd670..d172a5c 100644
--- a/workflow_examples/clms_example.py
+++ b/workflow_examples/clms_example.py
@@ -15,9 +15,9 @@
 test_downloader = CLMSDownloader(product='SWI')
 
 test_downloader.get_data(time_range, space_ref,
-                         destination={'path' : HOME, 'filename':'SWI-{layer}_%Y%m%d.tif'},
+                         destination={'path' : HOME, 'filename':'SWI-{variable}_%Y%m%d.tif'},
                          options={
-                            'layers': ["001","020","060"],
+                            'variables': ["001","020","060"],
                             'crop_to_bounds': True,
                             'ts_per_year': 36
                          })
\ No newline at end of file
diff --git a/workflow_examples/pdir_example.py b/workflow_examples/pdir_example.py
new file mode 100644
index 0000000..a527eab
--- /dev/null
+++ b/workflow_examples/pdir_example.py
@@ -0,0 +1,21 @@
+from door.data_sources import CHRSDownloader
+import pandas as pd
+import datetime as dt
+import os
+
+from door.utils.space import BoundingBox
+from door.utils import log
+import door.tools.timestepping as ts
+
+HOME = '/home/andrea/Desktop/Working_dir/door/pdir'
+log_file = os.path.join(HOME, 'log.txt')
+
+log.set_logging(log_file, console = True)
+
+time_range = ts.TimeRange(start='2022-05-23', end='2022-12-31')
+space_ref = BoundingBox(-6,9,3.6,16)
+
+test_downloader = CHRSDownloader(product='PDIRNow-1hourly')
+
+test_downloader.get_data(time_range, space_ref,
+                         destination={'path' : os.path.join(HOME, "%Y/%m/%d"), 'filename':'BFA_PDIR-now1h_%Y%m%d%H.tif'})
\ No newline at end of file
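Note on the download_http change in door/utils/io.py: a non-200 response now raises instead of silently writing the error body to disk. A minimal sketch of the behaviour callers should expect (the URL and destination path are placeholders, not real files):

from door.utils.io import download_http

# hypothetical URL and destination, for illustration only
url = 'https://globalland.vito.be/download/does-not-exist.tiff'
try:
    download_http(url, '/tmp/missing.tiff')
except FileNotFoundError as err:
    # any non-200 response now surfaces here instead of producing a corrupt file
    print(f'remote file not available: {err}')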
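And a sketch of how the renamed 'variables' option and the new get_last_published_date() are expected to combine, mirroring workflow_examples/clms_example.py. HOME and the time range are placeholders, and it assumes get_data applies the options via set_options as shown in the diff, so the last-published query is made after get_data has set the variables:

import door.tools.timestepping as ts
from door.data_sources import CLMSDownloader
from door.utils.space import BoundingBox

HOME = '/tmp/door/swi'  # placeholder output directory

downloader = CLMSDownloader(product='SWI')
downloader.get_data(ts.TimeRange(start='2023-01-01', end='2023-01-31'),
                    BoundingBox(-6, 9, 3.6, 16),
                    destination={'path': HOME, 'filename': 'SWI-{variable}_%Y%m%d.tif'},
                    options={'variables': ['020'], 'crop_to_bounds': True, 'ts_per_year': 36})

# once the options are set, the latest published date can be queried
print(downloader.get_last_published_date())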