Skip to content

Commit

Permalink
Merge pull request #117 from c-hydro/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
ltrotter authored Nov 14, 2024
2 parents 5bb993e + a90dabc commit e075d7a
Show file tree
Hide file tree
Showing 13 changed files with 342 additions and 41 deletions.
1 change: 1 addition & 0 deletions =
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Requirement already satisfied: xlrd in /home/andrea/System/libs_conda/miniconda3/envs/door/lib/python3.12/site-packages (2.0.1)
3 changes: 2 additions & 1 deletion door/data_sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
from .cds import *
from .hsaf import *
from .drops2 import *
from .clms import *
from .clms import *
from .persiann import *
4 changes: 2 additions & 2 deletions door/data_sources/chirps/chirps_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ def get_last_published_ts(self, prelim = None, product = None, **kwargs) -> ts.T
response = requests.head(current_url)

# if the request is successful, the last published timestep is the current timestep
if response.status_code == 200:
if response.status_code is requests.codes.ok:
return current_timestep

# if the request is not successful, move to the previous timestep
current_timestep -= 1

Expand Down
122 changes: 88 additions & 34 deletions door/data_sources/clms/clms_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import xarray as xr
import tempfile
from typing import Optional, Iterable
import datetime as dt

from ...utils.space import BoundingBox, crop_to_bb
from ...utils.io import download_http, handle_missing
from ...tools import timestepping as ts
from ...tools.timestepping.timestep import TimeStep
from ...tools.timestepping.fixed_num_timestep import FixedNTimeStep
from ...tools.data import Dataset
from ...base_downloaders import URLDownloader

Expand All @@ -19,21 +21,22 @@ class CLMSDownloader(URLDownloader):
clms_url = "https://globalland.vito.be/download/"

default_options = {
'layers': None,
'variables': '020',
'crop_to_bounds': True,
'ts_per_year': 36
}

available_products = {
'swi': {
'versions': ["3.1.1", "3.2.1"],
'url': clms_url + 'geotiff/soil_water_index/swi_12.5km_v3_{ts_str}daily/{timestep.start:%Y}/{timestep.start:%Y%m%d}/c_gls_SWI{ts_str}-SWI{layer}_{timestep.start:%Y%m%d}1200_GLOBE_ASCAT_V{version}.tiff',
'url': clms_url + 'geotiff/soil_water_index/swi_12.5km_v3_{ts_str}daily/{timestep.start:%Y}/{timestep.start:%Y%m%d}/c_gls_SWI{ts_str}-SWI{var}_{timestep.start:%Y%m%d}1200_GLOBE_ASCAT_V{version}.tiff',
'nodata': 255,
'scale_factor': 0.5,
'available_layers': ["001", "005", "010", "020", "040", "060", "100"],
'scale_factor': 0.5
}
}

available_variables = ["001", "005", "010", "020", "040", "060", "100"]

def __init__(self, product: str) -> None:
self.set_product(product)

Expand All @@ -44,14 +47,67 @@ def set_product(self, product: str) -> None:
self.nodata = self.available_products[self.product]["nodata"]
self.scale_factor = self.available_products[self.product]["scale_factor"]
self.versions = self.available_products[self.product]["versions"]
self.available_layers = self.available_products[self.product]["available_layers"]
self.url_blank = self.available_products[self.product]["url"]

def set_variables(self, variables: list) -> None:
    """
    Validate the requested SWI depth variables and store them on the instance.

    Parameters:
        variables: list of variable codes (e.g. "020"); matched
            case-insensitively against self.available_variables.

    Raises:
        ValueError: if a requested variable is not available, or if the
            resulting selection is empty.
    """
    self.variables = []
    for var in variables:
        this_var = var.lower()
        if this_var not in self.available_variables:
            # BUG FIX: the original built this message but never raised it,
            # so invalid variables were silently dropped.
            raise ValueError(f'Variable {var} not available. Choose one of {self.available_variables}')
        else:
            self.variables.append(this_var)
    if len(self.variables) == 0:
        raise ValueError('No valid variables selected')

def get_last_published_ts(self, **kwargs) -> ts.TimeRange:
    """
    Get the last published timestep for the dataset.

    Walks backwards from the current date, issuing an HTTP HEAD request for
    each product version, and returns the first timestep for which any
    version responds 200 OK.

    NOTE(review): despite the ts.TimeRange annotation this returns a
    timestep object — confirm against the timestepping API.
    """

    ts_per_year = self.ts_per_year

    # Set ts_str based on the ts_per_year ('10' marks the 10-daily product
    # in the CLMS URL scheme; daily products embed no marker).
    if ts_per_year == 36:
        ts_str = '10'
        # renamed local (was `TimeStep`) to avoid shadowing the imported
        # TimeStep class
        timestep_cls = FixedNTimeStep.get_subclass(ts_per_year)
    elif ts_per_year == 365:
        ts_str = ''
        timestep_cls = ts.Day
    else:
        raise ValueError(f"ts_per_year {self.ts_per_year} not supported")

    current_timestep = timestep_cls.from_date(dt.datetime.now())

    # Try every version at each timestep, stepping back in time until one
    # URL exists.
    while True:
        for v in self.versions:
            current_url_v = self.url_blank.format(
                ts_str=ts_str,
                timestep=current_timestep,
                var=self.variables[0],
                version=v
            )

            # send a HEAD request to the url (no body download)
            response = requests.head(current_url_v)

            # BUG FIX: was `is requests.codes.ok` — identity comparison on
            # ints only works by CPython small-int caching accident; use ==.
            if response.status_code == requests.codes.ok:
                return current_timestep

        # if no version is available, move to the previous timestep
        current_timestep -= 1

def get_last_published_date(self, **kwargs) -> dt.datetime:
    """Return the end datetime of the most recently published timestep."""
    last_timestep = self.get_last_published_ts(**kwargs)
    return last_timestep.end

def _get_data_ts(self,
time_range: TimeStep,
space_bounds: BoundingBox,
tmp_path: str,
layer: str,
variable: str,
missing_action: str = 'warning',
**kwargs) -> Iterable[tuple[xr.DataArray, dict]]:
''' Get the data for a specific timestep. '''
Expand All @@ -60,39 +116,45 @@ def _get_data_ts(self,
url_blank = self.url_blank.format(
ts_str=self.ts_str,
timestep=time_range,
layer=layer,
var=variable,
version="{version}"
)

# Download the file
ts_end = time_range.end
tmp_filename_raw = f'temp_{self.product}{layer}_{ts_end:%Y%m%d}.tif'
tmp_filename_raw = f'temp_{self.product}{variable}_{ts_end:%Y%m%d}.tif'
tmp_destination = os.path.join(tmp_path, tmp_filename_raw)

# try to download the file in both versions
success = False
for version in self.versions:
url = url_blank.format(version=version)
try:
download_http(url, tmp_destination)
url_v = url_blank.format(version=version)

# Crop the data
cropped = crop_to_bb(tmp_destination, space_bounds)
response = requests.head(url_v)

# Change the nodata value to np.nan and return the data
cropped = cropped.where(~np.isclose(cropped, self.nodata, equal_nan=True), np.nan)
cropped.rio.no_data = np.nan
if response.status_code is requests.codes.ok:
url = url_v
success = True
break

# Apply the scale factor
cropped *= self.scale_factor
if success:
download_http(url, tmp_destination)

yield cropped, {'layer': layer}
break
# Crop the data
cropped = crop_to_bb(tmp_destination, space_bounds)

# Change the nodata value to np.nan and return the data
cropped = cropped.where(~np.isclose(cropped, self.nodata, equal_nan=True), np.nan)
cropped.rio.no_data = np.nan

# Apply the scale factor
cropped *= self.scale_factor

except Exception as e:
continue
yield cropped, {'variable': variable}

# If the loop ends without breaking, the data is missing
handle_missing(missing_action, {'timestep': time_range, 'layer': layer})
else:
# If the loop ends without breaking, the data is missing
handle_missing(missing_action, {'timestep': time_range, 'variable': variable})

def get_data(self,
time_range: ts.TimeRange,
Expand All @@ -106,14 +168,6 @@ def get_data(self,
if options is not None:
self.set_options(options)

# Check if the layers are available
if self.layers is None:
self.layers = self.available_layers
else:
for layer in self.layers:
if layer not in self.available_layers:
raise ValueError(f'Layer {layer} not available. Choose one of {self.available_layers}')

# Set ts_str based on the ts_per_year
if self.ts_per_year == 36:
self.ts_str = '10'
Expand Down Expand Up @@ -142,8 +196,8 @@ def get_data(self,

for timestep in timesteps:
with tempfile.TemporaryDirectory() as tmp_path:
for layer in self.layers:
data_struct = self._get_data_ts(timestep, space_bounds, tmp_path, layer)
for variable in self.variables:
data_struct = self._get_data_ts(timestep, space_bounds, tmp_path, variable)
if not data_struct:
self.log.warning(f'No data found for timestep {timestep}')
continue
Expand Down
2 changes: 1 addition & 1 deletion door/data_sources/hsaf/hsaf_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def get_last_published_ts(self, product = None, **kwargs) -> ts.TimeRange:
response = requests.head(current_url)

# if the request is successful, the last published timestep is the current timestep
if response.status_code == 200:
if response.status_code is requests.codes.ok:
return current_timestep

# if the request is not successful, move to the previous timestep
Expand Down
1 change: 1 addition & 0 deletions door/data_sources/jaxa/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .gsmap_downloader import GSMAPDownloader
111 changes: 111 additions & 0 deletions door/data_sources/jaxa/gsmap_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import os
from typing import Generator
import numpy as np
import xarray as xr
import datetime as dt
import requests
import rioxarray

from ...tools import timestepping as ts
from ...tools.timestepping.timestep import TimeStep
from ...tools.timestepping.fixed_num_timestep import FixedNTimeStep
from ...base_downloaders import URLDownloader

from ...utils.space import BoundingBox, crop_to_bb
from ...utils.io import decompress_gz

class GSMAPDownloader(URLDownloader):
    # BUG FIX: the class was named CHRSDownloader (copy-paste from the
    # persiann module, which defines its own CHRSDownloader). The package
    # __init__ does `from .gsmap_downloader import GSMAPDownloader`, which
    # would raise ImportError; renamed to match the import and the `name`
    # attribute below.
    source = "JAXA"
    name = "GSMAPDownloader"

    default_options = {
        'get_prelim': True, # if True, will also download preliminary data if available
    }

    # NOTE(review): this is an sftp URL but __init__ passes protocol='http',
    # and the product path below still references the CHRS PDIRNow product —
    # verify against the intended GSMaP endpoint.
    home = "sftp://[email protected]/realtime/hourly_G/"
    prelim_home = home + "prelim/"
    available_products: dict = {
        "PDIRNow-1hourly": {
            "ts_per_year": 8760,
            "url": home + 'PDIRNow/PDIRNow1hourly/{timestep.start:%Y}/pdirnow1h{timestep.start:%y%m%d%H}.bin.gz',
            "nodata": -9999,
            "rows_cols": (3000, 9000),
            # [lat_max, lat_min, lon_min, lon_max] — presumably degrees;
            # TODO confirm against the product grid specification.
            "lat_lon_box": [59.98, -59.98, .02, 359.98],
            "lat_lon_steps": [-0.04, 0.04],
            "scale_factor": 100,
            "dtype": '<i2' # little-endian int16
        }
    }

    def __init__(self, product: str) -> None:
        self.set_product(product)
        super().__init__(self.url_blank, protocol='http')

    def set_product(self, product: str) -> None:
        """Select a product and cache its grid/format metadata on the instance.

        Raises:
            ValueError: if the product is not in available_products.
        """
        self.product = product
        if product not in self.available_products:
            raise ValueError(f'Product {product} not available. Choose one of {self.available_products.keys()}')
        self.ts_per_year = self.available_products[product]["ts_per_year"]
        self.url_blank = self.available_products[product]["url"]
        self.nodata = self.available_products[product]["nodata"]
        self.rows_cols = self.available_products[product]["rows_cols"]
        self.lat_lon_box = self.available_products[product]["lat_lon_box"]
        self.lat_lon_steps = self.available_products[product]["lat_lon_steps"]
        self.dtype = self.available_products[product]["dtype"]
        self.scale_factor = self.available_products[product]["scale_factor"]
        # BUG FIX: format_url(prelim=True) referenced self.url_prelim_blank,
        # which was never set anywhere (AttributeError). Derive it from the
        # regular URL by switching to the preliminary-data root.
        self.url_prelim_blank = self.url_blank.replace(self.home, self.prelim_home)

    def _get_data_ts(self,
                     timestep: TimeStep,
                     space_bounds: BoundingBox,
                     tmp_path: str) -> Generator[tuple[xr.DataArray, dict], None, None]:
        """Download one timestep, decode the flat binary grid, and yield it
        cropped to the bounding box (as a georeferenced DataArray)."""

        ts_end = timestep.end
        tmp_filename_raw = f'temp_{self.product}{ts_end:%Y%m%d}'
        tmp_filename = f'{tmp_filename_raw}.bin.gz'
        tmp_destination = os.path.join(tmp_path, tmp_filename)
        print(" --> Download " + str(timestep))
        success = self.download(tmp_destination, min_size=200, missing_action='ignore', timestep=timestep)
        if success:
            # Unzip the data
            unzipped = decompress_gz(tmp_destination)

            # Decode the raw binary grid: scaled integers, nodata -> NaN
            data = np.fromfile(unzipped, dtype=np.dtype(self.dtype))
            data = data.reshape(self.rows_cols)
            data_in_mm_hr = data.astype(np.float32) / self.scale_factor
            data_in_mm_hr[data == self.nodata] = np.nan

            # Create latitude and longitude arrays (lat step is negative:
            # rows run north to south)
            lons = np.arange(self.lat_lon_box[2], self.lat_lon_box[3] + self.lat_lon_steps[1]/2, self.lat_lon_steps[1])
            lats = np.arange(self.lat_lon_box[0], self.lat_lon_box[1] + self.lat_lon_steps[0]/2, self.lat_lon_steps[0])

            # Create the xarray DataArray compliant with the rio extension
            data_array = xr.DataArray(
                data_in_mm_hr,
                dims=["y", "x"],
                coords={
                    "y": lats,
                    "x": lons
                },
                name="precipitation_rate",
                attrs={
                    "units": "mm/hr"}
            )
            # Set the CRS to WGS84 (EPSG:4326) and geotransform attributes
            data_array = data_array.rio.write_crs("EPSG:4326")
            # BUG FIX: nodata pixels were already replaced with NaN above, so
            # advertise NaN (not the raw -9999 sentinel) as the nodata value.
            data_array = data_array.rio.write_nodata(np.nan)

            # crop the data
            cropped = crop_to_bb(data_array, space_bounds)

            yield cropped, {}

    def format_url(self, prelim=False, **kwargs):
        """
        Format the url for the download; prelim=True targets the
        preliminary-data root instead of the consolidated one.
        """
        if prelim:
            url = self.url_prelim_blank.format(**kwargs)
        else:
            url = self.url_blank.format(**kwargs)
        return url

1 change: 1 addition & 0 deletions door/data_sources/persiann/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .chrs_downloader import CHRSDownloader
Loading

0 comments on commit e075d7a

Please sign in to comment.