Merge pull request #107 from c-hydro/tools-v1.3.4

chore: update tools
c-hydro · Oct 14, 2024 · 6869529 · 6869529
2 parents e9487a6 + b12f4c7
commit 6869529
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 23 deletions.
diff --git a/door/base_downloaders.py b/door/base_downloaders.py
@@ -71,19 +71,22 @@ def get_source(cls, source: Optional[str] = None):
         elif hasattr(cls, 'source'):
             return cls.source
 
-    def set_bounds(self, bounds: None|BoundingBox|list[float]|tuple[float]) -> None:
+    def set_bounds(self, bounds: None|BoundingBox|list[float]|tuple[float]|Dataset) -> None:
         """
         Set the bounds of the data to download.
         """
         if bounds is None:
             return
         elif isinstance(bounds, (list, tuple)):
             _bounds = BoundingBox(*bounds)
-        elif isinstance(bounds, Dataset):
-            _bounds = BoundingBox.from_dataset(bounds)
         elif isinstance(bounds, str):
             _bounds = BoundingBox.from_file(bounds)
-
+        else:
+            try:
+                _bounds = BoundingBox.from_dataset(bounds)
+            except:
+                raise ValueError('Invalid bounds')
+
         self.bounds = _bounds
 
     def set_destination(self, destination: Dataset|dict|str|None) -> None:

diff --git a/door/data_sources/earthdata/cmr_downloader.py b/door/data_sources/earthdata/cmr_downloader.py
@@ -37,6 +37,7 @@ class CMRDownloader(DOORDownloader):
         'make_mosaic': True,
         'crop_to_bounds': True,
         'keep_tiles_naming': False,
+        'selected_tiles' : None
     }
 
     file_ext = ['.hdf', '.h5']
@@ -167,7 +168,7 @@ def build_cmr_query(self, time_start: datetime, time_end: datetime, bounding_box
 
         cmr_base_url = ('{0}provider={1}'
                         '&sort_key=start_date&sort_key=producer_granule_id'
-                        '&scroll=true&page_size={2}'.format(self.cmr_url, self.provider, self.cmr_page_size))
+                        '&page_size={2}'.format(self.cmr_url, self.provider, self.cmr_page_size))
 
         product_query = self.fomat_product(self.product_id)
         version_query = self.format_version(self.version)
@@ -189,33 +190,31 @@ def cmr_search(self, time: ts.TimeRange, space_bounds: BoundingBox) -> dict:
         time_end = time.end
 
         cmr_query_url = self.build_cmr_query(time_start, time_end, bounding_box)
-        cmr_scroll_id = None
+        cmr_searchafter = None
         ctx = ssl.create_default_context()
         ctx.check_hostname = False
         ctx.verify_mode = ssl.CERT_NONE
         try:
             urls = []
             while True:
                 req = Request(cmr_query_url)
-                if cmr_scroll_id:
-                    req.add_header('cmr-scroll-id', cmr_scroll_id)
+                if cmr_searchafter:
+                    req.add_header('CMR-Search-After', cmr_searchafter)
                 response = urlopen(req, context=ctx)
-                if not cmr_scroll_id:
-                    # Python 2 and 3 have different case for the http headers
-                    headers = {k.lower(): v for k, v in dict(response.info()).items()}
-                    cmr_scroll_id = headers['cmr-scroll-id']
-                    hits = int(headers['cmr-hits'])
+
+                # the header 'cmr-search-after' is used to get the next page of results
+                # once we hit a page with no 'cmr-search-after' header, we have all the results
+                headers = {k.lower(): v for k, v in dict(response.info()).items()}
+                cmr_searchafter = headers.get('cmr-search-after', None)
+                if not cmr_searchafter:
+                    break
+
                 search_page = response.read()
                 search_page = json.loads(search_page.decode('utf-8'))
-                url_scroll_results = cmr_filter_urls(search_page, extensions=self.file_ext)
-                if not url_scroll_results:
-                    break
-                if hits > self.cmr_page_size:
-                    sys.stdout.flush()
-                urls += url_scroll_results
+                valid_results = cmr_filter_urls(search_page, extensions=self.file_ext, selected_tiles=self.selected_tiles)
+
+                urls += valid_results
 
-            if hits > self.cmr_page_size:
-                print()
             return urls
         except KeyboardInterrupt:
             quit()
@@ -273,7 +272,7 @@ def format_filename_filter(time: datetime) -> str:
         filename_filter = time.strftime('*A%Y%j*')
         return f'&producer_granule_id[]={filename_filter}&options[producer_granule_id][pattern]=true'
 
-def cmr_filter_urls(search_results, extensions=['.hdf', '.h5']) -> list[str]:
+def cmr_filter_urls(search_results, extensions=['.hdf', '.h5'], selected_tiles = None) -> list[str]:
     """Select only the desired data files from CMR response."""
     if 'feed' not in search_results or 'entry' not in search_results['feed']:
         return []
@@ -312,6 +311,10 @@ def cmr_filter_urls(search_results, extensions=['.hdf', '.h5']) -> list[str]:
             continue
         unique_filenames.add(filename)
 
+        if selected_tiles is not None:
+            if not any(tile in filename for tile in selected_tiles):
+                continue
+
         urls.append(link['href'])
 
     return urls
diff --git a/door/tools b/door/tools
+1 −1		config/options.py
+1 −1		data/__init__.py
+98 −81		data/dataset.py
+6 −0		data/io_utils.py
+295 −0		data/remote_dataset.py
+0 −126		data/s3_dataset.py
+1 −0		timestepping/fixed_doy_timestep.py