Skip to content

Commit

Permalink
Merge pull request #107 from c-hydro/tools-v1.3.4
Browse files Browse the repository at this point in the history
chore: update tools
  • Loading branch information
ltrotter authored Oct 14, 2024
2 parents e9487a6 + b12f4c7 commit 6869529
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 23 deletions.
11 changes: 7 additions & 4 deletions door/base_downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,19 +71,22 @@ def get_source(cls, source: Optional[str] = None):
elif hasattr(cls, 'source'):
return cls.source

def set_bounds(self, bounds: None|BoundingBox|list[float]|tuple[float]) -> None:
# NOTE(review): this listing is a rendered diff without +/- markers or indentation.
# The signature above appears to be the pre-change line and the one below its
# replacement (it only adds Dataset to the accepted types) — confirm against the
# repository before copying any of this code.
def set_bounds(self, bounds: None|BoundingBox|list[float]|tuple[float]|Dataset) -> None:
"""
Set the bounds of the data to download.

The value is normalised and stored on ``self.bounds``. Passing ``None``
returns immediately and leaves ``self.bounds`` untouched. If the fallback
conversion fails, ``ValueError('Invalid bounds')`` is raised.
"""
# None means "no bounds supplied": return without assigning self.bounds.
if bounds is None:
return
# A list/tuple is unpacked positionally into the BoundingBox constructor.
elif isinstance(bounds, (list, tuple)):
_bounds = BoundingBox(*bounds)
# NOTE(review): one of the two branches below (Dataset / str) may be a line removed
# by this commit; the diff markers are lost, so verify which ones are still live.
elif isinstance(bounds, Dataset):
_bounds = BoundingBox.from_dataset(bounds)
# A string is handed to BoundingBox.from_file.
elif isinstance(bounds, str):
_bounds = BoundingBox.from_file(bounds)

# Fallback for every other type: try BoundingBox.from_dataset on the object.
# There is no dedicated isinstance(bounds, BoundingBox) branch in this listing, so a
# BoundingBox instance also ends up here.
else:
try:
_bounds = BoundingBox.from_dataset(bounds)
# NOTE(review): a bare except also catches KeyboardInterrupt/SystemExit and hides the
# original error; consider `except Exception as err:` with `raise ... from err`.
except:
raise ValueError('Invalid bounds')

# Only reached when one of the branches above produced a value.
self.bounds = _bounds

def set_destination(self, destination: Dataset|dict|str|None) -> None:
Expand Down
39 changes: 21 additions & 18 deletions door/data_sources/earthdata/cmr_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class CMRDownloader(DOORDownloader):
'make_mosaic': True,
'crop_to_bounds': True,
'keep_tiles_naming': False,
'selected_tiles' : None
}

file_ext = ['.hdf', '.h5']
Expand Down Expand Up @@ -167,7 +168,7 @@ def build_cmr_query(self, time_start: datetime, time_end: datetime, bounding_box

cmr_base_url = ('{0}provider={1}'
'&sort_key=start_date&sort_key=producer_granule_id'
'&scroll=true&page_size={2}'.format(self.cmr_url, self.provider, self.cmr_page_size))
'&page_size={2}'.format(self.cmr_url, self.provider, self.cmr_page_size))

product_query = self.fomat_product(self.product_id)
version_query = self.format_version(self.version)
Expand All @@ -189,33 +190,31 @@ def cmr_search(self, time: ts.TimeRange, space_bounds: BoundingBox) -> dict:
time_end = time.end

cmr_query_url = self.build_cmr_query(time_start, time_end, bounding_box)
cmr_scroll_id = None
cmr_searchafter = None
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
try:
urls = []
while True:
req = Request(cmr_query_url)
if cmr_scroll_id:
req.add_header('cmr-scroll-id', cmr_scroll_id)
if cmr_searchafter:
req.add_header('CMR-Search-After', cmr_searchafter)
response = urlopen(req, context=ctx)
if not cmr_scroll_id:
# Python 2 and 3 have different case for the http headers
headers = {k.lower(): v for k, v in dict(response.info()).items()}
cmr_scroll_id = headers['cmr-scroll-id']
hits = int(headers['cmr-hits'])

# the header 'cmr-search-after' is used to get the next page of results
# once we hit a page with no 'cmr-search-after' header, we have all the results
headers = {k.lower(): v for k, v in dict(response.info()).items()}
cmr_searchafter = headers.get('cmr-search-after', None)
if not cmr_searchafter:
break

search_page = response.read()
search_page = json.loads(search_page.decode('utf-8'))
url_scroll_results = cmr_filter_urls(search_page, extensions=self.file_ext)
if not url_scroll_results:
break
if hits > self.cmr_page_size:
sys.stdout.flush()
urls += url_scroll_results
valid_results = cmr_filter_urls(search_page, extensions=self.file_ext, selected_tiles=self.selected_tiles)

urls += valid_results

if hits > self.cmr_page_size:
print()
return urls
except KeyboardInterrupt:
quit()
Expand Down Expand Up @@ -273,7 +272,7 @@ def format_filename_filter(time: datetime) -> str:
filename_filter = time.strftime('*A%Y%j*')
return f'&producer_granule_id[]={filename_filter}&options[producer_granule_id][pattern]=true'

def cmr_filter_urls(search_results, extensions=['.hdf', '.h5']) -> list[str]:
def cmr_filter_urls(search_results, extensions=['.hdf', '.h5'], selected_tiles = None) -> list[str]:
"""Select only the desired data files from CMR response."""
if 'feed' not in search_results or 'entry' not in search_results['feed']:
return []
Expand Down Expand Up @@ -312,6 +311,10 @@ def cmr_filter_urls(search_results, extensions=['.hdf', '.h5']) -> list[str]:
continue
unique_filenames.add(filename)

if selected_tiles is not None:
if not any(tile in filename for tile in selected_tiles):
continue

urls.append(link['href'])

return urls
2 changes: 1 addition & 1 deletion door/tools

0 comments on commit 6869529

Please sign in to comment.