Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ncattrs function to get nc attributes without the need for siphon. #35

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

## [Unreleased](https://github.com/crim-ca/stac-populator) (latest)

<!-- insert list items of new changes here -->
* New function `ds_attrs` to get attributes and access URLs of netCDF files hosted on a THREDDS server.

## [0.4.0](https://github.com/crim-ca/stac-populator/tree/0.4.0) (2023-11-27)

Expand Down
169 changes: 158 additions & 11 deletions STACpopulator/stac_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,20 @@
import logging
import os
import re
import sys
import requests
import xncml
import xmltodict
import urllib.parse
from pathlib import Path
from typing import Any, Literal, MutableMapping, Union


import numpy as np
import pystac
import yaml
from colorlog import ColoredFormatter

from enum import Enum
from STACpopulator.models import STACItem


Expand Down Expand Up @@ -79,6 +86,113 @@ def collection2literal(collection, property="label"):
return Literal[terms]


def thredds_catalog_attrs(url: str, timeout: float = 30.0) -> dict:
    """Return attributes from the catalog.xml THREDDS server response.

    Parameters
    ----------
    url : str
        Link to the THREDDS catalog URL.
    timeout : float
        Number of seconds to wait for the server before giving up.

    Returns
    -------
    dict
        Nested mapping produced by ``xmltodict.parse``. The THREDDS InvCatalog
        namespace (both the http and https spellings) is mapped to ``None`` so
        it is dropped from the element names.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail early on HTTP errors rather than parsing an error page as a catalog.
    response.raise_for_status()

    return xmltodict.parse(
        response.text,
        process_namespaces=True,
        namespaces={
            "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0": None,
            "https://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0": None,
        },
    )


def catalog_url(url: str) -> "tuple[str, str]":
    """Given a THREDDS link to a netCDF file, return a link to its catalog and the file name.

    Two URL shapes are handled:

    - a catalog item reference, ``.../catalog.html?dataset=<path>/<file>``: the
      file name is the last component of the ``dataset`` query value, and a
      trailing ``catalog.html`` in the path becomes ``catalog.xml``;
    - a direct link to the file, ``.../<file>``: the last path component is the
      file name, and it is replaced by ``catalog.xml``.

    Parameters
    ----------
    url : str
        THREDDS link to a netCDF file.

    Returns
    -------
    tuple of (str, str)
        The catalog URL (the query string is kept, params and fragment are
        dropped) and the netCDF file name.

    Raises
    ------
    KeyError
        If the URL has a query string without a ``dataset`` parameter.
    """
    scheme, netloc, path, _params, query, _frag = urllib.parse.urlparse(url)

    if query:
        # URL is a reference to a catalog item
        dataset = urllib.parse.parse_qs(query)["dataset"][0]
        nc = dataset.split("/")[-1]

        html_page = "catalog.html"
        if path.endswith(html_page):
            # Swap only the trailing page name, not other occurrences in the path.
            path = path[: -len(html_page)] + "catalog.xml"

        # Ideally we would create targeted queries for one dataset, but we're missing the dataset name.
    else:
        nc = path.split("/")[-1]
        # Replace only the last path component: str.replace would also rewrite
        # a directory with the same name as the file, and would mangle the
        # whole path when the file name is empty (URL ending with "/").
        path = path[: len(path) - len(nc)] + "catalog.xml"

    # Get catalog information about available services
    catalog = urllib.parse.urlunparse((scheme, netloc, path, "", query, ""))

    return catalog, nc


def _leaf_services(node):
    """Yield the concrete service entries found under a parsed ``service`` node."""
    # xmltodict gives a dict for a single child element and a list for repeated ones.
    entries = node if isinstance(node, list) else [node]
    for entry in entries:
        nested = entry.get("service")
        if nested is None:
            yield entry
        else:
            # An entry holding nested services only groups them; descend into it.
            yield from _leaf_services(nested)


def access_urls(catalog_url: str, ds: str) -> dict:
    """Return THREDDS endpoints for the catalog and dataset.

    Parameters
    ----------
    catalog_url : str
        URI to the THREDDS catalog.
    ds : str
        Dataset path relative to the catalog.

    Returns
    -------
    dict
        Endpoint URL for each service listed in the catalog, keyed by the
        ``ServiceType`` value of its ``serviceType`` attribute (e.g. ``"NcML"``).

    Raises
    ------
    KeyError
        If a service type in the catalog is not a ``ServiceType`` member.
    """
    # Get catalog information about available services
    cattrs = thredds_catalog_attrs(catalog_url)["catalog"]

    pr = urllib.parse.urlparse(str(catalog_url))

    # A catalog URL with a query already targets one dataset, so its ID is used
    # as is; otherwise the ID is the catalog's and the dataset is appended.
    cid = cattrs["dataset"]["@ID"]
    if not pr.query:
        cid += f"/{ds}"

    # Get service URLs for the dataset
    urls = {}
    for service in _leaf_services(cattrs["service"]):
        service_name = ServiceType.from_value(service["@serviceType"]).value
        urls[service_name] = f'{pr.scheme}://{pr.netloc}{service["@base"]}{cid}'

    return urls


def ncml_attrs(ncml_url: str, timeout: float = 30.0) -> dict:
    """Return attributes from the NcML response of a THREDDS dataset.

    Parameters
    ----------
    ncml_url : str
        URI to the NcML dataset description, either a remote server URL or path to a local xml file.
    timeout : float
        Number of seconds to wait for a remote server before giving up.

    Returns
    -------
    dict
        CF dictionary built by ``xncml``; its ``"attributes"`` entry is passed
        through ``numpy_to_python_datatypes``.

    Raises
    ------
    requests.HTTPError
        If a remote server answers with an error status code.
    """
    if urllib.parse.urlparse(str(ncml_url)).scheme in ("http", "https"):
        response = requests.get(ncml_url, timeout=timeout)
        # Fail early on HTTP errors rather than parsing an error page as NcML.
        response.raise_for_status()
        xml = response.text
    else:
        # Anything else is treated as a local file, as the docstring promises.
        xml = Path(ncml_url).read_text(encoding="utf-8")

    # Get dataset attributes
    attrs = xncml.Dataset.from_text(xml).to_cf_dict()
    attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"])
    return attrs


def ds_attrs(url: str) -> dict:
    """Gather the NcML attributes of a THREDDS dataset together with its service endpoints.

    Parameters
    ----------
    url : str
        URL to the THREDDS netCDF file

    Returns
    -------
    dict
        Output of ``ncml_attrs`` for the dataset's NcML endpoint, with the
        mapping of service endpoints added under the ``"access_urls"`` key.
    """
    catalog_link, file_name = catalog_url(url)
    endpoints = access_urls(catalog_link, file_name)

    result = ncml_attrs(endpoints["NcML"])

    # Expose the service endpoints alongside the dataset attributes.
    result["access_urls"] = endpoints
    return result


def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
"""Create Polygon geometry from CFMetadata."""
attrs = attrs["groups"]["CFMetadata"]["attributes"]
Expand Down Expand Up @@ -204,26 +318,59 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop
return item


# Maps "<name>_service" keys to the corresponding service name.
# NOTE(review): no use of this mapping is visible in this part of the file —
# confirm it is still needed.
asset_name_remaps = {
"httpserver_service": "HTTPServer",
"opendap_service": "OPENDAP",
"wcs_service": "WCS",
"wms_service": "WMS",
"nccs_service": "NetcdfSubset",
}

# Media type associated with each service name.
# NOTE(review): both the "OPENDAP" and "OpenDAP" spellings are listed, while
# ServiceType.opendap has the value "OpenDAP" — confirm whether the upper-case
# key is still required by callers.
media_types = {
"HTTPServer": "application/x-netcdf",
"OPENDAP": pystac.MediaType.HTML,
"OpenDAP": pystac.MediaType.HTML,
"NcML": pystac.MediaType.XML,
"WCS": pystac.MediaType.XML,
"WMS": pystac.MediaType.XML,
"NetcdfSubset": "application/x-netcdf",
"ISO": pystac.MediaType.XML,
"UDDC": pystac.MediaType.HTML
}

# Roles associated with each service name: "data", "visual" or "metadata".
# NOTE(review): presumably these become the roles of the matching STAC asset —
# verify against the code that builds the assets.
asset_roles = {
"HTTPServer": ["data"],
"OPENDAP": ["data"],
"OpenDAP": ["data"],
"WCS": ["data"],
"WMS": ["visual"],
"NetcdfSubset": ["data"],
"NcML": ["metadata"],
"ISO": ["metadata"],
"UDDC": ["metadata"]
}


class ServiceType(Enum):
    """Service types that can be matched against a catalog ``serviceType`` attribute.

    Member names are the lower-cased spelling of their values, which is what
    lets ``from_value`` resolve a service type regardless of its casing.
    """

    adde = "ADDE"
    dap4 = "DAP4"
    dods = "DODS"  # same as OpenDAP
    opendap = "OpenDAP"
    opendapg = "OpenDAPG"
    netcdfsubset = "NetcdfSubset"
    cdmremote = "CdmRemote"
    cdmfeature = "CdmFeature"
    ncjson = "ncJSON"
    h5service = "H5Service"
    httpserver = "HTTPServer"
    ftp = "FTP"
    gridftp = "GridFTP"
    file = "File"
    iso = "ISO"
    las = "LAS"
    ncml = "NcML"
    uddc = "UDDC"
    wcs = "WCS"
    wms = "WMS"
    wsdl = "WSDL"
    webform = "WebForm"
    catalog = "Catalog"
    compound = "Compound"
    resolver = "Resolver"
    thredds = "THREDDS"

    @classmethod
    def from_value(cls, value):
        """Return the member matching ``value``, irrespective of case.

        The lookup is done on member names with ``value`` lower-cased; a
        ``KeyError`` is raised when no member has that name.
        """
        member_name = value.lower()
        return cls.__members__[member_name]

74 changes: 74 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import urllib.parse
import json
import pytest
import responses
from responses import _recorder
from pathlib import Path
import requests
from STACpopulator.stac_utils import catalog_url, access_urls, ds_attrs
from STACpopulator.implementations.CMIP6_UofT.add_CMIP6 import CMIP6ItemProperties, CMIP6populator
from STACpopulator.models import GeoJSONPolygon
from STACpopulator.stac_utils import STAC_item_from_metadata


# THREDDS links for which server responses are recorded and reference items built.
# Covers both a direct link to a file and a catalog item reference (?dataset=...).
URLS = [
    "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6"
    "/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc",
    "https://psl.noaa.gov/thredds/catalog/Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/catalog.html"
    "?dataset=Datasets/20thC_ReanV2/Monthlies/gaussian/monolevel/air.2m.mon.mean.nc",
]

# Directory next to this file holding the recorded responses and reference items.
DATA = Path(__file__).parent / "data"


def reference_path_from_url(url):
    """Build the local path of the JSON file storing the STAC item dict for ``url``.

    The file lives under ``DATA/references/<folder>/``, where ``<folder>`` is
    the second-to-last "/"-separated chunk of the catalog link, and is named
    after the netCDF file with its last suffix replaced by ``.json``.
    """
    catalog_link, file_name = catalog_url(url)
    folder = catalog_link.split("/")[-2]
    json_name = Path(file_name).with_suffix(".json")
    return DATA / "references" / folder / json_name


@_recorder.record(file_path=DATA / "responses.yaml")
def store_responses():
    """Store server responses.

    Run this if new URLs are added, if remote THREDDS servers are updated or their configuration changed.

    Every request made while this function runs is recorded to
    ``DATA / "responses.yaml"`` by the ``_recorder.record`` decorator.
    """
    for url in URLS:
        # Request to catalog link
        catalog_link, nc = catalog_url(url)
        requests.get(catalog_link)

        # Request to NcML link. The keys returned by access_urls use the
        # service type spelling "NcML"; an upper-case "NCML" raises KeyError.
        ncml_link = access_urls(catalog_link, nc)["NcML"]
        requests.get(ncml_link)


@responses.activate
def create_reference_items(overwrite=False):
    """Store json representation of STAC item dict created from stored XML responses.

    - Run after store_responses() to update the expected STAC item representation.
    - Run if the STAC item representation changes.

    Parameters
    ----------
    overwrite : bool
        If True, regenerate reference files that already exist on disk.
    """
    # Get server responses from files stored on disk
    responses._add_from_file(file_path=DATA / "responses.yaml")

    for url in URLS:
        reference_path = reference_path_from_url(url)

        if reference_path.exists() and not overwrite:
            continue

        if "cmip6" not in url:
            # Only CMIP6 items can be built here. Skip other URLs rather than
            # writing an undefined item, or one left over from a previous URL.
            continue

        # ds_attrs takes the dataset URL only; it resolves the catalog and NcML
        # links itself.
        attrs = ds_attrs(url)

        stac_item_id = CMIP6populator.make_cmip6_item_id(attrs["attributes"])
        stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon)

        reference_path.parent.mkdir(parents=True, exist_ok=True)
        reference_path.write_text(json.dumps(stac_item.to_dict()))
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"type": "Feature", "stac_version": "1.0.0", "id": "ScenarioMIP_CCCma_CanESM5_ssp245_r13i1p2f1_SImon_siconc_gn", "properties": {"start_datetime": "2019-12-06T12:00:00Z", "end_datetime": "2020-11-04T12:00:00Z", "datetime": null, "cmip6:Conventions": "CF-1.7 CMIP-6.2", "cmip6:activity_id": "ScenarioMIP", "cmip6:creation_date": "2019-09-25T23:01:33Z", "cmip6:data_specs_version": "01.00.30", "cmip6:experiment": "update of RCP4.5 based on SSP2", "cmip6:experiment_id": "ssp245", "cmip6:frequency": "mon", "cmip6:further_info_url": "https://furtherinfo.es-doc.org/CMIP6.CCCma.CanESM5.ssp245.none.r13i1p2f1", "cmip6:grid_label": "gn", "cmip6:institution": "Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, Canada", "cmip6:institution_id": "CCCma", "cmip6:nominal_resolution": "100 km", "cmip6:realm": ["seaIce"], "cmip6:source": "CanESM5 (2019): \naerosol: interactive\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude/latitude; 49 levels; top level 1 hPa)\natmosChem: specified oxidants for aerosols\nland: CLASS3.6/CTEM1.2\nlandIce: specified ice sheets\nocean: NEMO3.4.1 (ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m)\nocnBgchem: Canadian Model of Ocean Carbon (CMOC); NPZD ecosystem with OMIP prescribed carbonate chemistry\nseaIce: LIM2", "cmip6:source_id": "CanESM5", "cmip6:source_type": ["AOGCM"], "cmip6:sub_experiment": "none", "cmip6:sub_experiment_id": "none", "cmip6:table_id": "SImon", "cmip6:variable_id": "siconc", "cmip6:variant_label": "r13i1p2f1", "cmip6:initialization_index": 1, "cmip6:physics_index": 2, "cmip6:realization_index": 13, "cmip6:forcing_index": 1, "cmip6:tracking_id": "hdl:21.14100/9e4f804b-c161-44fa-acd1-c2e94e220c95", "cmip6:version": "v20190429", "cmip6:product": "model-output", "cmip6:license": "CMIP6 model data produced by The Government of 
Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:///pcmdi.llnl.gov/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.", "cmip6:grid": "ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m", "cmip6:mip_era": "CMIP6"}, "geometry": {"type": "Polygon", "coordinates": [[[0.049800001084804535, -78.39350128173828], [0.049800001084804535, 89.74176788330078], [359.99493408203125, 89.74176788330078], [359.99493408203125, -78.39350128173828], [0.049800001084804535, -78.39350128173828]]]}, "links": [{"rel": "source", "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/x-netcdf", "title": "birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc"}], "assets": {"HTTPServer": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/x-netcdf", "roles": ["data"]}, "OpenDAP": {"href": 
"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "text/html", "roles": ["data"]}, "NcML": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/xml", "roles": ["metadata"]}, "UDDC": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/uddc/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "text/html", "roles": ["metadata"]}, "ISO": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/iso/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/xml", "roles": ["metadata"]}, "WCS": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/xml", "roles": ["data"]}, "WMS": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/xml", "roles": ["visual"]}, "NetcdfSubset": {"href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncss/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", "type": "application/x-netcdf", "roles": ["data"]}}, "bbox": [0.049800001084804535, -78.39350128173828, 359.99493408203125, 89.74176788330078], "stac_extensions": []}
Loading
Loading