geocubes_to_stac.py
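
"""
Convert GeoCubes datasets into a STAC catalog.

Reads dataset information from the GeoCubes API (utils.geocubes_api) and
collection names/descriptions from files/karttatasot.csv, builds a pystac
Collection per dataset and an Item per raster group, and saves the result
under GeoCubes/.
"""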
import pystac
import rasterio
import requests
import datetime
import pandas
import re
from bs4 import BeautifulSoup
from rio_stac.stac import create_stac_item
from shapely.geometry import GeometryCollection, shape
from utils.geocubes_api import get_datasets

def create_collection(collection_info, dataset_info):
    """
    Create a collection using the dataset info gathered from the API and the
    collection info provided in the CSV.
    Returns the collection as a pystac.Collection.
    """
    # The regular expression sub changes non-word characters into underscores.
    # Sentinel and NDVI collection names contain the years/months of the data,
    # so those are stripped from the name here.
    col_name = re.sub(r'\W+', '_', collection_info['Name'].lower())
    if "sentinel" in col_name:
        col_name = "_".join(col_name.split("_")[:-2])
    elif "ndvi" in col_name:
        col_name = "_".join(col_name.split("_")[:-1])
    col_id = f"{col_name}_at_geocubes"
    collection = pystac.Collection(
        id=col_id,
        title=f"{collection_info['Name']} (GeoCubes)",
        description=f"{collection_info['Description']}. Provided by YYYY. Scale: XXXX. Coordinate system: ETRS-TM35FIN.",
        license="CC-BY-4.0",
        # Placeholder extents, updated from the items later
        extent=pystac.Extent(
            spatial=pystac.SpatialExtent([[0, 0, 0, 0]]),
            temporal=pystac.TemporalExtent([(
                datetime.datetime(2000, 1, 1),
                datetime.datetime(2000, 12, 31)
            )])
        ),
        providers=[
            pystac.Provider(
                name="CSC Finland",
                url="https://www.csc.fi/",
                roles=["host"]
            )
        ],
        assets={
            "meta": pystac.Asset(
                dataset_info["metadata_URL"],
                title="Metadata",
                roles=["metadata"]
            )
        },
        summaries=pystac.Summaries(
            summaries={
                "gsd": []
            }
        )
    )
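    # "YYYY" and "XXXX" in the description are placeholders: the producer name
    # is substituted below, and the resolution range is filled in once all
    # items (and their GSDs) have been collected.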
if "sentinel" in col_name:
collection.providers.append(
pystac.Provider(
name = "ESA",
roles = ["producer"]
)
)
if dataset_info["producer"] == "MML":
collection.description = re.sub("YYYY", "NLS", collection.description)
collection.providers.append(
pystac.Provider(
name = "NLS",
roles = ["producer", "processor"]
)
)
elif dataset_info["producer"] == "IL":
collection.description = re.sub("YYYY", "FMI", collection.description)
collection.providers.append(
pystac.Provider(
name = "FMI",
roles = ["producer"]
)
)
collection.providers.append(
pystac.Provider(
name = "NLS",
roles = ["processor"]
)
)
else:
collection.description = re.sub("YYYY", dataset_info["producer"], collection.description)
collection.providers.append(
pystac.Provider(
name = dataset_info["producer"],
roles = ["producer"]
)
)
collection.providers.append(
pystac.Provider(
name = "NLS",
roles = ["processor"]
)
)
print(f"Collection made: {collection.id}")
return collection
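
# A minimal sketch of the expected inputs (the values below are illustrative,
# not real GeoCubes records):
#   collection_info = {"Name": "Korkeusmalli", "Description": "Elevation model"}
#   dataset_info = {"producer": "MML", "metadata_URL": "https://example.com/meta",
#                   "paths": ["https://example.com/korkeusmalli/2020/"]}
#   collection = create_collection(collection_info, dataset_info)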
if __name__ == "__main__":
datasets = get_datasets()
try: # Takes the available catalog if it exists
catalog = pystac.Catalog("GeoCubes", "Testing catalog", catalog_type=pystac.CatalogType.RELATIVE_PUBLISHED)
except:
catalog = pystac.Catalog.from_file("Geocubes/catalog.json")
# Information and translations of the GeoCubes
collection_csv = pandas.read_csv('files/karttatasot.csv', index_col='Nimi').to_dict('index')
    for col in collection_csv:
        collection_info = collection_csv[col]
        dataset_info = datasets[col]
        collection = create_collection(collection_info, dataset_info)
        catalog.add_child(collection)

        for year_path in dataset_info['paths']:
            # Scrape the TIF links from the directory listing with BeautifulSoup
            page = requests.get(year_path)
            soup = BeautifulSoup(page.text, features="html.parser")
            links = soup.find_all("a")
            item_links = [link.get("href") for link in links if link.get("href", "").endswith("tif")]
            item_sets = [item.split(".")[0] for item in item_links]

            # Group the TIFs by their shared filename prefix; each group becomes one item
            grouped_dict = {}
            for item in item_sets:
                prefix = "_".join(item.split("_")[:4])
                if prefix not in grouped_dict:
                    grouped_dict[prefix] = []
                grouped_dict[prefix].append(item)
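            # For example, hypothetical files "dem_2020_10m_v1_100m.tif" and
            # "dem_2020_10m_v1_500m.tif" would share the prefix
            # "dem_2020_10m_v1" and end up in the same group.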
            for key in grouped_dict.keys():
                # Take the year from the path; items span the whole year
                item_year = year_path.split('/')[-2]
                item_starttime = datetime.datetime.strptime(f"{item_year}-01-01", "%Y-%m-%d")
                item_endtime = datetime.datetime.strptime(f"{item_year}-12-31", "%Y-%m-%d")

                # The first TIF in the group is the COG; read its resolution
                # and projection info with rasterio
                with rasterio.open(year_path + grouped_dict[key][0] + ".tif") as src:
                    assets = {
                        "COG": pystac.Asset(
                            href=year_path + grouped_dict[key][0] + ".tif",
                            media_type="image/tiff; application=geotiff; profile=cloud-optimized",
                            title="COG",
                            roles=["data"],
                            extra_fields={
                                "gsd": int(src.res[0]),
                                "proj:shape": src.shape,
                                # The full 3x3 affine transform, flattened row by row
                                "proj:transform": [
                                    src.transform.a, src.transform.b, src.transform.c,
                                    src.transform.d, src.transform.e, src.transform.f,
                                    src.transform.g, src.transform.h, src.transform.i
                                ]
                            }
                        )
                    }
                min_gsd = assets["COG"].extra_fields["gsd"]
                # The rest of the TIFs in the group are the same data in other
                # resolutions; add each as its own asset
                for asset in grouped_dict[key][1:]:
                    with rasterio.open(year_path + asset + ".tif") as src:
                        asset_id = asset.split("_")[-1]
                        assets[asset_id] = pystac.Asset(
                            href=year_path + asset + ".tif",
                            media_type="image/tiff; application=geotiff",
                            title=asset_id,
                            roles=["data"],
                            extra_fields={
                                "gsd": int(src.res[0]),
                                "proj:shape": src.shape,
                                "proj:transform": [
                                    src.transform.a, src.transform.b, src.transform.c,
                                    src.transform.d, src.transform.e, src.transform.f,
                                    src.transform.g, src.transform.h, src.transform.i
                                ]
                            }
                        )
                    # Add the GSD to the collection summaries if it is not there yet
                    gsd = assets[asset_id].extra_fields["gsd"]
                    if gsd not in collection.summaries.lists["gsd"]:
                        collection.summaries.lists["gsd"].append(gsd)
                    min_gsd = min(min_gsd, gsd)
                # Sentinel and NDVI items are named a bit differently from the rest
                if "sentinel" in key:
                    name = key.split("_")[0].replace('-', '_')
                    item_info = "_".join(key.split(".")[0].split("_")[1:])
                    item_id = f"{name.lower().replace(' ', '_').replace(',', '')}_{item_info}"
                elif "ndvi" in key:
                    name = key.split("_")[0]
                    item_info = "_".join(key.split(".")[0].split("_")[1:])
                    item_id = f"{name.lower()}_{item_info}"
                else:
                    item_info = "_".join(key.split(".")[0].split("_")[1:])
                    item_id = f"{collection_info['Name'].lower().replace(' ', '_').replace(',', '')}_{item_info}"
                # rio_stac derives the item geometry and projection metadata
                # from the COG itself
                item = create_stac_item(
                    source=year_path + key + ".tif",
                    id=item_id,
                    assets=assets,
                    asset_media_type=pystac.MediaType.TIFF,
                    with_proj=True,
                )
                item.common_metadata.start_datetime = item_starttime
                item.common_metadata.end_datetime = item_endtime
                item.extra_fields["gsd"] = min_gsd
                item.properties["proj:epsg"] = 3067
                collection.add_item(item)
                print(f"* Item made: {item.id}")
        # Update the spatial and temporal extents from the items. The built-in
        # update_extent_from_items() relies on the items' datetime property,
        # which is None here because only start/end datetimes are set.
        bounds = [GeometryCollection([shape(s.geometry) for s in collection.get_all_items()]).bounds]
        start_times = [st.common_metadata.start_datetime for st in collection.get_all_items()]
        end_times = [et.common_metadata.end_datetime for et in collection.get_all_items()]
        collection.extent.spatial = pystac.SpatialExtent(bounds)
        collection.extent.temporal = pystac.TemporalExtent([[min(start_times), max(end_times)]])

        # Sort the GSD summaries and add the lowest and highest to the description
        sorted_gsd = sorted(collection.summaries.lists["gsd"])
        collection.summaries.lists["gsd"] = sorted_gsd
        collection.description = re.sub('XXXX', f"{sorted_gsd[0]}m-{sorted_gsd[-1]}m", collection.description)

    catalog.normalize_and_save("GeoCubes")