DuckDB + PyIceberg: Data Extraction from geo-sorted ohsome contributions#
-Note
--
-
Set the connection params and configure DuckDB.
-Download the data in 3 steps:
++ Download latest OSM data as GeoPackage#
+In this notebook we demonstrate how you can download the latest OSM data in GeoPackage format.
+These are the steps you see further down:
-
-
Download data with PyIceberg.
-Fitler and process data with DuckDB.
+Set the connection parameters.
+Prepare your input parameters, e.g. define area of interest and OSM tag filter.
+Download data using PyIceberg and DuckDB.
+Filter and process data with DuckDB.
Export results into a GeoPackage file with GeoPandas.
-
Getting started#
+Getting started#
Set connection params.
Getting startedconfig={
'threads': 8,
'max_memory': '8GB',
- # 'enable_object_cache': True
}
)
con.install_extension("spatial")
@@ -548,8 +520,8 @@ Getting started
-Download data with PyIceberg table scan#
+
+Download data with PyIceberg table scan#
In this step we can already filter all OSM contributions by four major factors. We will perform more detailed filtering (e.g. for OSM tag values) later:
status (e.g. latest, historic or deleted OSM features)
@@ -577,8 +549,6 @@ Download data with PyIceberg table scanselected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
-area_of_interest_file =f"../data/{selected_region}.geojson"
-area_of_interest_file = f"https://raw.githubusercontent.com/GIScience/sotm-2024-ohsome-data-insights-workshop/main/data/{selected_region}.geojson"
# Define geometry type filter
geometry_type = 'Polygon'
@@ -633,14 +603,14 @@ Download data with PyIceberg table scan
-download took 198.388 sec.
+download took 15.965 sec.
-
-Filter and process data with DuckDB#
+
+Filter and process data with DuckDB#
Second, we use DuckDB to perform the more detailed filtering. In this step we can filter for:
tags
@@ -658,12 +628,9 @@ Filter and process data with DuckDBSELECT a.*
FROM
raw_osm_data as a,
- st_read('{area_of_interest_file}') as aoi
WHERE 1=1
and tags['building'][1] is not null
and tags['building'][1] != 'no'
- -- spatial filtering part
- and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
@@ -675,28 +642,14 @@ Filter and process data with DuckDB
----------------------------------------------------------------------------
-IOException Traceback (most recent call last)
-Cell In[9], line 20
- 2 start_time = time.time()
- 4 query = f"""
- 5 DROP TABLE IF EXISTS osm_data;
- 6 CREATE TABLE osm_data AS
- (...)
- 18 ;
- 19 """
----> 20 con.sql(query)
- 22 processing_time = round(time.time() - start_time, 3)
- 23 print(f"processing took {processing_time} sec.")
-
-IOException: IO Error: GDAL Error (4): Failed to open file https://raw.githubusercontent.com/GIScience/sotm-2024-ohsome-data-insights-workshop/main/data/nairobi.geojson: {"exception_type":"IO","exception_message":"Cannot open file \"https://raw.githubusercontent.com/GIScience/sotm-2024-ohsome-data-insights-workshop/main/data/nairobi.geojson\": No such file or directory","errno":"2"}
+processing took 0.552 sec.
-
-Save data as GeoPackage#
+
+Save data as GeoPackage#
Show the structure of the data we have just downloaded.
-┌─────────────┬────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────┐
-│ column_name │ column_type │ null │ key │ default │ extra │
-│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
-├─────────────┼────────────────────────────────────────────────────────────┼─────────┼─────────┼─────────┼─────────┤
-│ user_id │ INTEGER │ YES │ NULL │ NULL │ NULL │
-│ valid_from │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │
-│ osm_id │ VARCHAR │ YES │ NULL │ NULL │ NULL │
-│ osm_version │ INTEGER │ YES │ NULL │ NULL │ NULL │
-│ tags │ MAP(VARCHAR, VARCHAR) │ YES │ NULL │ NULL │ NULL │
-│ bbox │ STRUCT(xmin DOUBLE, ymin DOUBLE, xmax DOUBLE, ymax DOUBLE) │ YES │ NULL │ NULL │ NULL │
-│ geometry │ VARCHAR │ YES │ NULL │ NULL │ NULL │
-└─────────────┴────────────────────────────────────────────────────────────┴─────────┴─────────┴─────────┴─────────┘
+┌─────────────┬───────────────────────┬─────────┬─────────┬─────────┬─────────┐
+│ column_name │ column_type │ null │ key │ default │ extra │
+│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
+├─────────────┼───────────────────────┼─────────┼─────────┼─────────┼─────────┤
+│ user_id │ INTEGER │ YES │ NULL │ NULL │ NULL │
+│ valid_from │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │
+│ osm_id │ VARCHAR │ YES │ NULL │ NULL │ NULL │
+│ osm_version │ INTEGER │ YES │ NULL │ NULL │ NULL │
+│ tags │ MAP(VARCHAR, VARCHAR) │ YES │ NULL │ NULL │ NULL │
+│ geometry │ VARCHAR │ YES │ NULL │ NULL │ NULL │
+└─────────────┴───────────────────────┴─────────┴─────────┴─────────┴─────────┘
@@ -738,12 +690,16 @@ Save data as GeoPackage
-┌─────────┬────────────┬─────────┬─────────────┬──────────────────────┬─────────────────────────────────────┬──────────┐
-│ user_id │ valid_from │ osm_id │ osm_version │ tags │ bbox │ geometry │
-│ int32 │ timestamp │ varchar │ int32 │ map(varchar, varch… │ struct(xmin double, ymin double, … │ varchar │
-├─────────┴────────────┴─────────┴─────────────┴──────────────────────┴─────────────────────────────────────┴──────────┤
-│ 0 rows │
-└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+┌──────────┬─────────────────────┬────────────────┬─────────────┬──────────────────────┬───────────────────────────────┐
+│ user_id │ valid_from │ osm_id │ osm_version │ tags │ geometry │
+│ int32 │ timestamp │ varchar │ int32 │ map(varchar, varch… │ varchar │
+├──────────┼─────────────────────┼────────────────┼─────────────┼──────────────────────┼───────────────────────────────┤
+│ 1122708 │ 2021-09-07 23:04:37 │ way/500108893 │ 5 │ {building=yes, rai… │ POLYGON ((36.89974139999999… │
+│ 13366421 │ 2023-01-10 06:13:29 │ way/1130552834 │ 1 │ {building=yes, add… │ POLYGON ((36.8719435 -1.241… │
+│ 18306654 │ 2023-11-11 01:58:08 │ way/88406439 │ 16 │ {building=yes, aer… │ POLYGON ((36.9238579 -1.331… │
+│ 17770290 │ 2022-11-27 17:57:39 │ way/1117632750 │ 1 │ {building=yes} │ POLYGON ((36.6824404 -1.442… │
+│ 3733993 │ 2023-04-11 09:02:45 │ way/1161291128 │ 1 │ {building=yes} │ POLYGON ((36.6824528 -1.438… │
+└──────────┴─────────────────────┴────────────────┴─────────────┴──────────────────────┴───────────────────────────────┘
@@ -764,7 +720,7 @@ Save data as GeoPackage
@@ -777,12 +733,7 @@ Save data as GeoPackagestart_time = time.time()
query = f"""
- SELECT osm_data.*
- FROM
- osm_data,
- st_read('{area_of_interest_file}') as aoi
- WHERE 1=1
- and ST_Intersects(st_GeomFromText(osm_data.geometry), aoi.geom)
+ SELECT * FROM osm_data
"""
df = con.sql(query).df()
@@ -799,15 +750,16 @@ Save data as GeoPackage
Download data with PyIceberg table scan#
+Download data with PyIceberg table scan#
In this step we can already filter all OSM contributions by four major factors. We will perform more detailed filtering (e.g. for OSM tag values) later:
status (e.g. latest, historic or deleted OSM features)
@@ -577,8 +549,6 @@
Download data with PyIceberg table scanselected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
-area_of_interest_file =f"../data/{selected_region}.geojson"
-area_of_interest_file = f"https://raw.githubusercontent.com/GIScience/sotm-2024-ohsome-data-insights-workshop/main/data/{selected_region}.geojson"
# Define geometry type filter
geometry_type = 'Polygon'
@@ -633,14 +603,14 @@ Download data with PyIceberg table scan
-download took 198.388 sec.
+download took 15.965 sec.
download took 198.388 sec.
+download took 15.965 sec.
Filter and process data with DuckDB#
+Filter and process data with DuckDB#
Second, we use DuckDB to perform the more detailed filtering. In this step we can filter for:
tags
@@ -658,12 +628,9 @@
Filter and process data with DuckDBSELECT a.*
FROM
raw_osm_data as a,
- st_read('{area_of_interest_file}') as aoi
WHERE 1=1
and tags['building'][1] is not null
and tags['building'][1] != 'no'
- -- spatial filtering part
- and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
@@ -675,28 +642,14 @@ Filter and process data with DuckDB
----------------------------------------------------------------------------
-IOException Traceback (most recent call last)
-Cell In[9], line 20
- 2 start_time = time.time()
- 4 query = f"""
- 5 DROP TABLE IF EXISTS osm_data;
- 6 CREATE TABLE osm_data AS
- (...)
- 18 ;
- 19 """
----> 20 con.sql(query)
- 22 processing_time = round(time.time() - start_time, 3)
- 23 print(f"processing took {processing_time} sec.")
-
-IOException: IO Error: GDAL Error (4): Failed to open file https://raw.githubusercontent.com/GIScience/sotm-2024-ohsome-data-insights-workshop/main/data/nairobi.geojson: {"exception_type":"IO","exception_message":"Cannot open file \"https://raw.githubusercontent.com/GIScience/sotm-2024-ohsome-data-insights-workshop/main/data/nairobi.geojson\": No such file or directory","errno":"2"}
+processing took 0.552 sec.
---------------------------------------------------------------------------
-IOException Traceback (most recent call last)
-Cell In[9], line 20
- 2 start_time = time.time()
- 4 query = f"""
- 5 DROP TABLE IF EXISTS osm_data;
- 6 CREATE TABLE osm_data AS
- (...)
- 18 ;
- 19 """
----> 20 con.sql(query)
- 22 processing_time = round(time.time() - start_time, 3)
- 23 print(f"processing took {processing_time} sec.")
-
-IOException: IO Error: GDAL Error (4): Failed to open file https://raw.githubusercontent.com/GIScience/sotm-2024-ohsome-data-insights-workshop/main/data/nairobi.geojson: {"exception_type":"IO","exception_message":"Cannot open file \"https://raw.githubusercontent.com/GIScience/sotm-2024-ohsome-data-insights-workshop/main/data/nairobi.geojson\": No such file or directory","errno":"2"}
+processing took 0.552 sec.
Save data as GeoPackage#
+Save data as GeoPackage#
Show the structure of the data we have just downloaded.
┌─────────────┬────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────┐
-│ column_name │ column_type │ null │ key │ default │ extra │
-│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
-├─────────────┼────────────────────────────────────────────────────────────┼─────────┼─────────┼─────────┼─────────┤
-│ user_id │ INTEGER │ YES │ NULL │ NULL │ NULL │
-│ valid_from │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │
-│ osm_id │ VARCHAR │ YES │ NULL │ NULL │ NULL │
-│ osm_version │ INTEGER │ YES │ NULL │ NULL │ NULL │
-│ tags │ MAP(VARCHAR, VARCHAR) │ YES │ NULL │ NULL │ NULL │
-│ bbox │ STRUCT(xmin DOUBLE, ymin DOUBLE, xmax DOUBLE, ymax DOUBLE) │ YES │ NULL │ NULL │ NULL │
-│ geometry │ VARCHAR │ YES │ NULL │ NULL │ NULL │
-└─────────────┴────────────────────────────────────────────────────────────┴─────────┴─────────┴─────────┴─────────┘
+┌─────────────┬───────────────────────┬─────────┬─────────┬─────────┬─────────┐
+│ column_name │ column_type │ null │ key │ default │ extra │
+│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │
+├─────────────┼───────────────────────┼─────────┼─────────┼─────────┼─────────┤
+│ user_id │ INTEGER │ YES │ NULL │ NULL │ NULL │
+│ valid_from │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │
+│ osm_id │ VARCHAR │ YES │ NULL │ NULL │ NULL │
+│ osm_version │ INTEGER │ YES │ NULL │ NULL │ NULL │
+│ tags │ MAP(VARCHAR, VARCHAR) │ YES │ NULL │ NULL │ NULL │
+│ geometry │ VARCHAR │ YES │ NULL │ NULL │ NULL │
+└─────────────┴───────────────────────┴─────────┴─────────┴─────────┴─────────┘
Save data as GeoPackage
┌─────────┬────────────┬─────────┬─────────────┬──────────────────────┬─────────────────────────────────────┬──────────┐
-│ user_id │ valid_from │ osm_id │ osm_version │ tags │ bbox │ geometry │
-│ int32 │ timestamp │ varchar │ int32 │ map(varchar, varch… │ struct(xmin double, ymin double, … │ varchar │
-├─────────┴────────────┴─────────┴─────────────┴──────────────────────┴─────────────────────────────────────┴──────────┤
-│ 0 rows │
-└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+┌──────────┬─────────────────────┬────────────────┬─────────────┬──────────────────────┬───────────────────────────────┐
+│ user_id │ valid_from │ osm_id │ osm_version │ tags │ geometry │
+│ int32 │ timestamp │ varchar │ int32 │ map(varchar, varch… │ varchar │
+├──────────┼─────────────────────┼────────────────┼─────────────┼──────────────────────┼───────────────────────────────┤
+│ 1122708 │ 2021-09-07 23:04:37 │ way/500108893 │ 5 │ {building=yes, rai… │ POLYGON ((36.89974139999999… │
+│ 13366421 │ 2023-01-10 06:13:29 │ way/1130552834 │ 1 │ {building=yes, add… │ POLYGON ((36.8719435 -1.241… │
+│ 18306654 │ 2023-11-11 01:58:08 │ way/88406439 │ 16 │ {building=yes, aer… │ POLYGON ((36.9238579 -1.331… │
+│ 17770290 │ 2022-11-27 17:57:39 │ way/1117632750 │ 1 │ {building=yes} │ POLYGON ((36.6824404 -1.442… │
+│ 3733993 │ 2023-04-11 09:02:45 │ way/1161291128 │ 1 │ {building=yes} │ POLYGON ((36.6824528 -1.438… │
+└──────────┴─────────────────────┴────────────────┴─────────────┴──────────────────────┴───────────────────────────────┘
Save data as GeoPackage
Save data as GeoPackagestart_time = time.time()
query = f"""
- SELECT osm_data.*
- FROM
- osm_data,
- st_read('{area_of_interest_file}') as aoi
- WHERE 1=1
- and ST_Intersects(st_GeomFromText(osm_data.geometry), aoi.geom)
+ SELECT * FROM osm_data
"""
df = con.sql(query).df()
@@ -799,15 +750,16 @@ Save data as GeoPackage
Display currentness of OSM features on map
-<traitlets.traitlets.directional_link at 0x7e30a802d6d0>
+<traitlets.traitlets.directional_link at 0x754e04508110>
@@ -1038,7 +823,7 @@ Display currentness of OSM features on map
-
+
<traitlets.traitlets.directional_link at 0x7e30a802d6d0>
+<traitlets.traitlets.directional_link at 0x754e04508110>
Display currentness of OSM features on map -
Display currentness of OSM features on map
-
diff --git a/book/02b_buildings_currentness_DuckDB_only.html b/book/02b_buildings_currentness_DuckDB_only.html
index 697d8c1..fce017c 100644
--- a/book/02b_buildings_currentness_DuckDB_only.html
+++ b/book/02b_buildings_currentness_DuckDB_only.html
@@ -183,52 +183,32 @@
Data Extraction
Simple Data Analysis
Data Integration
Advanced Data Analysis
Background
- Partitioning and Sorting
- DuckDB: Currentness of Buildings
-
- DuckDB: Data Extraction from geo-sorted ohsome contributions
-
-
-
-
@@ -455,16 +435,12 @@ Contents
@@ -486,9 +462,8 @@ DuckDB: Currentness of Buildings
-Getting started#
+
+Getting started#
Set connection params.
@@ -547,6 +522,7 @@ Getting started
Prepare the input parameters for your analysis#
@@ -1445,16 +1421,12 @@ Display currentness of OSM features on map
-
diff --git a/book/03_hot_tm_project_analysis.html b/book/03_hot_tm_project_analysis.html
index e1484f8..9b31f9c 100644
--- a/book/03_hot_tm_project_analysis.html
+++ b/book/03_hot_tm_project_analysis.html
@@ -60,8 +60,8 @@
-
-
+
+
@@ -183,52 +183,32 @@
Data Extraction
Simple Data Analysis
Data Integration
- HOT Tasking Manager project analysis
-
-- DuckDB: Mapillary Coverage Analysis
-
-
-
-
-
+- Mapillary Coverage Analysis
Advanced Data Analysis
Background
@@ -455,17 +435,13 @@ Contents
@@ -488,9 +464,8 @@ HOT Tasking Manager project analysis
-Getting started#
+
+Getting started#
Set connection params.
diff --git a/book/03_mapillary_data_analysis.html b/book/03_mapillary_data_analysis.html
index 6c42fcd..9f1f896 100644
--- a/book/03_mapillary_data_analysis.html
+++ b/book/03_mapillary_data_analysis.html
@@ -8,7 +8,7 @@
- DuckDB: Mapillary Coverage Analysis — SOTM 2024 ohsome-data-insights Workshop
+ Mapillary Coverage Analysis — SOTM 2024 ohsome-data-insights Workshop
@@ -60,7 +60,7 @@
-
+
@@ -183,52 +183,32 @@
Data Extraction
Simple Data Analysis
Data Integration
- HOT Tasking Manager project analysis
-
-- DuckDB: Mapillary Coverage Analysis
-
-
-
-
-
+- Mapillary Coverage Analysis
Advanced Data Analysis
Background
@@ -445,7 +425,7 @@
- DuckDB: Mapillary Coverage Analysis
+ Mapillary Coverage Analysis
@@ -455,17 +435,13 @@ Contents
@@ -476,8 +452,8 @@ Contents
-
-DuckDB: Mapillary Coverage Analysis#
+
+Mapillary Coverage Analysis#
In this notebook we demonstrate how to combine two datasets: Mapillary sequences and OSM road network.
We want to find out which parts of the road network in a city are not yet covered by Mapillary street level imagery.
These are the steps you see further down:
@@ -488,9 +464,8 @@ DuckDB: Mapillary Coverage Analysis
-Getting started#
+
+Getting started#
Set connection params.
Set connection to MinIO object storage.
@@ -549,14 +544,15 @@ Getting started
Prepare the input parameters for your analysis#
-# Set s3 path for parquet input data
-osm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions_germany/**"
-#osm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/global_wkt_smaller/**"
-
+# Set iceberg table
+namespace = 'geo_sort'
+tablename = 'contributions'
+icebergtable = catalog.load_table((namespace, tablename))
mapillary_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/mapillary_sequences/*"
@@ -573,7 +569,7 @@ Prepare the input parameters for your analysisselected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
-area_of_interest_file =f"../data/{selected_region}.geojson"
+area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"
epsg_codes = {
'heidelberg': 'EPSG:32632',
@@ -591,9 +587,43 @@ Prepare the input parameters for your analysis
-Get OSM data#
+
+Get OSM data#
+Download latest OSM data for bounding box.
+
+
+import time
+start_time = time.time()
+
+icebergtable.scan(
+ row_filter=(
+ f"status = '{status}' "
+ f"and geometry_type = '{geometry_type}' "
+ f"and (xmax >= {xmin} and xmin <= {xmax}) "
+ f"and (ymax >= {ymin} and ymin <= {ymax}) "
+ ),
+ selected_fields=(
+ "user_id",
+ "osm_id",
+ "osm_version",
+ "valid_from",
+ "tags",
+ "geometry",
+ ),
+).to_duckdb('raw_osm_data',connection=con)
+
+download_time = round(time.time() - start_time, 3)
+print(f"download took {download_time} sec.")
+
+
+
+
+download took 17.048 sec.
+
+
+
+
+Clip OSM highways to the area of interest and calculate road length in kilometers.
-download took 1.619 sec.
+processing took 3.557 sec.
-Display OSM data.
+Display OSM data on a map.
-
-Get Mapillary Data#
+
+Get Mapillary Data#
+We are going to download Mapillary sequences.
import time
@@ -791,11 +745,12 @@ Get Mapillary Data
-download took 1.247 sec.
+download took 1.241 sec.
+Display Mapillary data on a map.
Display currentness of OSM features on map
Getting started#
+Getting started#
Set connection params.
Getting started
Prepare the input parameters for your analysis#
Display currentness of OSM features on map -
Data Extraction
Simple Data Analysis
Data Integration
- HOT Tasking Manager project analysis - -
- DuckDB: Mapillary Coverage Analysis - - - - - +
- Mapillary Coverage Analysis
Advanced Data Analysis
Background
@@ -455,17 +435,13 @@Contents
@@ -488,9 +464,8 @@HOT Tasking Manager project analysis
-Getting started#
+
+Getting started#
Set connection params.
diff --git a/book/03_mapillary_data_analysis.html b/book/03_mapillary_data_analysis.html
index 6c42fcd..9f1f896 100644
--- a/book/03_mapillary_data_analysis.html
+++ b/book/03_mapillary_data_analysis.html
@@ -8,7 +8,7 @@
- DuckDB: Mapillary Coverage Analysis — SOTM 2024 ohsome-data-insights Workshop
+ Mapillary Coverage Analysis — SOTM 2024 ohsome-data-insights Workshop
@@ -60,7 +60,7 @@
-
+
@@ -183,52 +183,32 @@
Data Extraction
Simple Data Analysis
Data Integration
- HOT Tasking Manager project analysis
-
-- DuckDB: Mapillary Coverage Analysis
-
-
-
-
-
+- Mapillary Coverage Analysis
Advanced Data Analysis
Background
@@ -445,7 +425,7 @@
- DuckDB: Mapillary Coverage Analysis
+ Mapillary Coverage Analysis
@@ -455,17 +435,13 @@ Contents
@@ -476,8 +452,8 @@ Contents
-
-DuckDB: Mapillary Coverage Analysis#
+
+Mapillary Coverage Analysis#
In this notebook we demonstrate how to combine two datasets: Mapillary sequences and OSM road network.
We want to find out which parts of the road network in a city are not yet covered by Mapillary street level imagery.
These are the steps you see further down:
@@ -488,9 +464,8 @@ DuckDB: Mapillary Coverage Analysis
-Getting started#
+
+Getting started#
Set connection params.
Set connection to MinIO object storage.
@@ -549,14 +544,15 @@ Getting started
Prepare the input parameters for your analysis#
-# Set s3 path for parquet input data
-osm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions_germany/**"
-#osm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/global_wkt_smaller/**"
-
+# Set iceberg table
+namespace = 'geo_sort'
+tablename = 'contributions'
+icebergtable = catalog.load_table((namespace, tablename))
mapillary_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/mapillary_sequences/*"
@@ -573,7 +569,7 @@ Prepare the input parameters for your analysisselected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
-area_of_interest_file =f"../data/{selected_region}.geojson"
+area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"
epsg_codes = {
'heidelberg': 'EPSG:32632',
@@ -591,9 +587,43 @@ Prepare the input parameters for your analysis
-Get OSM data#
+
+Get OSM data#
+Download latest OSM data for bounding box.
+
+
+import time
+start_time = time.time()
+
+icebergtable.scan(
+ row_filter=(
+ f"status = '{status}' "
+ f"and geometry_type = '{geometry_type}' "
+ f"and (xmax >= {xmin} and xmin <= {xmax}) "
+ f"and (ymax >= {ymin} and ymin <= {ymax}) "
+ ),
+ selected_fields=(
+ "user_id",
+ "osm_id",
+ "osm_version",
+ "valid_from",
+ "tags",
+ "geometry",
+ ),
+).to_duckdb('raw_osm_data',connection=con)
+
+download_time = round(time.time() - start_time, 3)
+print(f"download took {download_time} sec.")
+
+
+
+
+download took 17.048 sec.
+
+
+
+
+Clip OSM highways to the area of interest and calculate road length in kilometers.
-download took 1.619 sec.
+processing took 3.557 sec.
-Display OSM data.
+Display OSM data on a map.
-
-Get Mapillary Data#
+
+Get Mapillary Data#
+We are going to download Mapillary sequences.
import time
@@ -791,11 +745,12 @@ Get Mapillary Data
-download took 1.247 sec.
+download took 1.241 sec.
+Display Mapillary data on a map.
Getting started#
Set connection params.
Data Extraction
Simple Data Analysis
Data Integration
- HOT Tasking Manager project analysis - -
- DuckDB: Mapillary Coverage Analysis - - - - - +
- Mapillary Coverage Analysis
Advanced Data Analysis
Background
@@ -445,7 +425,7 @@DuckDB: Mapillary Coverage Analysis
+Mapillary Coverage Analysis
Contents
Contents
DuckDB: Mapillary Coverage Analysis#
+Mapillary Coverage Analysis#
In this notebook we demonstrate how to combine two datasets: Mapillary sequences and OSM road network.
We want to find out which parts of the road network in a city are not yet covered by Mapillary street level imagery.
These are the steps you see further down:
@@ -488,9 +464,8 @@DuckDB: Mapillary Coverage Analysis
-Getting started#
+
+Getting started#
Set connection params.
Set connection to MinIO object storage.
@@ -549,14 +544,15 @@ Getting started
Prepare the input parameters for your analysis#
-# Set s3 path for parquet input data
-osm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions_germany/**"
-#osm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/global_wkt_smaller/**"
-
+# Set iceberg table
+namespace = 'geo_sort'
+tablename = 'contributions'
+icebergtable = catalog.load_table((namespace, tablename))
mapillary_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/mapillary_sequences/*"
@@ -573,7 +569,7 @@ Prepare the input parameters for your analysisselected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
-area_of_interest_file =f"../data/{selected_region}.geojson"
+area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"
epsg_codes = {
'heidelberg': 'EPSG:32632',
@@ -591,9 +587,43 @@ Prepare the input parameters for your analysis
-Get OSM data#
+
+Get OSM data#
+Download latest OSM data for bounding box.
+
+
+import time
+start_time = time.time()
+
+icebergtable.scan(
+ row_filter=(
+ f"status = '{status}' "
+ f"and geometry_type = '{geometry_type}' "
+ f"and (xmax >= {xmin} and xmin <= {xmax}) "
+ f"and (ymax >= {ymin} and ymin <= {ymax}) "
+ ),
+ selected_fields=(
+ "user_id",
+ "osm_id",
+ "osm_version",
+ "valid_from",
+ "tags",
+ "geometry",
+ ),
+).to_duckdb('raw_osm_data',connection=con)
+
+download_time = round(time.time() - start_time, 3)
+print(f"download took {download_time} sec.")
+
+
+
+
+download took 17.048 sec.
+
+
+
+
+Clip OSM highways to the area of interest and calculate road length in kilometers.
-download took 1.619 sec.
+processing took 3.557 sec.
-Display OSM data.
+Display OSM data on a map.
-
-Get Mapillary Data#
+
+Get Mapillary Data#
+We are going to download Mapillary sequences.
import time
@@ -791,11 +745,12 @@ Get Mapillary Data
-download took 1.247 sec.
+download took 1.241 sec.
+Display Mapillary data on a map.
Getting started#
Set connection params.
Set connection to MinIO object storage.
Getting started
Prepare the input parameters for your analysis#
-# Set s3 path for parquet input data
-osm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions_germany/**"
-#osm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/global_wkt_smaller/**"
-
+# Set iceberg table
+namespace = 'geo_sort'
+tablename = 'contributions'
+icebergtable = catalog.load_table((namespace, tablename))
mapillary_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/mapillary_sequences/*"
@@ -573,7 +569,7 @@ Prepare the input parameters for your analysisselected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
-area_of_interest_file =f"../data/{selected_region}.geojson"
+area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"
epsg_codes = {
'heidelberg': 'EPSG:32632',
@@ -591,9 +587,43 @@ Prepare the input parameters for your analysis
-Get OSM data#
+
+Get OSM data#
+Download latest OSM data for bounding box.
+
+
+import time
+start_time = time.time()
+
+icebergtable.scan(
+ row_filter=(
+ f"status = '{status}' "
+ f"and geometry_type = '{geometry_type}' "
+ f"and (xmax >= {xmin} and xmin <= {xmax}) "
+ f"and (ymax >= {ymin} and ymin <= {ymax}) "
+ ),
+ selected_fields=(
+ "user_id",
+ "osm_id",
+ "osm_version",
+ "valid_from",
+ "tags",
+ "geometry",
+ ),
+).to_duckdb('raw_osm_data',connection=con)
+
+download_time = round(time.time() - start_time, 3)
+print(f"download took {download_time} sec.")
+
+
+
+
+download took 17.048 sec.
+
+
+
+
+Clip OSM highways to the area of interest and calculate road length in kilometers.
-download took 1.619 sec.
+processing took 3.557 sec.
-Display OSM data.
+Display OSM data on a map.
-
-Get Mapillary Data#
+
+Get Mapillary Data#
+We are going to download Mapillary sequences.
import time
@@ -791,11 +745,12 @@ Get Mapillary Data
-download took 1.247 sec.
+download took 1.241 sec.
+Display Mapillary data on a map.
# Set s3 path for parquet input data
-osm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions_germany/**"
-#osm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/global_wkt_smaller/**"
-
+# Set iceberg table
+namespace = 'geo_sort'
+tablename = 'contributions'
+icebergtable = catalog.load_table((namespace, tablename))
mapillary_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/mapillary_sequences/*"
@@ -573,7 +569,7 @@ Prepare the input parameters for your analysisselected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
-area_of_interest_file =f"../data/{selected_region}.geojson"
+area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"
epsg_codes = {
'heidelberg': 'EPSG:32632',
@@ -591,9 +587,43 @@ Prepare the input parameters for your analysis
-Get OSM data#
+
+Get OSM data#
+Download latest OSM data for bounding box.
+
+
+import time
+start_time = time.time()
+
+icebergtable.scan(
+ row_filter=(
+ f"status = '{status}' "
+ f"and geometry_type = '{geometry_type}' "
+ f"and (xmax >= {xmin} and xmin <= {xmax}) "
+ f"and (ymax >= {ymin} and ymin <= {ymax}) "
+ ),
+ selected_fields=(
+ "user_id",
+ "osm_id",
+ "osm_version",
+ "valid_from",
+ "tags",
+ "geometry",
+ ),
+).to_duckdb('raw_osm_data',connection=con)
+
+download_time = round(time.time() - start_time, 3)
+print(f"download took {download_time} sec.")
+
+
+
+
+download took 17.048 sec.
+
+
+
+
+Clip OSM highways to the area of interest and calculate road length in kilometers.
-download took 1.619 sec.
+processing took 3.557 sec.
-Display OSM data.
+Display OSM data on a map.
-
-Get Mapillary Data#
+
+Get Mapillary Data#
+We are going to download Mapillary sequences.
import time
@@ -791,11 +745,12 @@ Get Mapillary Data
-download took 1.247 sec.
+download took 1.241 sec.
+Display Mapillary data on a map.