
Commit

fix all headings and use pyiceberg and duckdb in all notebooks
Hagellach37 committed Aug 31, 2024
1 parent 3a9054e commit 8db81fd
Showing 9 changed files with 1,206 additions and 377 deletions.
2 changes: 1 addition & 1 deletion book/00_MinIO_Object_Store.ipynb
@@ -29,7 +29,7 @@
"id": "8a8d65c2-8638-4385-a40d-0dfa190d4601",
"metadata": {},
"source": [
"# Connect to MinIO via DuckDB"
"## Connect to MinIO via DuckDB"
]
},
{
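The only change to this notebook is the heading level, but since its subject is the DuckDB-to-MinIO connection, a brief illustration may help readers of this commit. This is a minimal sketch with a placeholder endpoint, credentials, and bucket path; it is not taken from the notebook's cells:

```python
# Hypothetical DuckDB <-> MinIO setup; endpoint, keys, and bucket are placeholders.
import duckdb

con = duckdb.connect()
con.install_extension("httpfs")
con.load_extension("httpfs")

# MinIO is S3-compatible, so DuckDB's S3 settings are sufficient.
con.sql("""
    SET s3_endpoint = 'minio.example.org';
    SET s3_url_style = 'path';
    SET s3_use_ssl = true;
    SET s3_access_key_id = 'YOUR_ACCESS_KEY';
    SET s3_secret_access_key = 'YOUR_SECRET_KEY';
""")

# Read a Parquet file directly from the object store.
con.sql("SELECT count(*) FROM read_parquet('s3://my-bucket/some/file.parquet')").show()
```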
128 changes: 58 additions & 70 deletions book/01a_Data_Extraction_DuckDB_PyIceberg.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions book/01b_Data_Extraction_DuckDB_only.ipynb
@@ -22,7 +22,7 @@
"id": "b6ec70ad-bce1-4052-b151-23fe47fad942",
"metadata": {},
"source": [
"# Getting started\n",
"## Getting started\n",
"Set connection params."
]
},
@@ -116,7 +116,7 @@
"id": "bed363ef-ee28-411e-9d56-501c1005beac",
"metadata": {},
"source": [
"# Download with DuckDB\n",
"## Download with DuckDB\n",
"In this step we can already filter all OSM contributions by four major factors. We will perform more detailed filtering (e.g. for OSM tags values) later:\n",
"* **status** (e.g. latest, historic or deleted OSM features)\n",
"* **location** (using the bounding box coordinates of each OSM feature)\n",
@@ -257,7 +257,7 @@
"id": "73b8b5e7-41b7-4c3f-a901-3c5403dc4150",
"metadata": {},
"source": [
"# Save data as GeoPackage"
"## Save data as GeoPackage"
]
},
{
@@ -436,7 +436,7 @@
"id": "50bf5649-9176-4e7f-83ac-752fc9e6faae",
"metadata": {},
"source": [
"# Work with the data in QGIS\n",
"## Work with the data in QGIS\n",
"Add your geopackage file in QGIS, e.g. via drag-and-drop or through file manager."
]
},
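The hunks above describe the DuckDB-only notebook: filter OSM contributions by status and bounding box, then save the result as a GeoPackage for use in QGIS. A rough sketch of such a pipeline, assuming the S3 settings from the MinIO notebook are already configured and that the column names and WKT geometry encoding match the dataset (both are assumptions, not the notebook's exact code):

```python
# Hedged sketch of the 01b workflow: filter in DuckDB, export to GeoPackage.
import duckdb

con = duckdb.connect()
for ext in ("httpfs", "spatial"):
    con.install_extension(ext)
    con.load_extension(ext)

# Illustrative bounding box (xmin, ymin, xmax, ymax).
xmin, ymin, xmax, ymax = 33.9, -4.7, 41.9, 5.5

con.sql(f"""
    CREATE OR REPLACE TABLE filtered AS
    SELECT osm_id, tags, ST_GeomFromText(geometry) AS geom  -- assumes WKT geometries
    FROM read_parquet('s3a://some-bucket/contributions/**', hive_partitioning=true) a
    WHERE status = 'latest'
      AND (a.bbox.xmax >= {xmin} AND a.bbox.xmin <= {xmax})
      AND (a.bbox.ymax >= {ymin} AND a.bbox.ymin <= {ymax});
""")

# The spatial extension writes GeoPackage through GDAL.
con.sql("COPY filtered TO 'osm_contributions.gpkg' WITH (FORMAT GDAL, DRIVER 'GPKG');")
```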
175 changes: 101 additions & 74 deletions book/02a_buildings_currentness_DuckDB_PyIceberg.ipynb

Large diffs are not rendered by default.

48 changes: 24 additions & 24 deletions book/02b_buildings_currentness_DuckDB_only.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion book/03_hot_tm_project_analysis.ipynb
@@ -23,7 +23,7 @@
"id": "e1f3f905-3bea-40ba-b986-952783ac9a85",
"metadata": {},
"source": [
"# Getting started\n",
"## Getting started\n",
"Set connection params."
]
},
253 changes: 182 additions & 71 deletions book/03_mapillary_data_analysis.ipynb

Large diffs are not rendered by default.

163 changes: 98 additions & 65 deletions book/04_Country_User_Activity_DuckDB_only.ipynb
@@ -5,7 +5,7 @@
"id": "e9dc6612-474d-4acc-b582-c1938fc934e7",
"metadata": {},
"source": [
"# DuckDB: Country user activity map\n",
"# User activity map\n",
"In this notebook we demonstrate how to analyze and visualize daily OSM user activity for a country.\n",
"\n",
"These are the steps you see further down:\n",
@@ -21,13 +21,13 @@
"id": "c05030dc-5478-4d48-a1c3-514393d1ce1c",
"metadata": {},
"source": [
"# Getting started\n",
"## Getting started\n",
"Set connection params."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "28abef98-ce6d-416f-b9e5-6902792febec",
"metadata": {},
"outputs": [],
@@ -48,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "d025997b-17a6-42b9-9b76-e95c5293fc79",
"metadata": {},
"outputs": [],
@@ -66,6 +66,46 @@
"con.load_extension(\"spatial\")"
]
},
{
"cell_type": "markdown",
"id": "3698a75f-ff95-463b-98dd-2dd05680c94e",
"metadata": {},
"source": [
"Set the connection params to Iceberg Rest Catalog."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39b78ffb-9f49-4a66-bbb3-b752992deb07",
"metadata": {},
"outputs": [],
"source": [
"!pip install \"pyiceberg[s3fs,duckdb,sql-sqlite,pyarrow]\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "939cfc99-0c29-4e09-872a-083d7364a957",
"metadata": {},
"outputs": [],
"source": [
"from pyiceberg.catalog.rest import RestCatalog\n",
"\n",
"catalog = RestCatalog(\n",
" name=\"default\",\n",
" **{\n",
" \"uri\": \"https://sotm2024.iceberg.ohsome.org\",\n",
" \"s3.endpoint\": \"https://sotm2024.minio.heigit.org\",\n",
" \"py-io-impl\": \"pyiceberg.io.pyarrow.PyArrowFileIO\",\n",
" \"s3.access-key-id\": s3_user,\n",
" \"s3.secret-access-key\": s3_password,\n",
" \"s3.region\": \"eu-central-1\"\n",
" }\n",
")"
]
},
{
"cell_type": "markdown",
"id": "7da992ee-ac70-4d31-98f7-a3a2d301855a",
@@ -123,14 +163,15 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 77,
"id": "c125effb-70cc-4b35-9246-72e334c8f4e4",
"metadata": {},
"outputs": [],
"source": [
"# Set s3 path for parquet input data\n",
"#parquet_data_path = \"s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions_germany_h3/**\"\n",
"parquet_data_path = \"s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions/**\"\n",
"# Set iceberg table\n",
"namespace = 'geo_sort'\n",
"tablename = 'contributions'\n",
"icebergtable = catalog.load_table((namespace, tablename))\n",
"\n",
"# Define location filter\n",
"bboxes = {\n",
Expand All @@ -139,12 +180,11 @@
" 'brazil': (-73.99, -33.77, -34.73, 5.24)\n",
"}\n",
"\n",
"selected_region = 'germany'\n",
"selected_region = 'kenya'\n",
"xmin, ymin, xmax, ymax = bboxes[selected_region]\n",
"\n",
"# Define time range filter\n",
"start_timestamp = '2024-01-01'\n",
"end_timestamp = '2024-06-01'\n",
"start_timestamp = '2024-01-01T00:00:00'\n",
"time_interval = 'day'"
]
},
Expand All @@ -159,57 +199,38 @@
},
{
"cell_type": "code",
"execution_count": 148,
"id": "cb1514a0-6c0b-4313-a777-ae204ee7ce91",
"execution_count": 78,
"id": "b951b95d-11a7-4d2b-a81f-3ab8e5339a1b",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f2e1851ba80f44b388711e63a96f87af",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"this took 328.781 sec.\n"
"download took 16.318 sec.\n"
]
}
],
"source": [
"import time\n",
"start_time = time.time()\n",
"\n",
"query = f\"\"\"\n",
"DROP TABLE IF EXISTS user_contributions;\n",
"CREATE TABLE user_contributions AS\n",
"SELECT\n",
" h3_r5,\n",
" valid_from,\n",
" user_id\n",
"FROM read_parquet('{parquet_data_path}', hive_partitioning=true) a \n",
"WHERE 1=1\n",
" and (status = 'latest' or status = 'history')\n",
" and valid_from >= '{start_timestamp}'\n",
" and valid_from < '{end_timestamp}'\n",
" -- spatial filtering part\n",
" and (a.bbox.xmax >= {xmin} AND a.bbox.xmin <= {xmax})\n",
" and (a.bbox.ymax >= {ymin} AND a.bbox.ymin <= {ymax})\n",
";\n",
"\"\"\"\n",
"con.sql(query)\n",
"icebergtable.scan(\n",
" row_filter=(\n",
" f\"(status = 'latest' or status = 'history')\"\n",
" f\"and (xmax >= {xmin} and xmin <= {xmax}) \"\n",
" f\"and (ymax >= {ymin} and ymin <= {ymax}) \"\n",
" f\"and valid_from >= '{start_timestamp}'\"\n",
" ),\n",
" selected_fields=(\n",
" \"user_id\",\n",
" \"valid_from\",\n",
" \"h3_r5\"\n",
" ),\n",
").to_duckdb('raw_osm_data',connection=con)\n",
"\n",
"download_time = round(time.time() - start_time, 3)\n",
"print(f\"this took {download_time} sec.\")"
"print(f\"download took {download_time} sec.\")"
]
},
{
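The scan above pushes the status, bounding-box, and timestamp filters down to Iceberg and materializes the result as a DuckDB relation named `raw_osm_data`. A quick sanity check on what was downloaded (not part of the notebook) could be:

```python
# Optional checks on the relation registered by .to_duckdb() above;
# `con` is the DuckDB connection opened earlier in the notebook.
con.sql("DESCRIBE raw_osm_data").show()
con.sql("""
    SELECT
        count(*)                AS n_rows,
        count(DISTINCT user_id) AS n_users,
        min(valid_from)         AS first_edit,
        max(valid_from)         AS last_edit
    FROM raw_osm_data
""").show()
```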
@@ -224,26 +245,39 @@
},
{
"cell_type": "code",
"execution_count": 149,
"execution_count": 79,
"id": "7bde9b9b-b4bd-488e-8dc3-b0b5ccbdba79",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"processing took 0.166 sec.\n"
]
}
],
"source": [
"start_time = time.time()\n",
"\n",
"query = f\"\"\"\n",
"INSTALL h3 FROM community;\n",
"LOAD h3;\n",
"\n",
"SELECT\n",
" h3_r5,\n",
" epoch_ms(date_trunc('{time_interval}', valid_from)) as time_interval,\n",
" 1.0 * epoch_ms(date_trunc('{time_interval}', valid_from)) as time_interval,\n",
" count(distinct user_id) as n_users,\n",
" h3_cell_to_boundary_wkt(h3_r5) as geometry\n",
"FROM user_contributions\n",
"FROM raw_osm_data\n",
"GROUP BY 1, 2\n",
"ORDER BY 2, 1;\n",
"\"\"\"\n",
"\n",
"df = con.sql(query).df()"
"df = con.sql(query).df()\n",
"\n",
"processing_time = round(time.time() - start_time, 3)\n",
"print(f\"processing took {processing_time} sec.\")"
]
},
{
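The aggregation query relies on DuckDB's community H3 extension: `h3_cell_to_boundary_wkt` turns each `h3_r5` cell into a WKT polygon, and the result is pulled into pandas as `df`. The step from `df` to the GeoDataFrame `gdf` used by the lonboard cells is not rendered in this diff; one plausible bridge, offered as an assumption rather than the notebook's actual code:

```python
# Possible conversion from the query result `df` (WKT strings in `geometry`)
# to the GeoDataFrame `gdf` consumed by the lonboard layer further down.
import geopandas as gpd

gdf = gpd.GeoDataFrame(
    df.drop(columns=["geometry"]),
    geometry=gpd.GeoSeries.from_wkt(df["geometry"]),
    crs="EPSG:4326",  # assuming lon/lat coordinates from h3_cell_to_boundary_wkt
)
```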
@@ -257,7 +291,7 @@
},
{
"cell_type": "code",
"execution_count": 150,
"execution_count": 80,
"id": "1d7426fa-44ab-461a-97f3-7f378cc15c6d",
"metadata": {},
"outputs": [],
@@ -282,7 +316,7 @@
},
{
"cell_type": "code",
"execution_count": 151,
"execution_count": 81,
"id": "9153d200-dbd6-45dd-ad91-1d6823b32d8a",
"metadata": {},
"outputs": [],
@@ -303,8 +337,8 @@
"heights = gdf[\"height\"].to_numpy()\n",
"heights = np.nan_to_num(heights, nan=1)\n",
"\n",
"min_valid_from = 1000 * datetime.datetime(2024,1,1).timestamp()\n",
"max_valid_from = 1000 * datetime.datetime(2024,6,1).timestamp()\n",
"min_valid_from = 1000 * datetime.datetime(2023,1,1).replace(tzinfo=datetime.timezone.utc).timestamp()\n",
"max_valid_from = 1000 * datetime.datetime(2024,6,1).replace(tzinfo=datetime.timezone.utc).timestamp()\n",
"\n",
"# the lonboard map definition\n",
"layer = lonboard.PolygonLayer.from_geopandas(\n",
@@ -341,17 +375,17 @@
},
{
"cell_type": "code",
"execution_count": 152,
"execution_count": 82,
"id": "b6151959-5eb1-414f-a6d9-1b4d9dab41f3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<traitlets.traitlets.directional_link at 0x742148909ac0>"
"<traitlets.traitlets.directional_link at 0x7da1d41f6720>"
]
},
"execution_count": 152,
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
@@ -365,11 +399,10 @@
"end = datetime.datetime(2024,6,1)\n",
"delta = end - start # returns timedelta\n",
"dates = [start + timedelta(days=i) for i in range(delta.days + 1)]\n",
"options = [(i.strftime('%d-%b-%Y'), int(1000* i.timestamp())) for i in dates]\n",
"options = [(i.strftime('%d-%b-%Y'), 1000* i.replace(tzinfo=datetime.timezone.utc).timestamp()) for i in dates]\n",
"\n",
"date_slider = ipywidgets.SelectionSlider(\n",
" options=options,\n",
" #index=(0, len(dates)-1),\n",
" description='Day:',\n",
" layout=ipywidgets.Layout(width='1000px'),\n",
" disabled=False\n",
@@ -392,14 +425,14 @@
},
{
"cell_type": "code",
"execution_count": 153,
"execution_count": 83,
"id": "d4a7fba3-f047-464e-9c2d-ba3ce2a89726",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2a9704153ea24afdbe496d217d3923fb",
"model_id": "f0abda2707564f008a48071ccd169db0",
"version_major": 2,
"version_minor": 1
},
@@ -413,12 +446,12 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "99b8681c31c148c6b7d0f42d1e15f601",
"model_id": "9b62e0719de04a948fb41741e2e977a8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"SelectionSlider(description='Day:', layout=Layout(width='1000px'), options=(('01-Jan-2024', 1704067200000), ('"
"SelectionSlider(description='Day:', layout=Layout(width='1000px'), options=(('01-Jan-2024', 1704067200000.0), …"
]
},
"metadata": {},
