From 4fc11e95ea4033c8d49e47b52f9f3b77a5bfbaa4 Mon Sep 17 00:00:00 2001 From: h9b Date: Mon, 26 Aug 2024 17:10:34 +0200 Subject: [PATCH] remove unused notebook --- notebooks/01_Data_Extraction.ipynb | 435 ----------------------------- 1 file changed, 435 deletions(-) delete mode 100644 notebooks/01_Data_Extraction.ipynb diff --git a/notebooks/01_Data_Extraction.ipynb b/notebooks/01_Data_Extraction.ipynb deleted file mode 100644 index 2ab9ea6..0000000 --- a/notebooks/01_Data_Extraction.ipynb +++ /dev/null @@ -1,435 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "7ee2c20c-e467-4499-84c5-0d583cee77b6", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "# Data Extraction from *geo-sorted ohsome contributions* \n", - "\n", - ":::{note}\n", - "1. Set the connection params to Iceberg Rest Catalog and configure DuckDB.\n", - "2. Download the data in 3 steps:\n", - " * Do an iceberg table scan with a pre-filter with PyIceberg.\n", - " * Fine filter the data in a Dataframe after download with DuckDB.\n", - " * Export results into geopackage file with GeoPandas.\n", - ":::" - ] - }, - { - "cell_type": "markdown", - "id": "b6ec70ad-bce1-4052-b151-23fe47fad942", - "metadata": {}, - "source": [ - "# Getting started\n", - "Set connection params." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "554c5bc9-8962-44ed-96f8-fd34b3efe564", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "s3_user = os.environ[\"S3_ACCESS_KEY_ID\"] # add your user here\n", - "s3_password = os.environ[\"S3_SECRET_ACCESS_KEY\"] # add your password here" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "473e2ffa-dba9-4ba1-8319-403e3f0ecbdb", - "metadata": {}, - "outputs": [], - "source": [ - "from pyiceberg.catalog.rest import RestCatalog\n", - "\n", - "catalog = RestCatalog(\n", - " name=\"default\",\n", - " **{\n", - " \"uri\": \"https://sotm2024.iceberg.ohsome.org\",\n", - " \"s3.endpoint\": \"https://sotm2024.minio.heigit.org\",\n", - " \"py-io-impl\": \"pyiceberg.io.pyarrow.PyArrowFileIO\",\n", - " \"s3.access-key-id\": s3_user,\n", - " \"s3.secret-access-key\": s3_password,\n", - " \"s3.region\": \"eu-central-1\"\n", - " }\n", - ")\n", - "\n", - "# iceberg table\n", - "namespace = 'geo_sort'\n", - "tablename = 'contributions'\n", - "icebergtable = catalog.load_table((namespace, tablename))" - ] - }, - { - "cell_type": "markdown", - "id": "c4eccfe1-6b87-4640-a33a-3314252d0223", - "metadata": {}, - "source": [ - "Configure DuckDB." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f2e24802-8629-4648-afe5-d4c5f21403df", - "metadata": {}, - "outputs": [], - "source": [ - "import duckdb\n", - "\n", - "con = duckdb.connect(\n", - " config={\n", - " 'threads': 8,\n", - " 'max_memory': '8GB'\n", - " }\n", - ")\n", - "con.install_extension(\"spatial\")\n", - "con.load_extension(\"spatial\")" - ] - }, - { - "cell_type": "markdown", - "id": "bed363ef-ee28-411e-9d56-501c1005beac", - "metadata": {}, - "source": [ - "# Iceberg to DuckDB\n", - "In this step we can already filter all OSM contributions by four major factors. We will perform more detailed filtering (e.g. for OSM tags values) later:\n", - "* **status** (e.g. latest, historic or deleted OSM features)\n", - "* **location** (using the bounding box coordinates of each OSM feature)\n", - "* **geometry type** (e.g. for Polygons, Linestrings or Points)\n", - "* **time** (e.g. the edit timestamp of each OSM contribution)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "872da17b-f490-4a1a-8aec-09ad847db0ea", - "metadata": {}, - "outputs": [], - "source": [ - "# Define status filter\n", - "status = 'latest'\n", - "\n", - "# Define location filter\n", - "bboxes = {\n", - " 'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),\n", - " 'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),\n", - " 'mannheim': (8.41416, 49.410362, 8.58999, 49.590489), \n", - " 'berlin': (13.088345, 52.338271, 13.761161, 52.675509)\n", - "}\n", - "xmin, ymin, xmax, ymax = bboxes['heidelberg']\n", - "\n", - "# Define geometry type filter\n", - "geometry_type = 'Polygon'\n", - "\n", - "# Define time filter (optional)\n", - "min_timestamp = '2024-01-01T00:00:00'\n", - "max_timestamp = '2024-06-01T00:00:00'" - ] - }, - { - "cell_type": "markdown", - "id": "19916f1e-cb3b-417e-bff8-91831cb00f51", - "metadata": {}, - "source": [ - "Furthermore, we define which attributes / columns this download should contain. Check out the [dataset description page](./README.md) to get an overview on all available columns.\n", - "\n", - "Usually you rarely want to extract all available columns as this would reduce speed of the data download. Here we are going to download the following information:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "523676cd-a446-4d09-b660-2332a6d964e5", - "metadata": {}, - "outputs": [], - "source": [ - "selected_fields = [\n", - " \"user_id\",\n", - " \"osm_id\",\n", - " \"osm_version\",\n", - " \"valid_from\",\n", - " \"tags\",\n", - " \"geometry\" \n", - "]" - ] - }, - { - "cell_type": "markdown", - "id": "3da6409d-802d-4162-a6f6-df1ac74a6b50", - "metadata": {}, - "source": [ - ":::{margin} Download speed matters only in this step.\n", - "Download speed matters only in this step.\n", - "\n", - "This is the only step in which we will download data from the server to our client (e.g. your laptop or jupyter notebook server). Internet connection and overall data size are the most common potential bottlenecks for this part of the analysis.

\n", - "

We have optimized the structure for all tables in the geo_sort namespace to filter for status, geometry_type and location.\n", - " \n", - ":::" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d2125907-3028-4e76-ae26-39ac7adf0f94", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "download took 27.171 sec.\n" - ] - } - ], - "source": [ - "import time\n", - "\n", - "start_time = time.time()\n", - "\n", - "icebergtable.scan(\n", - " row_filter= (\n", - " f\"status = '{status}'\"\n", - " f\"and geometry_type = '{geometry_type}'\"\n", - " #f\"and (bbox.xmax >= {xmin} and bbox.xmin <= {xmax})\"\n", - " #f\"and (bbox.ymax >= {ymin} and bbox.ymin <= {ymax})\"\n", - " # optional timestamp filter\n", - " # f\"and valid_from >= '{min_timestamp}'\"\n", - " # f\"and valid_from < '{max_timestamp}'\"\n", - " ),\n", - " selected_fields=selected_fields,\n", - " # optional: limit the number of features downloadd \n", - " limit=25000\n", - ").to_duckdb('osm_data',connection=con)\n", - "\n", - "download_time = round(time.time() - start_time, 3)\n", - "print(f\"download took {download_time} sec.\")" - ] - }, - { - "cell_type": "markdown", - "id": "73b8b5e7-41b7-4c3f-a901-3c5403dc4150", - "metadata": {}, - "source": [ - "# DuckDB to GeoPackage" - ] - }, - { - "cell_type": "markdown", - "id": "3500cf69-363b-452e-be63-0f969b536e5b", - "metadata": {}, - "source": [ - "Show the structure of the data we have just downloaded." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d738f1c2-cbaf-49ce-9350-e6b28e2414b3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "┌─────────────┬───────────────────────┬─────────┬─────────┬─────────┬─────────┐\n", - "│ column_name │ column_type │ null │ key │ default │ extra │\n", - "│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │\n", - "├─────────────┼───────────────────────┼─────────┼─────────┼─────────┼─────────┤\n", - "│ user_id │ INTEGER │ YES │ NULL │ NULL │ NULL │\n", - "│ valid_from │ TIMESTAMP │ YES │ NULL │ NULL │ NULL │\n", - "│ osm_id │ VARCHAR │ YES │ NULL │ NULL │ NULL │\n", - "│ osm_version │ INTEGER │ YES │ NULL │ NULL │ NULL │\n", - "│ tags │ MAP(VARCHAR, VARCHAR) │ YES │ NULL │ NULL │ NULL │\n", - "│ geometry │ VARCHAR │ YES │ NULL │ NULL │ NULL │\n", - "└─────────────┴───────────────────────┴─────────┴─────────┴─────────┴─────────┘" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = \"\"\"\n", - "DESCRIBE\n", - "FROM osm_data;\n", - "\"\"\"\n", - "con.sql(query)" - ] - }, - { - "cell_type": "markdown", - "id": "60155988-a105-4a04-a305-b635e6b41ce7", - "metadata": {}, - "source": [ - "Inspect a few features." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "7fc355bf-cf38-487d-9b80-5cc02b9edc31", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "┌─────────┬─────────────────────┬──────────────────┬─────────────┬──────────────────────┬──────────────────────────────┐\n", - "│ user_id │ valid_from │ osm_id │ osm_version │ tags │ geometry │\n", - "│ int32 │ timestamp │ varchar │ int32 │ map(varchar, varch… │ varchar │\n", - "├─────────┼─────────────────────┼──────────────────┼─────────────┼──────────────────────┼──────────────────────────────┤\n", - "│ 1462877 │ 2013-05-19 10:35:00 │ way/222126104 │ 1 │ {natural=bare_rock… │ POLYGON ((-179.92694939999… │\n", - "│ 1462877 │ 2013-05-19 10:35:28 │ way/222126268 │ 1 │ {natural=bare_rock… │ POLYGON ((-179.7100523 -85… │\n", - "│ 1462877 │ 2013-05-19 10:36:17 │ relation/2939698 │ 1 │ {natural=bare_rock… │ POLYGON ((-178.7729392 -85… │\n", - "│ 1462877 │ 2013-05-19 10:36:26 │ relation/2939760 │ 1 │ {natural=glacier, … │ POLYGON ((-178.7067439 -85… │\n", - "│ 1462877 │ 2013-05-19 10:36:18 │ relation/2939702 │ 1 │ {natural=bare_rock… │ POLYGON ((-178.6380738 -85… │\n", - "└─────────┴─────────────────────┴──────────────────┴─────────────┴──────────────────────┴──────────────────────────────┘" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = \"\"\"\n", - "SELECT *\n", - "FROM osm_data\n", - "LIMIT 5;\n", - "\"\"\"\n", - "con.sql(query)" - ] - }, - { - "cell_type": "markdown", - "id": "9a670374-387b-4848-a295-cb2f2edc33b7", - "metadata": {}, - "source": [ - "Count the number of features in the table when applying a more detailed tag filter.\n", - "\n", - "Furthermore, apply detailed geometry filter for Heidelberg boundary." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d070a68f-1615-4e5f-94ea-3cd837f2be4b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "┌──────────────┐\n", - "│ count_star() │\n", - "│ int64 │\n", - "├──────────────┤\n", - "│ 0 │\n", - "└──────────────┘" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = \"\"\"\n", - "SELECT count(*)\n", - "FROM\n", - " osm_data,\n", - " st_read('../data/Heidelberg.geojson') as heidelberg\n", - "WHERE 1=1\n", - " -- filter for all boundaries in OSM --> boundary=*\n", - " and list_contains(map_keys(tags), 'boundary')\n", - " -- intersect osm data with Heidelberg boundary\n", - " and ST_Intersects(st_GeomFromText(osm_data.geometry), heidelberg.geom)\n", - "\"\"\"\n", - "con.sql(query)" - ] - }, - { - "cell_type": "markdown", - "id": "7c406784-9c12-41f9-9f1e-87b9c571cdc3", - "metadata": {}, - "source": [ - "Export as GeoPackage via GeoPandas." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "3648c939-7813-4be3-b1d0-b51d3fbad4de", - "metadata": {}, - "outputs": [], - "source": [ - "import geopandas as gpd\n", - "\n", - "query = f\"\"\"\n", - " SELECT *\n", - " FROM\n", - " osm_data,\n", - " st_read('../data/Heidelberg.geojson') as heidelberg\n", - " WHERE 1=1\n", - " -- filter for all boundaries in OSM --> boundary=*\n", - " and list_contains(map_keys(tags), 'boundary')\n", - " -- intersect osm data with Heidelberg boundary\n", - " and ST_Intersects(st_GeomFromText(osm_data.geometry), heidelberg.geom)\n", - "\"\"\"\n", - "df = con.sql(query).df()\n", - "\n", - "gdf = gpd.GeoDataFrame(\n", - " df,\n", - " geometry=gpd.GeoSeries.from_wkt(df['geometry'])\n", - ").set_crs('epsg:4326')\n", - "\n", - "output_filename = \"heidelberg_osm_data.gpkg\"\n", - "gdf.to_file(output_filename, driver='GPKG')" - ] - }, - { - "cell_type": "markdown", - "id": "50bf5649-9176-4e7f-83ac-752fc9e6faae", - "metadata": {}, - "source": [ - "# Work with the data in QGIS\n", - "Add your geopackage file in QGIS, e.g. via drag-and-drop or through file manager." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6087000-150b-4580-a3ed-a89236fee716", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}