
Commit

fix all headings and use pyiceberg and duckdb in all notebooks
Hagellach37 committed Aug 31, 2024
1 parent 3a9054e commit 8db81fd
Showing 9 changed files with 1,206 additions and 377 deletions.
2 changes: 1 addition & 1 deletion book/00_MinIO_Object_Store.ipynb
@@ -29,7 +29,7 @@
"id": "8a8d65c2-8638-4385-a40d-0dfa190d4601",
"metadata": {},
"source": [
"# Connect to MinIO via DuckDB"
"## Connect to MinIO via DuckDB"
]
},
{
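The only change to this notebook is the heading level, but since its subject is the DuckDB-to-MinIO connection, a brief illustration may help readers of this commit. This is a minimal sketch with a placeholder endpoint, credentials, and bucket path; it is not taken from the notebook's cells:

```python
# Hypothetical DuckDB <-> MinIO setup; endpoint, keys, and bucket are placeholders.
import duckdb

con = duckdb.connect()
con.install_extension("httpfs")
con.load_extension("httpfs")

# MinIO is S3-compatible, so DuckDB's S3 settings are sufficient.
con.sql("""
    SET s3_endpoint = 'minio.example.org';
    SET s3_url_style = 'path';
    SET s3_use_ssl = true;
    SET s3_access_key_id = 'YOUR_ACCESS_KEY';
    SET s3_secret_access_key = 'YOUR_SECRET_KEY';
""")

# Read a Parquet file directly from the object store.
con.sql("SELECT count(*) FROM read_parquet('s3://my-bucket/some/file.parquet')").show()
```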
128 changes: 58 additions & 70 deletions book/01a_Data_Extraction_DuckDB_PyIceberg.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions book/01b_Data_Extraction_DuckDB_only.ipynb
@@ -22,7 +22,7 @@
"id": "b6ec70ad-bce1-4052-b151-23fe47fad942",
"metadata": {},
"source": [
"# Getting started\n",
"## Getting started\n",
"Set connection params."
]
},
@@ -116,7 +116,7 @@
"id": "bed363ef-ee28-411e-9d56-501c1005beac",
"metadata": {},
"source": [
"# Download with DuckDB\n",
"## Download with DuckDB\n",
"In this step we can already filter all OSM contributions by four major factors. We will perform more detailed filtering (e.g. for OSM tags values) later:\n",
"* **status** (e.g. latest, historic or deleted OSM features)\n",
"* **location** (using the bounding box coordinates of each OSM feature)\n",
@@ -257,7 +257,7 @@
"id": "73b8b5e7-41b7-4c3f-a901-3c5403dc4150",
"metadata": {},
"source": [
"# Save data as GeoPackage"
"## Save data as GeoPackage"
]
},
{
@@ -436,7 +436,7 @@
"id": "50bf5649-9176-4e7f-83ac-752fc9e6faae",
"metadata": {},
"source": [
"# Work with the data in QGIS\n",
"## Work with the data in QGIS\n",
"Add your geopackage file in QGIS, e.g. via drag-and-drop or through file manager."
]
},
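The hunks above describe the DuckDB-only notebook: filter OSM contributions by status and bounding box, then save the result as a GeoPackage for use in QGIS. A rough sketch of such a pipeline, assuming the S3 settings from the MinIO notebook are already configured and that the column names and WKT geometry encoding match the dataset (both are assumptions, not the notebook's exact code):

```python
# Hedged sketch of the 01b workflow: filter in DuckDB, export to GeoPackage.
import duckdb

con = duckdb.connect()
for ext in ("httpfs", "spatial"):
    con.install_extension(ext)
    con.load_extension(ext)

# Illustrative bounding box (xmin, ymin, xmax, ymax).
xmin, ymin, xmax, ymax = 33.9, -4.7, 41.9, 5.5

con.sql(f"""
    CREATE OR REPLACE TABLE filtered AS
    SELECT osm_id, tags, ST_GeomFromText(geometry) AS geom  -- assumes WKT geometries
    FROM read_parquet('s3a://some-bucket/contributions/**', hive_partitioning=true) a
    WHERE status = 'latest'
      AND (a.bbox.xmax >= {xmin} AND a.bbox.xmin <= {xmax})
      AND (a.bbox.ymax >= {ymin} AND a.bbox.ymin <= {ymax});
""")

# The spatial extension writes GeoPackage through GDAL.
con.sql("COPY filtered TO 'osm_contributions.gpkg' WITH (FORMAT GDAL, DRIVER 'GPKG');")
```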
175 changes: 101 additions & 74 deletions book/02a_buildings_currentness_DuckDB_PyIceberg.ipynb

Large diffs are not rendered by default.

48 changes: 24 additions & 24 deletions book/02b_buildings_currentness_DuckDB_only.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion book/03_hot_tm_project_analysis.ipynb
@@ -23,7 +23,7 @@
"id": "e1f3f905-3bea-40ba-b986-952783ac9a85",
"metadata": {},
"source": [
"# Getting started\n",
"## Getting started\n",
"Set connection params."
]
},
253 changes: 182 additions & 71 deletions book/03_mapillary_data_analysis.ipynb

Large diffs are not rendered by default.

163 changes: 98 additions & 65 deletions book/04_Country_User_Activity_DuckDB_only.ipynb
@@ -5,7 +5,7 @@
"id": "e9dc6612-474d-4acc-b582-c1938fc934e7",
"metadata": {},
"source": [
"# DuckDB: Country user activity map\n",
"# User activity map\n",
"In this notebook we demonstrate how to analyze and visualize daily OSM user activity for a country.\n",
"\n",
"These are the steps you see further down:\n",
@@ -21,13 +21,13 @@
"id": "c05030dc-5478-4d48-a1c3-514393d1ce1c",
"metadata": {},
"source": [
"# Getting started\n",
"## Getting started\n",
"Set connection params."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "28abef98-ce6d-416f-b9e5-6902792febec",
"metadata": {},
"outputs": [],
@@ -48,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "d025997b-17a6-42b9-9b76-e95c5293fc79",
"metadata": {},
"outputs": [],
@@ -66,6 +66,46 @@
"con.load_extension(\"spatial\")"
]
},
{
"cell_type": "markdown",
"id": "3698a75f-ff95-463b-98dd-2dd05680c94e",
"metadata": {},
"source": [
"Set the connection params to Iceberg Rest Catalog."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39b78ffb-9f49-4a66-bbb3-b752992deb07",
"metadata": {},
"outputs": [],
"source": [
"!pip install \"pyiceberg[s3fs,duckdb,sql-sqlite,pyarrow]\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "939cfc99-0c29-4e09-872a-083d7364a957",
"metadata": {},
"outputs": [],
"source": [
"from pyiceberg.catalog.rest import RestCatalog\n",
"\n",
"catalog = RestCatalog(\n",
" name=\"default\",\n",
" **{\n",
" \"uri\": \"https://sotm2024.iceberg.ohsome.org\",\n",
" \"s3.endpoint\": \"https://sotm2024.minio.heigit.org\",\n",
" \"py-io-impl\": \"pyiceberg.io.pyarrow.PyArrowFileIO\",\n",
" \"s3.access-key-id\": s3_user,\n",
" \"s3.secret-access-key\": s3_password,\n",
" \"s3.region\": \"eu-central-1\"\n",
" }\n",
")"
]
},
{
"cell_type": "markdown",
"id": "7da992ee-ac70-4d31-98f7-a3a2d301855a",
@@ -123,14 +163,15 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 77,
"id": "c125effb-70cc-4b35-9246-72e334c8f4e4",
"metadata": {},
"outputs": [],
"source": [
"# Set s3 path for parquet input data\n",
"#parquet_data_path = \"s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions_germany_h3/**\"\n",
"parquet_data_path = \"s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions/**\"\n",
"# Set iceberg table\n",
"namespace = 'geo_sort'\n",
"tablename = 'contributions'\n",
"icebergtable = catalog.load_table((namespace, tablename))\n",
"\n",
"# Define location filter\n",
"bboxes = {\n",
Expand All @@ -139,12 +180,11 @@
" 'brazil': (-73.99, -33.77, -34.73, 5.24)\n",
"}\n",
"\n",
"selected_region = 'germany'\n",
"selected_region = 'kenya'\n",
"xmin, ymin, xmax, ymax = bboxes[selected_region]\n",
"\n",
"# Define time range filter\n",
"start_timestamp = '2024-01-01'\n",
"end_timestamp = '2024-06-01'\n",
"start_timestamp = '2024-01-01T00:00:00'\n",
"time_interval = 'day'"
]
},
Expand All @@ -159,57 +199,38 @@
},
{
"cell_type": "code",
"execution_count": 148,
"id": "cb1514a0-6c0b-4313-a777-ae204ee7ce91",
"execution_count": 78,
"id": "b951b95d-11a7-4d2b-a81f-3ab8e5339a1b",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f2e1851ba80f44b388711e63a96f87af",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"this took 328.781 sec.\n"
"download took 16.318 sec.\n"
]
}
],
"source": [
"import time\n",
"start_time = time.time()\n",
"\n",
"query = f\"\"\"\n",
"DROP TABLE IF EXISTS user_contributions;\n",
"CREATE TABLE user_contributions AS\n",
"SELECT\n",
" h3_r5,\n",
" valid_from,\n",
" user_id\n",
"FROM read_parquet('{parquet_data_path}', hive_partitioning=true) a \n",
"WHERE 1=1\n",
" and (status = 'latest' or status = 'history')\n",
" and valid_from >= '{start_timestamp}'\n",
" and valid_from < '{end_timestamp}'\n",
" -- spatial filtering part\n",
" and (a.bbox.xmax >= {xmin} AND a.bbox.xmin <= {xmax})\n",
" and (a.bbox.ymax >= {ymin} AND a.bbox.ymin <= {ymax})\n",
";\n",
"\"\"\"\n",
"con.sql(query)\n",
"icebergtable.scan(\n",
" row_filter=(\n",
" f\"(status = 'latest' or status = 'history')\"\n",
" f\"and (xmax >= {xmin} and xmin <= {xmax}) \"\n",
" f\"and (ymax >= {ymin} and ymin <= {ymax}) \"\n",
" f\"and valid_from >= '{start_timestamp}'\"\n",
" ),\n",
" selected_fields=(\n",
" \"user_id\",\n",
" \"valid_from\",\n",
" \"h3_r5\"\n",
" ),\n",
").to_duckdb('raw_osm_data',connection=con)\n",
"\n",
"download_time = round(time.time() - start_time, 3)\n",
"print(f\"this took {download_time} sec.\")"
"print(f\"download took {download_time} sec.\")"
]
},
{
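The scan above pushes the status, bounding-box, and timestamp filters down to Iceberg and materializes the result as a DuckDB relation named `raw_osm_data`. A quick sanity check on what was downloaded (not part of the notebook) could be:

```python
# Optional checks on the relation registered by .to_duckdb() above;
# `con` is the DuckDB connection opened earlier in the notebook.
con.sql("DESCRIBE raw_osm_data").show()
con.sql("""
    SELECT
        count(*)                AS n_rows,
        count(DISTINCT user_id) AS n_users,
        min(valid_from)         AS first_edit,
        max(valid_from)         AS last_edit
    FROM raw_osm_data
""").show()
```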
@@ -224,26 +245,39 @@
},
{
"cell_type": "code",
"execution_count": 149,
"execution_count": 79,
"id": "7bde9b9b-b4bd-488e-8dc3-b0b5ccbdba79",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"processing took 0.166 sec.\n"
]
}
],
"source": [
"start_time = time.time()\n",
"\n",
"query = f\"\"\"\n",
"INSTALL h3 FROM community;\n",
"LOAD h3;\n",
"\n",
"SELECT\n",
" h3_r5,\n",
" epoch_ms(date_trunc('{time_interval}', valid_from)) as time_interval,\n",
" 1.0 * epoch_ms(date_trunc('{time_interval}', valid_from)) as time_interval,\n",
" count(distinct user_id) as n_users,\n",
" h3_cell_to_boundary_wkt(h3_r5) as geometry\n",
"FROM user_contributions\n",
"FROM raw_osm_data\n",
"GROUP BY 1, 2\n",
"ORDER BY 2, 1;\n",
"\"\"\"\n",
"\n",
"df = con.sql(query).df()"
"df = con.sql(query).df()\n",
"\n",
"processing_time = round(time.time() - start_time, 3)\n",
"print(f\"processing took {processing_time} sec.\")"
]
},
{
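The aggregation query relies on DuckDB's community H3 extension: `h3_cell_to_boundary_wkt` turns each `h3_r5` cell into a WKT polygon, and the result is pulled into pandas as `df`. The step from `df` to the GeoDataFrame `gdf` used by the lonboard cells is not rendered in this diff; one plausible bridge, offered as an assumption rather than the notebook's actual code:

```python
# Possible conversion from the query result `df` (WKT strings in `geometry`)
# to the GeoDataFrame `gdf` consumed by the lonboard layer further down.
import geopandas as gpd

gdf = gpd.GeoDataFrame(
    df.drop(columns=["geometry"]),
    geometry=gpd.GeoSeries.from_wkt(df["geometry"]),
    crs="EPSG:4326",  # assuming lon/lat coordinates from h3_cell_to_boundary_wkt
)
```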
@@ -257,7 +291,7 @@
},
{
"cell_type": "code",
"execution_count": 150,
"execution_count": 80,
"id": "1d7426fa-44ab-461a-97f3-7f378cc15c6d",
"metadata": {},
"outputs": [],
@@ -282,7 +316,7 @@
},
{
"cell_type": "code",
"execution_count": 151,
"execution_count": 81,
"id": "9153d200-dbd6-45dd-ad91-1d6823b32d8a",
"metadata": {},
"outputs": [],
@@ -303,8 +337,8 @@
"heights = gdf[\"height\"].to_numpy()\n",
"heights = np.nan_to_num(heights, nan=1)\n",
"\n",
"min_valid_from = 1000 * datetime.datetime(2024,1,1).timestamp()\n",
"max_valid_from = 1000 * datetime.datetime(2024,6,1).timestamp()\n",
"min_valid_from = 1000 * datetime.datetime(2023,1,1).replace(tzinfo=datetime.timezone.utc).timestamp()\n",
"max_valid_from = 1000 * datetime.datetime(2024,6,1).replace(tzinfo=datetime.timezone.utc).timestamp()\n",
"\n",
"# the lonboard map definition\n",
"layer = lonboard.PolygonLayer.from_geopandas(\n",
@@ -341,17 +375,17 @@
},
{
"cell_type": "code",
"execution_count": 152,
"execution_count": 82,
"id": "b6151959-5eb1-414f-a6d9-1b4d9dab41f3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<traitlets.traitlets.directional_link at 0x742148909ac0>"
"<traitlets.traitlets.directional_link at 0x7da1d41f6720>"
]
},
"execution_count": 152,
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
@@ -365,11 +399,10 @@
"end = datetime.datetime(2024,6,1)\n",
"delta = end - start # returns timedelta\n",
"dates = [start + timedelta(days=i) for i in range(delta.days + 1)]\n",
"options = [(i.strftime('%d-%b-%Y'), int(1000* i.timestamp())) for i in dates]\n",
"options = [(i.strftime('%d-%b-%Y'), 1000* i.replace(tzinfo=datetime.timezone.utc).timestamp()) for i in dates]\n",
"\n",
"date_slider = ipywidgets.SelectionSlider(\n",
" options=options,\n",
" #index=(0, len(dates)-1),\n",
" description='Day:',\n",
" layout=ipywidgets.Layout(width='1000px'),\n",
" disabled=False\n",
@@ -392,14 +425,14 @@
},
{
"cell_type": "code",
"execution_count": 153,
"execution_count": 83,
"id": "d4a7fba3-f047-464e-9c2d-ba3ce2a89726",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2a9704153ea24afdbe496d217d3923fb",
"model_id": "f0abda2707564f008a48071ccd169db0",
"version_major": 2,
"version_minor": 1
},
@@ -413,12 +446,12 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "99b8681c31c148c6b7d0f42d1e15f601",
"model_id": "9b62e0719de04a948fb41741e2e977a8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"SelectionSlider(description='Day:', layout=Layout(width='1000px'), options=(('01-Jan-2024', 1704067200000), ('"
"SelectionSlider(description='Day:', layout=Layout(width='1000px'), options=(('01-Jan-2024', 1704067200000.0), …"
]
},
"metadata": {},
