From 2197b426c4c1dde3710bffd7c390a7b2e9424a32 Mon Sep 17 00:00:00 2001 From: h9b Date: Sat, 31 Aug 2024 22:02:19 +0200 Subject: [PATCH] add coporate edits analysis --- _toc.yml | 1 + book/02_corporate_edits.ipynb | 611 ++++++++++++++++++++++++++++++++++ 2 files changed, 612 insertions(+) create mode 100644 book/02_corporate_edits.ipynb diff --git a/_toc.yml b/_toc.yml index 955a61e..fee8219 100644 --- a/_toc.yml +++ b/_toc.yml @@ -17,6 +17,7 @@ parts: chapters: - file: book/02a_buildings_currentness_DuckDB_PyIceberg.ipynb - file: book/02_vandalism_detection.ipynb + - file: book/02_corporate_edits.ipynb - caption: Data Integration chapters: - file: book/03_hot_tm_project_analysis.ipynb diff --git a/book/02_corporate_edits.ipynb b/book/02_corporate_edits.ipynb new file mode 100644 index 0000000..2bfa165 --- /dev/null +++ b/book/02_corporate_edits.ipynb @@ -0,0 +1,611 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c4f83750-7648-4ccd-8ac3-8c4eab15647a", + "metadata": {}, + "source": [ + "# Corporate Edits Analysis\n", + "In this notebook we demonstrate how you inspect the spatial extent of corporate mapping in OSM.\n", + "We are going to make use of the **Changeset attributes** we have added to ohsome-data-insights.\n", + "\n", + "These are the steps you see further down:\n", + "\n", + "* Set the connection parameters.\n", + "* Prepare your input parameters, e.g. define area of interest and changeset hashtag filters.\n", + "* **Download data** using PyIceberg and DuckDB.\n", + "* Filter and process data with DuckDB.\n", + "* Visualize the results on a map." + ] + }, + { + "cell_type": "markdown", + "id": "99ab0505-5557-49b6-844f-12c4f2d7bf91", + "metadata": {}, + "source": [ + "## Getting started\n", + "Set connection params." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dcb46232-2d10-46eb-9ef5-e1575aa02378", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "s3_user = os.environ[\"S3_ACCESS_KEY_ID\"] # add your user here\n", + "s3_password = os.environ[\"S3_SECRET_ACCESS_KEY\"] # add your password here" + ] + }, + { + "cell_type": "markdown", + "id": "65fa2719-058a-4d95-a566-58b099b7f674", + "metadata": {}, + "source": [ + "Configure DuckDB." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3a40d89b-25cc-403c-a640-e4d064e0c85a", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "con = duckdb.connect(\n", + " config={\n", + " 'threads': 8,\n", + " 'max_memory': '8GB',\n", + " }\n", + ")\n", + "con.install_extension(\"spatial\")\n", + "con.load_extension(\"spatial\")" + ] + }, + { + "cell_type": "markdown", + "id": "0b728f80-ac5c-4a40-8d05-b31a73ba58e9", + "metadata": {}, + "source": [ + "Set the connection params to Iceberg Rest Catalog." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b424c246-def3-4176-9236-2940a807445f", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install \"pyiceberg[s3fs,duckdb,sql-sqlite,pyarrow]\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "384dc0cc-4249-4511-8198-e6fef4770244", + "metadata": {}, + "outputs": [], + "source": [ + "from pyiceberg.catalog.rest import RestCatalog\n", + "\n", + "catalog = RestCatalog(\n", + " name=\"default\",\n", + " **{\n", + " \"uri\": \"https://sotm2024.iceberg.ohsome.org\",\n", + " \"s3.endpoint\": \"https://sotm2024.minio.heigit.org\",\n", + " \"py-io-impl\": \"pyiceberg.io.pyarrow.PyArrowFileIO\",\n", + " \"s3.access-key-id\": s3_user,\n", + " \"s3.secret-access-key\": s3_password,\n", + " \"s3.region\": \"eu-central-1\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3daa7378-4d95-4e41-9efc-4642ee2e98f4", + "metadata": {}, + "source": [ + "## Prepare the input parameters for your analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1f37d396-8858-46d5-8012-ee991aa02d0e", + "metadata": {}, + "outputs": [], + "source": [ + "# Set iceberg table\n", + "namespace = 'geo_sort'\n", + "tablename = 'contributions'\n", + "icebergtable = catalog.load_table((namespace, tablename))\n", + "\n", + "\n", + "# Define time filter (optional)\n", + "min_timestamp = '2019-06-01T00:00:00'\n", + "max_timestamp = '2024-06-01T00:00:00'\n", + "\n", + "\n", + "# Define location filter\n", + "bboxes = {\n", + " 'colombia': (-78.9909352282, -4.29818694419, -66.8763258531, 12.4373031682), \n", + " 'indonesia': (95.2930261576, -10.3599874813, 141.03385176, 5.47982086834),\n", + " 'united_arab_emirates': (51.5795186705, 22.4969475367, 56.3968473651, 26.055464179)\n", + "}\n", + "\n", + "selected_region = 'colombia'\n", + "xmin, ymin, xmax, ymax = bboxes[selected_region]\n", + "\n", + "\n", + "# Define hashtag filter\n", + "corporate_changeset_hashtags = {\n", + " \"amap\": 1,\n", + " \"adt\": 2,\n", + " \"bolt\": 3,\n", + " \"DigitalEgypt\": 4,\n", + " \"expedia\": 5,\n", + " \"gojek\": 6,\n", + " \"MSFTOpenMaps\": 7,\n", + " \"grab\": 8,\n", + " \"Kaart\": 9,\n", + " \"Kontur\": 10,\n", + " \"mbx\": 11,\n", + " \"RocketData\": 12,\n", + " \"disputed_by_claimed_by\": 13,\n", + " \"Snapp\": 14,\n", + " \"stackbox\": 15,\n", + " \"Telenav\": 16,\n", + " \"Lightcyphers\": 17,\n", + " \"tomtom\": 18,\n", + " \"TIDBO\": 19,\n", + " \"WIGeoGIS-OMV\": 20,\n", + " \"نشان\": 21,\n", + " \"mapbox\": 22,\n", + " \"Komoot\": 23,\n", + " \"AppLogica\": 24\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "d7df8d54-0178-4ae7-955b-f727cd6bae17", + "metadata": {}, + "source": [ + "## Get the Data\n", + "First, we do an iceberg table scan with a pre-filter. This is a fast way to download all potential OSM elements that are needed for our analysis.\n", + "\n", + "Here we use one of the **Changeset attributes** we have joined in advance for each OSM contribution. We use the `changeset.hashtags` attribute to inspect all the OSM elements with that have been added by corporate mappers." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9525211e-5cbc-47c1-84a8-f717ba4bb1e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "download took 32.696 sec.\n" + ] + } + ], + "source": [ + "import time\n", + "start_time = time.time()\n", + "\n", + "icebergtable.scan(\n", + " row_filter=(\n", + " f\"(status = 'latest' or status = 'history') \"\n", + " f\"and (xmax >= {xmin} and xmin <= {xmax}) \"\n", + " f\"and (ymax >= {ymin} and ymin <= {ymax}) \"\n", + " f\"and valid_from >= '{min_timestamp}' \"\n", + " f\"and valid_from < '{max_timestamp}' \"\n", + " ),\n", + " selected_fields=(\n", + " \"user_id\",\n", + " \"valid_from\",\n", + " \"changeset\",\n", + " \"h3_r5\",\n", + " \"centroid\"\n", + " )\n", + ").to_duckdb('raw_osm_data',connection=con)\n", + "\n", + "download_time = round(time.time() - start_time, 3)\n", + "print(f\"download took {download_time} sec.\")" + ] + }, + { + "cell_type": "markdown", + "id": "66b9720e-290c-4eed-84d8-0b456e982c2a", + "metadata": {}, + "source": [ + "Filter for the coporate hashtags and assign company name to each contribution." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2b855864-03f5-40bc-8f4c-5984047a1f6a", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "54a805e4f6d64ea68e9a44ecaecdd579", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "processing took 30.636 sec.\n" + ] + } + ], + "source": [ + "import time\n", + "start_time = time.time()\n", + "\n", + "query = f\"\"\"\n", + "INSTALL h3 FROM community;\n", + "LOAD h3;\n", + "\n", + "DROP TABLE IF EXISTS osm_data;\n", + "CREATE TABLE osm_data AS\n", + "(\n", + "SELECT\n", + " a.user_id,\n", + " a.valid_from,\n", + " a.h3_r5,\n", + " h3_cell_to_boundary_wkt(a.h3_r5) as geometry,\n", + " CAST(changeset['hashtags'][1] AS VARCHAR) as hashtag_string,\n", + " CASE\n", + "\t\tWHEN hashtag_string ILIKE '%amap%' THEN 'amap'\n", + "\t\tWHEN hashtag_string ILIKE '%adt%' THEN 'adt'\n", + "\t\tWHEN hashtag_string ILIKE '%bolt%' THEN 'bolt'\n", + "\t\tWHEN hashtag_string ILIKE '%DigitalEgypt%' THEN 'DigitalEgypt'\n", + "\t\tWHEN hashtag_string ILIKE '%expedia%' THEN 'expedia'\n", + "\t\tWHEN hashtag_string ILIKE '%gojek%' THEN 'gojek'\n", + "\t\tWHEN hashtag_string ILIKE '%MSFTOpenMaps%' THEN 'MSFTOpenMaps'\n", + "\t\tWHEN hashtag_string ILIKE '%grab%' THEN 'grab'\n", + "\t\tWHEN hashtag_string ILIKE '%Kaart%' THEN 'Kaart'\n", + "\t\tWHEN hashtag_string ILIKE '%Kontur%' THEN 'Kontur'\n", + "\t\tWHEN hashtag_string ILIKE '%mbx%' THEN 'mbx'\n", + "\t\tWHEN hashtag_string ILIKE '%RocketData%' THEN 'RocketData'\n", + "\t\tWHEN hashtag_string ILIKE '%disputed_by_claimed_by%' THEN 'disputed_by_claimed_by'\n", + "\t\tWHEN hashtag_string ILIKE '%Snapp%' THEN 'Snapp'\n", + "\t\tWHEN hashtag_string ILIKE '%stackbox%' THEN 'stackbox'\n", + "\t\tWHEN hashtag_string ILIKE '%Telenav%' THEN 'Telenav'\n", + "\t\tWHEN hashtag_string ILIKE '%Lightcyphers%' THEN 'Lightcyphers'\n", + "\t\tWHEN hashtag_string ILIKE '%tomtom%' THEN 'tomtom'\n", + "\t\tWHEN hashtag_string ILIKE '%TIDBO%' THEN 'TIDBO'\n", + "\t\tWHEN hashtag_string ILIKE '%WIGeoGIS-OMV%' THEN 'WIGeoGIS-OMV'\n", + "\t\tWHEN hashtag_string ILIKE '%نشان%' THEN 'Neshan'\n", + "\t\tWHEN hashtag_string ILIKE '%mapbox%' THEN 'mapbox'\n", + "\t\tWHEN hashtag_string ILIKE '%Komoot%' THEN 'Komoot'\n", + "\t\tWHEN hashtag_string ILIKE '%AppLogica%' THEN 'AppLogica'\n", + "\t\tELSE 'nc'\n", + "\tEND AS corporation\n", + "FROM\n", + " raw_osm_data as a,\n", + "WHERE 1=1\n", + " and (centroid.x >= {xmin} and centroid.x <= {xmax})\n", + " and (centroid.y >= {ymin} and centroid.y <= {ymax})\n", + ")\n", + ";\n", + "\"\"\"\n", + "con.sql(query)\n", + "\n", + "processing_time = round(time.time() - start_time, 3)\n", + "print(f\"processing took {processing_time} sec.\")" + ] + }, + { + "cell_type": "markdown", + "id": "9daa8d23-2584-4031-bb0d-38aa751e2504", + "metadata": {}, + "source": [ + "## Display Heatmap of corporate edits" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d64aa54c-aee2-46a9-9b81-ea854d91e5a2", + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "\n", + "map_query = \"\"\"\n", + " SELECT\n", + " epoch_ms(date_trunc('month', valid_from)) as month,\n", + " corporation,\n", + " h3_r5,\n", + " geometry,\n", + " count(*) as n_edits,\n", + " count(distinct user_id) as n_users\n", + " FROM osm_data\n", + " GROUP by month, corporation, h3_r5, geometry;\n", + "\"\"\"\n", + "\n", + "df = con.sql(map_query).df()\n", + "\n", + "df[\"corporation_value\"] = df[\"corporation\"].map(corporate_changeset_hashtags)\n", + "df.fillna({\"corporation_value\": 0}, inplace=True)\n", + "\n", + "# convert the data to geodata\n", + "gdf = gpd.GeoDataFrame(\n", + " df,\n", + " geometry=gpd.GeoSeries.from_wkt(df['geometry'])\n", + ").set_crs('epsg:4326')" + ] + }, + { + "cell_type": "markdown", + "id": "6400d1c4-62be-48ef-a136-8b00d52f0fce", + "metadata": {}, + "source": [ + "Define map parameters and style." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a05ec464-a6f3-4922-9377-b845eb348492", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import datetime\n", + "import lonboard\n", + "from palettable.colorbrewer.sequential import Blues_9\n", + "\n", + "\n", + "# compute lonboard color style for contious color map\n", + "min_valid_from = 1000 * datetime.datetime(2019,6,1).timestamp()\n", + "max_valid_from = 1000 * datetime.datetime(2024,6,1).timestamp()\n", + "\n", + "min_value = 0\n", + "max_value = gdf[\"n_users\"].max()\n", + "\n", + "# normalized color values from 0 to 1\n", + "user_activity_style = gdf[\"n_edits\"].apply(\n", + " lambda x: (x - min_value) / (max_value - min_value))\n", + "\n", + "filter_values = np.column_stack(\n", + " [gdf[\"month\"], gdf[\"corporation_value\"]]\n", + ")\n", + "\n", + "\n", + "initial_filter_range = [\n", + " [min_valid_from, max_valid_from],\n", + " [0, 0]\n", + "]\n", + "\n", + "\n", + "gdf[\"height\"] = gdf[\"n_edits\"] \n", + "heights = gdf[\"height\"].to_numpy()\n", + "heights = np.nan_to_num(heights, nan=1)\n", + "\n", + "# the lonboard map definition\n", + "layer = lonboard.PolygonLayer.from_geopandas(\n", + " gdf,\n", + " extensions=[lonboard.layer_extension.DataFilterExtension(filter_size=2)],\n", + " extruded=True,\n", + " get_elevation=heights,\n", + " get_filter_value=filter_values, # replace with desired column\n", + " filter_range=initial_filter_range, # replace with desired filter range\n", + " get_fill_color=lonboard.colormap.apply_continuous_cmap(user_activity_style, Blues_9, alpha=.75),\n", + ")\n", + "\n", + "view_state = {\n", + " \"longitude\": xmin + ((xmax - xmin) / 2),\n", + " \"latitude\": ymin + ((ymax - ymin) / 2),\n", + " \"zoom\": 5,\n", + " \"pitch\": 25\n", + "}\n", + "\n", + "currentness_map = lonboard.Map(\n", + " basemap_style=lonboard.basemap.CartoBasemap.Positron,\n", + " layers=[layer],\n", + " view_state=view_state\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b584518f-0001-4b8a-afef-80737422446a", + "metadata": {}, + "source": [ + "Define Dates Slider Selection Widget and link to map filter range." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f7e04c25-06f2-42bc-9d7f-7386f22b419d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datetime import date, timedelta\n", + "import ipywidgets\n", + "from traitlets import directional_link\n", + "\n", + "start = datetime.datetime(2019,6,1)\n", + "end = datetime.datetime(2024,7,1)\n", + "delta = end - start # returns timedelta\n", + "dates = [start + timedelta(days=i) for i in range(delta.days + 1)]\n", + "date_options = [(i.strftime('%d-%b-%Y'), int(1000* i.timestamp())) for i in dates]\n", + "\n", + "date_slider = ipywidgets.SelectionRangeSlider(\n", + " options=date_options,\n", + " index=(0, len(dates)-1),\n", + " description='Last Edit:',\n", + " layout=ipywidgets.Layout(width='1000px'),\n", + " disabled=False\n", + ")\n", + "\n", + "group_options= [('non-corporate', 0)]\n", + "group_options += [(name, value) for name, value in corporate_changeset_hashtags.items() ]\n", + "\n", + "corporate_slider_range = ipywidgets.IntRangeSlider(\n", + " options=group_options,\n", + " index=(0, len(group_options)-1),\n", + " description='Corporate:',\n", + " layout=ipywidgets.Layout(width='1000px'),\n", + " disabled=False\n", + ")\n", + "\n", + "corporate_slider = ipywidgets.SelectionSlider(\n", + " options=group_options,\n", + " index=len(group_options)-1,\n", + " description='Group:',\n", + " layout=ipywidgets.Layout(width='1000px'),\n", + " disabled=False,\n", + " min=0,\n", + " max=25,\n", + ")\n", + "\n", + "corporate_slider = ipywidgets.ToggleButtons(\n", + " options=group_options,\n", + " description='Group:',\n", + " disabled=False,\n", + " button_style='', # 'success', 'info', 'warning', 'danger' or ''\n", + ")\n", + "\n", + "\n", + "directional_link(\n", + " (corporate_slider, 'value'),\n", + " (corporate_slider_range, \"value\"),\n", + " transform=lambda v: (v, v)\n", + ")\n", + "\n", + "multi_slider = lonboard.controls.MultiRangeSlider([date_slider, corporate_slider_range])\n", + "\n", + "directional_link(\n", + " (multi_slider, 'value'),\n", + " (layer, \"filter_range\"),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "540fd0f5-6340-459a-b57a-a0efd20598c0", + "metadata": {}, + "source": [ + "Display the map. Have fun exploring and moving around the time slider! Click on the buttons to switch between groups." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "78f120ab-f52b-4b80-8326-a90ebbcd5746", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4f4af7bb93a2481485b4ee372b642142", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "Map(basemap_style=, la…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0868f8b868464d45a2069d3fec7078de", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "SelectionRangeSlider(description='Last Edit:', index=(0, 1857), layout=Layout(width='1000px'), options=(('01-J…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "42e68e9d005941f9a8664597601bb736", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "ToggleButtons(description='Group:', options=(('non-corporate', 0), ('amap', 1), ('adt', 2), ('bolt', 3), ('Dig…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(currentness_map, date_slider, corporate_slider)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a1ff6f1-86f6-45de-8257-d15d6b6f564e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}