Skip to content

Commit

Permalink
Handle mixed geometry types in viz (#495)
Browse files Browse the repository at this point in the history
### Change list

- Handle mixed geometry types in `viz` for GeoDataFrames, Shapely
arrays, and `__geo_interface__` inputs
- Handle mixed geometry types in wkb-encoded geoarrow input.

Closes #491

Monaco OSM data from #491. cc @RaczeQ

<img width="577" alt="image"
src="https://github.com/developmentseed/lonboard/assets/15164633/601efa4c-a8f6-424c-9082-974b50d8b1b2">
  • Loading branch information
kylebarron authored May 2, 2024
1 parent b56f32a commit d29a31c
Show file tree
Hide file tree
Showing 6 changed files with 294 additions and 90 deletions.
115 changes: 77 additions & 38 deletions lonboard/_geoarrow/parse_wkb.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,103 @@
"""Handle GeoArrow tables with WKB-encoded geometry"""

import json
from typing import Tuple
from typing import List

import numpy as np
import pyarrow as pa
import shapely
import shapely.geometry
from shapely import GeometryType

from lonboard._constants import EXTENSION_NAME, OGC_84
from lonboard._geoarrow.crs import get_field_crs
from lonboard._geoarrow.extension_types import construct_geometry_array
from lonboard._utils import get_geometry_column_index


def parse_wkb_table(table: pa.Table) -> pa.Table:
def parse_wkb_table(table: pa.Table) -> List[pa.Table]:
"""Parse a table with a WKB column into GeoArrow-native geometries.
If no columns are WKB-encoded, returns the input. Note that WKB columns must be
tagged with an extension name of `geoarrow.wkb` or `ogc.wkb`
Returns one table per GeoArrow geometry type. E.g. if the input has points, lines,
and polygons, then it returns three tables.
"""
table = parse_geoparquet_table(table)
field_idx = get_geometry_column_index(table.schema)

wkb_names = {EXTENSION_NAME.WKB, EXTENSION_NAME.OGC_WKB}
for field_idx in range(len(table.schema)):
field = table.field(field_idx)
column = table.column(field_idx)
# For non-geometry table, just return as-is
if field_idx is None:
return [table]

if not field.metadata:
continue
field = table.field(field_idx)
column = table.column(field_idx)

extension_type_name = field.metadata.get(b"ARROW:extension:name")
if extension_type_name in wkb_names:
new_field, new_column = parse_wkb_column(field, column)
table = table.set_column(field_idx, new_field, new_column)
extension_type_name = field.metadata.get(b"ARROW:extension:name")

return table
# For native GeoArrow input, return table as-is
if extension_type_name not in {EXTENSION_NAME.WKB, EXTENSION_NAME.OGC_WKB}:
return [table]

# Handle WKB input
parsed_tables = []
crs_str = get_field_crs(field)
shapely_arr = shapely.from_wkb(column)

type_ids = np.array(shapely.get_type_id(shapely_arr))
unique_type_ids = set(np.unique(type_ids))

if GeometryType.GEOMETRYCOLLECTION in unique_type_ids:
raise ValueError("GeometryCollections not currently supported")

if GeometryType.LINEARRING in unique_type_ids:
raise ValueError("LinearRings not currently supported")

point_indices = np.where(
(type_ids == GeometryType.POINT) | (type_ids == GeometryType.MULTIPOINT)
)[0]

linestring_indices = np.where(
(type_ids == GeometryType.LINESTRING)
| (type_ids == GeometryType.MULTILINESTRING)
)[0]

polygon_indices = np.where(
(type_ids == GeometryType.POLYGON) | (type_ids == GeometryType.MULTIPOLYGON)
)[0]

if len(point_indices) > 0:
point_field, point_arr = construct_geometry_array(
shapely_arr[point_indices],
crs_str=crs_str,
)
point_table = table.take(point_indices).set_column(
field_idx, point_field, point_arr
)
parsed_tables.append(point_table)

if len(linestring_indices) > 0:
linestring_field, linestring_arr = construct_geometry_array(
shapely_arr[linestring_indices],
crs_str=crs_str,
)
linestring_table = table.take(linestring_indices).set_column(
field_idx, linestring_field, linestring_arr
)
parsed_tables.append(linestring_table)

if len(polygon_indices) > 0:
polygon_field, polygon_arr = construct_geometry_array(
shapely_arr[polygon_indices],
crs_str=crs_str,
)
polygon_table = table.take(polygon_indices).set_column(
field_idx, polygon_field, polygon_arr
)
parsed_tables.append(polygon_table)

return parsed_tables


def parse_geoparquet_table(table: pa.Table) -> pa.Table:
Expand Down Expand Up @@ -71,28 +135,3 @@ def parse_geoparquet_table(table: pa.Table) -> pa.Table:
table = table.set_column(column_idx, new_field, existing_column)

return table


def parse_wkb_column(
field: pa.Field, column: pa.ChunkedArray
) -> Tuple[pa.Field, pa.ChunkedArray]:
crs_str = get_field_crs(field)

# We call shapely.from_wkb on the _entire column_ so that we don't get mixed type
# arrays in each column.
shapely_arr = shapely.from_wkb(column)
new_field, geom_arr = construct_geometry_array(
shapely_arr,
crs_str=crs_str,
)

# Slice full array to maintain chunking
chunk_offsets = [0]
for chunk in column.chunks:
chunk_offsets.append(len(chunk) + chunk_offsets[-1])

chunks = []
for start_slice, end_slice in zip(chunk_offsets[:-1], chunk_offsets[1:]):
chunks.append(geom_arr.slice(start_slice, end_slice - start_slice))

return new_field, pa.chunked_array(chunks)
7 changes: 6 additions & 1 deletion lonboard/_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,12 @@ def __init__(
table = pa.table(table)

table = remove_extension_classes(table)
table = parse_wkb_table(table)
parsed_tables = parse_wkb_table(table)
assert len(parsed_tables) == 1, (
"Mixed geometry type input not supported here. Use the top "
"level viz() function or separate your geometry types in advanced."
)
table = parsed_tables[0]
table = transpose_table(table)

# Reproject table to WGS84 if needed
Expand Down
68 changes: 65 additions & 3 deletions lonboard/_utils.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,43 @@
from typing import Any, Dict, Optional, Sequence, TypeVar
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, TypeVar

import numpy as np
import pandas as pd
import pyarrow as pa
import shapely
from shapely import GeometryType

from lonboard._base import BaseExtension
from lonboard._constants import EXTENSION_NAME

if TYPE_CHECKING:
import geopandas as gpd

DF = TypeVar("DF", bound=pd.DataFrame)

GEOARROW_EXTENSION_TYPE_NAMES = {e.value for e in EXTENSION_NAME}


def get_geometry_column_index(schema: pa.Schema) -> Optional[int]:
"""Get the positional index of the geometry column in a pyarrow Schema"""
field_idxs = []

for field_idx in range(len(schema)):
field_metadata = schema.field(field_idx).metadata
if (
field_metadata
and field_metadata.get(b"ARROW:extension:name")
in GEOARROW_EXTENSION_TYPE_NAMES
):
return field_idx
field_idxs.append(field_idx)

return None
if len(field_idxs) > 1:
raise ValueError("Multiple geometry columns not supported.")
elif len(field_idxs) == 1:
return field_idxs[0]
else:
return None


def auto_downcast(df: DF) -> DF:
Expand Down Expand Up @@ -99,3 +113,51 @@ def remove_extension_kwargs(
)

return extension_kwargs


def split_mixed_gdf(gdf: gpd.GeoDataFrame) -> List[gpd.GeoDataFrame]:
"""Split a GeoDataFrame into one or more GeoDataFrames with unique geometry type"""

type_ids = np.array(shapely.get_type_id(gdf.geometry))
unique_type_ids = set(np.unique(type_ids))

if GeometryType.GEOMETRYCOLLECTION in unique_type_ids:
raise ValueError("GeometryCollections not currently supported")

if GeometryType.LINEARRING in unique_type_ids:
raise ValueError("LinearRings not currently supported")

if len(unique_type_ids) == 1:
return [gdf]

if len(unique_type_ids) == 2:
if unique_type_ids == {GeometryType.POINT, GeometryType.MULTIPOINT}:
return [gdf]

if unique_type_ids == {GeometryType.LINESTRING, GeometryType.MULTILINESTRING}:
return [gdf]

if unique_type_ids == {GeometryType.POLYGON, GeometryType.MULTIPOLYGON}:
return [gdf]

gdfs = []
point_indices = np.where(
(type_ids == GeometryType.POINT) | (type_ids == GeometryType.MULTIPOINT)
)[0]
if len(point_indices) > 0:
gdfs.append(gdf.iloc[point_indices])

linestring_indices = np.where(
(type_ids == GeometryType.LINESTRING)
| (type_ids == GeometryType.MULTILINESTRING)
)[0]
if len(linestring_indices) > 0:
gdfs.append(gdf.iloc[linestring_indices])

polygon_indices = np.where(
(type_ids == GeometryType.POLYGON) | (type_ids == GeometryType.MULTIPOLYGON)
)[0]
if len(polygon_indices) > 0:
gdfs.append(gdf.iloc[polygon_indices])

return gdfs
Loading

0 comments on commit d29a31c

Please sign in to comment.