Skip to content

Commit

Permalink
Merge pull request #192 from martincollignon/fix/geometry-orientation
Browse files Browse the repository at this point in the history
fix: ensure proper geometry orientation for BigQuery
  • Loading branch information
martincollignon authored Dec 14, 2024
2 parents d591f48 + be89ade commit 2768e9f
Showing 1 changed file with 153 additions and 1 deletion.
154 changes: 153 additions & 1 deletion backend/src/sources/utils/geometry_validator.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,143 @@
from shapely.geometry import Polygon, MultiPolygon
from shapely.geometry.polygon import orient
from shapely.validation import explain_validity
import geopandas as gpd
import logging
from shapely.wkt import loads, dumps
from shapely.ops import unary_union

logger = logging.getLogger(__name__)

def _clean_ring_coords(ring):
    """Return the ring's coordinates without consecutive duplicates, closed."""
    coords = list(ring.coords)
    cleaned = [
        pt for i, pt in enumerate(coords)
        if i == 0 or pt != coords[i - 1]
    ]
    # A ring must end on its starting vertex.
    if cleaned and cleaned[0] != cleaned[-1]:
        cleaned.append(cleaned[0])
    return cleaned


def _orient_polygonal(geom):
    """Orient a Polygon or every part of a MultiPolygon with sign=1.0."""
    if geom.is_empty:
        return geom
    if isinstance(geom, MultiPolygon):
        # orient() from shapely.geometry.polygon reads `.exterior`, which a
        # MultiPolygon does not have, so each part is oriented on its own.
        return MultiPolygon([orient(part, sign=1.0) for part in geom.geoms])
    if isinstance(geom, Polygon):
        return orient(geom, sign=1.0)
    return geom


def fix_for_bigquery(geom):
    """
    Attempt to fix geometry issues that would make it invalid for BigQuery:
    - Remove duplicate vertices
    - Fix self-intersections
    - Ensure proper ring orientation
    - Fix edge crossings

    Args:
        geom: Geometry to repair. Anything that is not a Polygon or
            MultiPolygon is returned unchanged.

    Returns:
        The repaired Polygon/MultiPolygon, the input itself for
        non-polygonal geometries, or None if the geometry is unfixable
        (including when any unexpected error occurs; the error is logged).
    """
    try:
        if not isinstance(geom, (Polygon, MultiPolygon)):
            return geom

        # First try basic buffer(0) to fix self-intersections
        cleaned = geom.buffer(0)
        if not cleaned.is_valid:
            return None

        # Convert to WKT and back to normalize
        cleaned = loads(dumps(cleaned))

        # buffer(0) can collapse a degenerate input to an empty geometry;
        # there is nothing left to repair in that case.
        if cleaned.is_empty or not isinstance(cleaned, (Polygon, MultiPolygon)):
            return None

        # Handle MultiPolygon vs Polygon
        polygons = cleaned.geoms if isinstance(cleaned, MultiPolygon) else [cleaned]
        fixed_polys = []

        for poly in polygons:
            ext_coords = _clean_ring_coords(poly.exterior)
            # Only keep interior rings that still have at least 4 points
            int_rings = [
                coords
                for coords in (_clean_ring_coords(ring) for ring in poly.interiors)
                if len(coords) >= 4
            ]

            # A closed exterior ring needs at least 4 points (first == last)
            if len(ext_coords) < 4:
                continue

            try:
                fixed_poly = Polygon(ext_coords, int_rings)
                if not fixed_poly.is_valid:
                    continue
            except Exception as e:
                logger.warning(f"Could not create polygon: {str(e)}")
                continue

            fixed_polys.append(fixed_poly)

        if not fixed_polys:
            return None

        # Create final geometry
        final_geom = MultiPolygon(fixed_polys) if len(fixed_polys) > 1 else fixed_polys[0]

        # Ensure proper orientation (per polygon, so multi-part results work)
        final_geom = _orient_polygonal(final_geom)

        # Final validity check
        if not final_geom.is_valid:
            # Try one last unary_union as a last resort
            final_geom = unary_union([final_geom])
            if not final_geom.is_valid:
                return None
            # The union builds new rings, so orientation must be re-applied.
            final_geom = _orient_polygonal(final_geom)

        return final_geom

    except Exception as e:
        logger.error(f"Error fixing geometry for BigQuery: {str(e)}")
        return None

def _ring_is_clean(ring) -> bool:
    """Return True if the ring has >= 4 points and no repeated neighbours."""
    pts = list(ring.coords)
    # A closed ring needs at least 4 points (first == last).
    if len(pts) < 4:
        return False
    return all(current != following for current, following in zip(pts, pts[1:]))


def is_valid_for_bigquery(geom) -> bool:
    """
    Report whether a geometry passes the local BigQuery pre-checks.

    The geometry is round-tripped through WKT and must be valid afterwards.
    For Polygon and MultiPolygon inputs, every exterior and interior ring
    must additionally have at least 4 points and no duplicate consecutive
    vertices. Ring orientation is not inspected by this function.

    Returns:
        True if all checks pass; False if any check fails or if an error is
        raised while checking (the error is logged).
    """
    try:
        # Round-trip through WKT before testing validity
        round_tripped = loads(dumps(geom))

        if not round_tripped.is_valid:
            return False

        # Non-polygonal geometries only need to be valid
        if not isinstance(round_tripped, (Polygon, MultiPolygon)):
            return True

        if isinstance(round_tripped, MultiPolygon):
            parts = round_tripped.geoms
        else:
            parts = [round_tripped]

        for part in parts:
            if not _ring_is_clean(part.exterior):
                return False
            for hole in part.interiors:
                if not _ring_is_clean(hole):
                    return False

        return True

    except Exception as e:
        logger.error(f"Error checking BigQuery validity: {str(e)}")
        return False

def validate_and_transform_geometries(gdf: gpd.GeoDataFrame, dataset_name: str) -> gpd.GeoDataFrame:
"""
Validates and transforms geometries while preserving original areas.
Expand Down Expand Up @@ -45,6 +178,25 @@ def validate_and_transform_geometries(gdf: gpd.GeoDataFrame, dataset_name: str)
lambda geom: orient(geom, sign=1.0) if isinstance(geom, (Polygon, MultiPolygon)) else geom
)

# Check and fix BigQuery compatibility
logger.info(f"{dataset_name}: Checking BigQuery compatibility")
bq_valid_mask = gdf.geometry.apply(is_valid_for_bigquery)
if not bq_valid_mask.all():
invalid_count = (~bq_valid_mask).sum()
logger.warning(f"{dataset_name}: Found {invalid_count} geometries not valid for BigQuery. Attempting to fix...")

# Try to fix invalid geometries
invalid_indices = gdf[~bq_valid_mask].index
fixed_geometries = gdf.loc[invalid_indices, 'geometry'].apply(fix_for_bigquery)

# Update fixed geometries and remove unfixable ones
fixed_mask = ~fixed_geometries.isna()
if fixed_mask.any():
gdf.loc[invalid_indices[fixed_mask], 'geometry'] = fixed_geometries[fixed_mask]

# Remove remaining invalid geometries
gdf = gdf[gdf.geometry.apply(is_valid_for_bigquery)]

# Remove nulls and empty geometries
gdf = gdf.dropna(subset=['geometry'])
gdf = gdf[~gdf.geometry.is_empty]
Expand All @@ -69,4 +221,4 @@ def validate_and_transform_geometries(gdf: gpd.GeoDataFrame, dataset_name: str)

except Exception as e:
logger.error(f"{dataset_name}: Error in geometry validation: {str(e)}")
raise
raise

0 comments on commit 2768e9f

Please sign in to comment.