diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index cf615c6..621e7e8 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -22,7 +22,7 @@ jobs: - {os: windows-latest, r: 'release'} - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} - - {os: ubuntu-latest, r: 'oldrel-1'} +# - {os: ubuntu-latest, r: 'oldrel-1'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} diff --git a/DESCRIPTION b/DESCRIPTION index e308d2f..9ca47f4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,8 @@ Imports: DBI, dbplyr, dplyr, - duckdb (>= 0.8.1) + duckdb (>= 0.8.1), + fs Suggests: curl, sf, diff --git a/NAMESPACE b/NAMESPACE index 403c0ac..b3b2166 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,11 @@ # Generated by roxygen2: do not edit by hand +export(as_view) export(cached_connection) export(close_connection) export(duckdb_s3_config) export(load_spatial) export(open_dataset) +export(spatial_join) export(to_sf) export(write_dataset) diff --git a/NEWS.md b/NEWS.md index 294ab10..b6a4dd7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,14 @@ # duckdbfs 0.0.4 -* `open_dataset()` gains option `sf` to format, allowing users to parse - spatial vector data in simple features standard (objects read by `sf`) +* `open_dataset()` gains the ability to read spatial vector data formats + (objects read by `sf`) using `format="sf"` * default geometry column in `to_sf()` is now termed `geom`, to match the default used in `duckdb`'s `st_read()` function. +* `open_dataset()` now tries to guess the data format instead of defaulting to + parquet when no format is explicitly provided. + +* new function, `spatial_join()`, allows a variety of spatial joins. +* new helper function, `as_view()`, creates a temporary view of a query. 
# duckdbfs 0.0.3 diff --git a/R/open_dataset.R b/R/open_dataset.R index a8aa2a4..4d87249 100644 --- a/R/open_dataset.R +++ b/R/open_dataset.R @@ -14,8 +14,9 @@ #' column name across all files (NOTE: this can add considerably to #' the initial execution time) #' @param format The format of the dataset files. One of `"parquet"`, `"csv"`, -#' `"tsv"`, `"text"` or `"sf"` (for any Simple Features spatial dataset supported -#' by the sf package). +#' `"tsv"`, or `"sf"` (spatial vector files supported by the sf package / GDAL). +#' if no argument is provided, the function will try to guess the type based +#' on minimal heuristics. #' @param conn A connection to a database. #' @param tblname The name of the table to create in the database. #' @param mode The mode to create the table in. One of `"VIEW"` or `"TABLE"`. @@ -60,7 +61,7 @@ open_dataset <- function(sources, schema = NULL, hive_style = TRUE, unify_schemas = FALSE, - format = c("parquet", "csv", "tsv", "text", "sf"), + format = c("parquet", "csv", "tsv", "sf"), conn = cached_connection(), tblname = tmp_tbl_name(), mode = "VIEW", @@ -68,6 +69,8 @@ open_dataset <- function(sources, recursive = TRUE, ...) { + format <- select_format(sources, format) + sources <- parse_uri(sources, conn = conn, recursive = recursive) if(length(list(...)) > 0) { # can also be specified in URI query notation @@ -77,8 +80,6 @@ open_dataset <- function(sources, # ensure active connection version <- DBI::dbExecute(conn, "PRAGMA version;") - - format <- match.arg(format) if(format == "sf") { load_spatial(conn = conn) } @@ -95,6 +96,46 @@ open_dataset <- function(sources, dplyr::tbl(conn, tblname) } +select_format <- function(sources, format) { + ## does not guess file types in s3 buckets. 
+ + if(length(format) == 1) { + return(format) + } + + # format for vector sources always based on first element + sources <- sources[[1]] + + # default to parquet for S3 addresses + if(grepl("^s3://", sources)) { + return("parquet") + } + + if( fs::is_dir(sources) ) { + sources <- fs::dir_ls(sources, recurse = TRUE, type="file") + sources <- sources[[1]] + } + format <- tools::file_ext(sources) + + # detect spatial types + if(grepl("^/vsi", sources)) { + return("sf") + } + if(format %in% c("fgb", "shp", "json", "geojson", "gdb", "gpkg", + "kml", "gmt")) { + return("sf") + } + + + # default + if (format == "") { + return("parquet") + } + + format +} + + use_recursive <- function(sources) { !all(identical(tools::file_ext(sources), "")) } @@ -112,7 +153,14 @@ query_string <- function(tblname, union_by_name = FALSE, filename = FALSE) { - format <- match.arg(format) + # format <- match.arg(format) + scanner <- switch(format, + "parquet" = "parquet_scan(", + "csv" = "read_csv_auto(", + "sf" = "st_read(", + "read_csv_auto(" + ) + source_uris <- vec_as_str(sources) ## Allow overwrites on VIEW @@ -120,14 +168,6 @@ query_string <- function(tblname, "VIEW" = "OR REPLACE TEMPORARY VIEW", "TABLE" = "TABLE") - scanner <- switch(format, - "parquet" = "parquet_scan(", - "csv" = "read_csv_auto(", - "tsv" = "read_csv_auto(", - "text" = "read_csv_auto(", - "sf" = "st_read(" - ) - tabular_options <- paste0( ", HIVE_PARTITIONING=",hive_partitioning, ", UNION_BY_NAME=",union_by_name, @@ -136,12 +176,9 @@ query_string <- function(tblname, options <- switch(format, "parquet" = tabular_options, "csv" = tabular_options, - "tsv" = tabular_options, - "text" = tabular_options, - "sf" = "" + "sf" = "", + tabular_options ) - - paste0( paste("CREATE", mode, tblname, "AS SELECT * FROM "), paste0(scanner, source_uris, options, @@ -152,8 +189,6 @@ query_string <- function(tblname, tmp_tbl_name <- function(n = 15) { paste0(sample(letters, n, replace = TRUE), collapse = "") } - - remote_src <- 
function(conn) { dbplyr::remote_src(conn) } diff --git a/R/spatial_join.R b/R/spatial_join.R new file mode 100644 index 0000000..5aa865d --- /dev/null +++ b/R/spatial_join.R @@ -0,0 +1,129 @@ +#' spatial_join +#' +#' @param x a duckdb table with a spatial geometry column called "geom" +#' @param y a duckdb table with a spatial geometry column called "geom" +#' @param by A spatial join function, see details. +#' @param join JOIN type (left, right, inner, full) +#' @param args additional arguments to join function (e.g. distance for st_dwithin) +#' @param tblname name for the temporary view +#' @param conn the duckdb connection (imputed by duckdbfs by default, +#' must be shared across both tables) +#' @return a (lazy) view of the resulting table. Users can continue to operate +#' on using dplyr operations and call to_sf() to collect this as an sf object. +#' @details +#' +#' Possible [spatial joins](https://postgis.net/workshops/postgis-intro/spatial_relationships.html) include: +#' +#' Function | Description +#' -------------------- | -------------------------------------------------------------------------------------------- +#' st_intersects | Geometry A intersects with geometry B +#' st_disjoint | The complement of intersects +#' st_within | Geometry A is within geometry B (complement of contains) +#' st_dwithin | Geometries are within a specified distance, expressed in the same units as the coordinate reference system. +#' st_touches | Two polygons touch if they have at least one point in common, even if their interiors do not touch. +#' st_contains | Geometry A entirely contains geometry B. (complement of within) +#' st_containsproperly | stricter version of `st_contains` (boundary counts as external) +#' st_covers | geometry B is inside or on boundary of A. (A polygon covers a point on its boundary but does not contain it.)
+#' st_overlaps | geometry A intersects but does not completely contain geometry B +#' st_equals | geometry A is equal to geometry B +#' st_crosses | Lines or points in geometry A cross geometry B. +#' +#' Although SQL is not case sensitive, this function expects only +#' lower case names for "by" functions. +#' +#' @examplesIf interactive() +#' +#' # note we can read in remote data in a variety of vector formats: +#' countries <- +#' paste0("/vsicurl/", +#' "https://github.com/cboettig/duckdbfs/", +#' "raw/spatial-read/inst/extdata/world.gpkg") |> +#' open_dataset(format = "sf") +#' +#' cities <- +#' paste0("/vsicurl/https://github.com/cboettig/duckdbfs/raw/", +#' "spatial-read/inst/extdata/metro.fgb") |> +#' open_dataset(format = "sf") +#' +#' countries |> +#' dplyr::filter(iso_a3 == "AUS") |> +#' spatial_join(cities) +#' +#' @export +spatial_join <- function(x, + y, + by=c("st_intersects", "st_within", + "st_dwithin", "st_touches", + "st_contains", "st_containsproperly", + "st_covers", "st_overlaps", + "st_crosses", "st_equals", + "st_disjoint"), + args = "", + join="left", + tblname = tmp_tbl_name(), + conn = cached_connection()) { + + by <- match.arg(by) + ## x,y may be promised queries + x <- as_view(x) + y <- as_view(y) + + # build spatial join query + x.name <- remote_name(x, conn) + y.name <- remote_name(y, conn) + x.geom <- paste0(x.name, ".geom") + y.geom <- paste0(y.name, ".geom") + + if(args != ""){ + args <- paste(",", args) + } + + # be more careful than SELECT * + + # x.geom becomes the "geom" column, y.geom becomes geom:1 + query <- paste( + "SELECT *", + "FROM", x.name, + join, "JOIN", y.name, + "ON", paste0(by, "(", x.geom, ", ", y.geom, args, ")") + ) + query_to_view(query, tblname, conn) + +} + + +#' as_view +#' +#' Create a View of the current query.
This can be an effective way to allow +#' a query chain to remain lazy +#' @param x a duckdb spatial dataset +#' @inheritParams open_dataset +#' @examplesIf interactive() +#' path <- system.file("extdata/spatial-test.csv", package="duckdbfs") +#' df <- open_dataset(path) +#' library(dplyr) +#' +#' df |> filter(latitude > 5) |> as_view() +#' +#' @export +as_view <- function(x, tblname = tmp_tbl_name(), conn = cached_connection()) { + + # assert x is a tbl_lazy, a tbl_sql, and a tbl_duckdb_connection + + ## lazy_base_query objects are good to go. + if(inherits(x$lazy_query, "lazy_base_query")) { + return(x) + } + ## lazy_select_query objects are unnamed, + ## convert to named views so we can re-use them in queries + q <- dbplyr::sql_render(x) + query_to_view(q, tblname, conn) +} + +query_to_view <- function(query, + tblname = tmp_tbl_name(), + conn = cached_connection()) { + q <- paste("CREATE OR REPLACE TEMPORARY VIEW", tblname, "AS", query) + DBI::dbSendQuery(conn, q) + dplyr::tbl(conn, tblname) +} diff --git a/R/write_dataset.R b/R/write_dataset.R index f727995..f4d55c1 100644 --- a/R/write_dataset.R +++ b/R/write_dataset.R @@ -32,9 +32,7 @@ write_dataset <- function(dataset, DBI::dbWriteTable(conn, name = tblname, value = dataset) } else { - tblname <- as.character(remote_name(dataset, conn)) - } path <- parse_uri(path, conn = conn, recursive = FALSE) diff --git a/README.Rmd b/README.Rmd index 42081b8..46c462b 100644 --- a/README.Rmd +++ b/README.Rmd @@ -162,6 +162,42 @@ sf_obj <- countries |> filter(continent == "Africa") |> to_sf() plot(sf_obj["name"]) ``` +## Spatial joins + +One very common operation is the spatial join, which can be a very powerful way to subset large data.
+For instance, we can return all points (cities) within a set of polygons + +```{r} +cities <- + paste0("/vsicurl/https://github.com/cboettig/duckdbfs/raw/", + "spatial-read/inst/extdata/metro.fgb") |> + open_dataset(format = "sf") + +countries |> + dplyr::filter(continent == "Oceania") |> + spatial_join(cities, by = "st_intersects", join="inner") |> + select(name_long, sovereignt, pop2020) + +``` + + +Possible [spatial joins](https://postgis.net/workshops/postgis-intro/spatial_relationships.html) include: + + Function | Description +-------------------- | -------------------------------------------------------------------------------------------- + st_intersects | Geometry A intersects with geometry B + st_disjoint | The complement of intersects + st_within | Geometry A is within geometry B (complement of contains) + st_dwithin | Geometries are within a specified distance, expressed in the same units as the coordinate reference system. + st_touches | Two polygons touch if the that have at least one point in common, even if their interiors do not touch. + st_contains | Geometry A entirely contains to geometry B. (complement of within) + st_containsproperly | stricter version of `st_contains` (boundary counts as external) + st_covers | geometry B is inside or on boundary of A. (A polygon covers a point on its boundary but does not contain it.) + st_overlaps | geometry A intersects but does not completely contain geometry B + st_equals | geometry A is equal to geometry B + st_crosses | Lines or points in geometry A cross geometry B. + +Note that while SQL functions are not case-sensitive, `spatial_join` expects lower-case names. ## Writing datasets diff --git a/README.md b/README.md index c375ded..65d6184 100644 --- a/README.md +++ b/README.md @@ -61,13 +61,13 @@ explicitly request `duckdb` join the two schemas. 
Leave this as default, ``` r ds <- open_dataset(urls, unify_schemas = TRUE) ds -#> # Source: table [3 x 4] -#> # Database: DuckDB 0.8.1 [unknown@Linux 6.4.6-76060406-generic:R 4.3.1/:memory:] -#> i j x k -#> -#> 1 42 84 1 NA -#> 2 42 84 1 NA -#> 3 NA 128 2 33 +#> # Source: table [3 x 4] +#> # Database: DuckDB v0.9.2 [unknown@Linux 6.5.6-76060506-generic:R 4.3.2/:memory:] +#> i j x k +#> +#> 1 42 84 1 NA +#> 2 42 84 1 NA +#> 3 NA 128 2 33 ``` Use `filter()`, `select()`, etc from dplyr to subset and process data – @@ -107,12 +107,17 @@ efi <- open_dataset("s3://anonymous@neon4cast-scores/parquet/aquatics?endpoint_o `duckdb` can also understand a wide array of spatial data queries for spatial vector data, similar to operations found in the popular `sf` -package. Most spatial query operations require an geometry column that -expresses the simple feature geometry in `duckdb`’s internal geometry -format (nearly but not exactly WKB). A common pattern will first -generate the geometry column from raw columns, such as `latitude` and -`lognitude` columns, using the `duckdb` implementation of the a method -familiar to postgis, `ST_Point`: +package. See [the list of supported +functions](https://github.com/duckdb/duckdb_spatial#supported-functions) +for details. Most spatial query operations require an geometry column +that expresses the simple feature geometry in `duckdb`’s internal +geometry format (nearly but not exactly WKB). 
+ +### Generating spatial data from tabular + +A common pattern will first generate the geometry column from raw +columns, such as `latitude` and `lognitude` columns, using the `duckdb` +implementation of the a method familiar to postgis, `st_point`: ``` r spatial_ex <- paste0("https://raw.githubusercontent.com/cboettig/duckdbfs/", @@ -120,51 +125,15 @@ spatial_ex <- paste0("https://raw.githubusercontent.com/cboettig/duckdbfs/", open_dataset(format = "csv") spatial_ex |> - mutate(geometry = ST_Point(longitude, latitude)) |> - to_sf() -#> Simple feature collection with 10 features and 3 fields -#> Geometry type: POINT -#> Dimension: XY -#> Bounding box: xmin: 1 ymin: 1 xmax: 10 ymax: 10 -#> CRS: NA -#> site latitude longitude geometry -#> 1 a 1 1 POINT (1 1) -#> 2 b 2 2 POINT (2 2) -#> 3 c 3 3 POINT (3 3) -#> 4 d 4 4 POINT (4 4) -#> 5 e 5 5 POINT (5 5) -#> 6 f 6 6 POINT (6 6) -#> 7 g 7 7 POINT (7 7) -#> 8 h 8 8 POINT (8 8) -#> 9 i 9 9 POINT (9 9) -#> 10 j 10 10 POINT (10 10) -``` - -Recall that when used against any sort of external database like -`duckdb`, most `dplyr` functions like `dplyr::mutate()` are being -transcribed into SQL by `dbplyr`, and not actually ever run in R. This -allows us to seamlessly pass along spatial functions like `ST_Point`, -despite this not being an available R function. The `to_sf()` coercion -will parse its input into a SQL query that gets passed to `duckdb`, and -the return object will be collected through `sf::st_read`, returning an -(in-memory) `sf` object. - -Note that we can add arbitrary spatial functions that operate on this -geometry, provided we do so prior to our call to `to_sf`. 
For instance, -here we first create our geometry column from lat/lon columns, and then -compute the distance from each element to a spatial point: - -``` r -spatial_ex |> - mutate(geometry = ST_Point(longitude, latitude)) |> - mutate(dist = ST_Distance(geometry, ST_Point(0,0))) |> + mutate(geometry = st_point(longitude, latitude)) |> + mutate(dist = st_distance(geometry, st_point(0,0))) |> to_sf() #> Simple feature collection with 10 features and 4 fields #> Geometry type: POINT #> Dimension: XY #> Bounding box: xmin: 1 ymin: 1 xmax: 10 ymax: 10 #> CRS: NA -#> site latitude longitude dist geometry +#> site latitude longitude dist geom #> 1 a 1 1 1.414214 POINT (1 1) #> 2 b 2 2 2.828427 POINT (2 2) #> 3 c 3 3 4.242641 POINT (3 3) @@ -177,11 +146,123 @@ spatial_ex |> #> 10 j 10 10 14.142136 POINT (10 10) ``` +Recall that when used against any sort of external database like +`duckdb`, most `dplyr` functions like `dplyr::mutate()` are being +transcribed into SQL by `dbplyr`, and not actually ever run in R. This +allows us to seamlessly pass along spatial functions like `st_point`, +despite this not being an available R function. (Also note that SQL is +not case-sensitive, so this function is also written as `ST_Point`). +Optionally, we can do additional operations on this geometry column, +such as computing distances (`st_distance` shown here), spatial filters, +and so forth. The `to_sf()` coercion will parse its input into a SQL +query that gets passed to `duckdb`, and the return object will be +collected through `sf::st_read`, returning an (in-memory) `sf` object. + For more details including a complete list of the dozens of spatial operations currently supported and notes on performance and current limitations, see the [duckdb spatial docs](https://github.com/duckdb/duckdb_spatial) +### Reading spatial vector files + +The `duckdb` spatial package can also use GDAL to read large spatial +vector files. This includes support for the GDAL virtual filesystem. 
+This means that we can easily subset columns from a wide array of +potentially remote file types and filter on rows and columns, and +perform many spatial operations without ever reading the entire objects +into memory in R. + +To read spatial vector (simple feature) files, indicate `format="sf"`. +Use virtual filesystem prefixes to access range requests over http, S3, +and other such systems. + +``` r +url <- "https://github.com/cboettig/duckdbfs/raw/25744032021cc2b9bbc560f95b77b3eb088c9abb/inst/extdata/world.gpkg" + +countries <- + paste0("/vsicurl/", url) |> + open_dataset(format="sf") +``` + +Which country polygon contains Melbourne? Note the result is still a +lazy read, we haven’t downloaded or read in the full spatial data +object. + +``` r +library(sf) +#> Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE +melbourne <- st_point(c(144.9633, -37.814)) |> st_as_text() + +countries |> + filter(st_contains(geom, ST_GeomFromText({melbourne}))) +#> # Source: SQL [1 x 16] +#> # Database: DuckDB v0.9.2 [unknown@Linux 6.5.6-76060506-generic:R 4.3.2/:memory:] +#> iso_a3 name sovereignt continent area pop_est pop_est_dens economy +#> +#> 1 AUS Australia Australia Oceania 7682300 21262641 2.77 2. Develo… +#> # ℹ 8 more variables: income_grp , gdp_cap_est , life_exp , +#> # well_being , footprint , inequality , HPI , geom +``` + +As before, we use `to_sf()` to read in the query results as a native +(in-memory) `sf` object: + +``` r +sf_obj <- countries |> filter(continent == "Africa") |> to_sf() +plot(sf_obj["name"]) +``` + + + +## Spatial joins + +One very common operation are spatial joins, which can be a very +powerful way to subset large data. 
For instance, we can return all +points (cities) within a set of polygons + +``` r +cities <- + paste0("/vsicurl/https://github.com/cboettig/duckdbfs/raw/", + "spatial-read/inst/extdata/metro.fgb") |> + open_dataset(format = "sf") + +countries |> + dplyr::filter(continent == "Oceania") |> + spatial_join(cities, by = "st_intersects", join="inner") |> + select(name_long, sovereignt, pop2020) +#> # Source: SQL [6 x 3] +#> # Database: DuckDB v0.9.2 [unknown@Linux 6.5.6-76060506-generic:R 4.3.2/:memory:] +#> name_long sovereignt pop2020 +#> +#> 1 Brisbane Australia 2388517 +#> 2 Perth Australia 2036118 +#> 3 Sydney Australia 4729406 +#> 4 Adelaide Australia 1320783 +#> 5 Auckland New Zealand 1426070 +#> 6 Melbourne Australia 4500501 +``` + +Possible [spatial +joins](https://postgis.net/workshops/postgis-intro/spatial_relationships.html) +include: + +| Function | Description | +|---------------------|---------------------------------------------------------------------------------------------------------------| +| st_intersects | Geometry A intersects with geometry B | +| st_disjoint | The complement of intersects | +| st_within | Geometry A is within geometry B (complement of contains) | +| st_dwithin | Geometries are within a specified distance, expressed in the same units as the coordinate reference system. | +| st_touches | Two polygons touch if the that have at least one point in common, even if their interiors do not touch. | +| st_contains | Geometry A entirely contains to geometry B. (complement of within) | +| st_containsproperly | stricter version of `st_contains` (boundary counts as external) | +| st_covers | geometry B is inside or on boundary of A. (A polygon covers a point on its boundary but does not contain it.) | +| st_overlaps | geometry A intersects but does not completely contain geometry B | +| st_equals | geometry A is equal to geometry B | +| st_crosses | Lines or points in geometry A cross geometry B. 
| + +Note that while SQL functions are not case-sensitive, `spatial_join` +expects lower-case names. + ## Writing datasets Like `arrow::write_dataset()`, `duckdbfs::write_dataset()` can write diff --git a/inst/examples/spatial_module.R b/inst/examples/spatial_module.R new file mode 100644 index 0000000..4b59e2a --- /dev/null +++ b/inst/examples/spatial_module.R @@ -0,0 +1,36 @@ +st_read <- function() { + +} + +st_write <- function() { + +} + +st_perimeter <- function() { + # no sf equivalent +} + +# functions that operate on geometries already work within `mutate` calls +st_area <- function() { + +} + + +st_intersection <- function(x, y, ...) { + sf::st_intersection(x, y, ...) +} + +st_intersects <- function(x, y, ...) { + if(inherits(x, "sf")) { + sf::st_intersection(x, y, ...) + } + +} + + +st_union <- function(x, y, ..., + by_feature = by_feature, is_coverage = is_coverage) { + if(inherits(x, "sf")) { + sf::st_union(x, y, ..., by_feature = by_feature, is_coverage = is_coverage) + } +} diff --git a/man/as_view.Rd b/man/as_view.Rd new file mode 100644 index 0000000..f256bdb --- /dev/null +++ b/man/as_view.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/spatial_join.R +\name{as_view} +\alias{as_view} +\title{as_view} +\usage{ +as_view(x, tblname = tmp_tbl_name(), conn = cached_connection()) +} +\arguments{ +\item{x}{a duckdb spatial dataset} + +\item{tblname}{The name of the table to create in the database.} + +\item{conn}{A connection to a database.} +} +\description{ +Create a View of the current query. 
This can be an effective way to allow +a query chain to remain lazy +} +\examples{ +\dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +path <- system.file("extdata/spatial-test.csv", package="duckdbfs") +df <- open_dataset(path) +library(dplyr) + +df |> filter(latitude > 5) |> as_view() +\dontshow{\}) # examplesIf} +} diff --git a/man/figures/README-unnamed-chunk-8-1.png b/man/figures/README-unnamed-chunk-8-1.png new file mode 100644 index 0000000..ba26f80 Binary files /dev/null and b/man/figures/README-unnamed-chunk-8-1.png differ diff --git a/man/open_dataset.Rd b/man/open_dataset.Rd index bff4270..50115e8 100644 --- a/man/open_dataset.Rd +++ b/man/open_dataset.Rd @@ -9,7 +9,7 @@ open_dataset( schema = NULL, hive_style = TRUE, unify_schemas = FALSE, - format = c("parquet", "csv", "tsv", "text", "sf"), + format = c("parquet", "csv", "tsv", "sf"), conn = cached_connection(), tblname = tmp_tbl_name(), mode = "VIEW", @@ -33,8 +33,9 @@ column name across all files (NOTE: this can add considerably to the initial execution time)} \item{format}{The format of the dataset files. One of \code{"parquet"}, \code{"csv"}, -\code{"tsv"}, \code{"text"} or \code{"sf"} (for any Simple Features spatial dataset supported -by the sf package).} +\code{"tsv"}, or \code{"sf"} (spatial vector files supported by the sf package / GDAL). 
+if no argument is provided, the function will try to guess the type based +on minimal heuristics.} \item{conn}{A connection to a database.} diff --git a/man/spatial_join.Rd b/man/spatial_join.Rd new file mode 100644 index 0000000..a2001cb --- /dev/null +++ b/man/spatial_join.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/spatial_join.R +\name{spatial_join} +\alias{spatial_join} +\title{spatial_join} +\usage{ +spatial_join( + x, + y, + by = c("st_intersects", "st_within", "st_dwithin", "st_touches", "st_contains", + "st_containsproperly", "st_covers", "st_overlaps", "st_crosses", "st_equals", + "st_disjoint"), + args = "", + join = "left", + tblname = tmp_tbl_name(), + conn = cached_connection() +) +} +\arguments{ +\item{x}{a duckdb table with a spatial geometry column called "geom"} + +\item{y}{a duckdb table with a spatial geometry column called "geom"} + +\item{by}{A spatial join function, see details.} + +\item{args}{additional arguments to join function (e.g. distance for st_dwithin)} + +\item{join}{JOIN type (left, right, inner, full)} + +\item{tblname}{name for the temporary view} + +\item{conn}{the duckdb connection (imputed by duckdbfs by default, +must be shared across both tables)} +} +\value{ +a (lazy) view of the resulting table. Users can continue to operate +on using dplyr operations and call to_st() to collect this as an sf object. +} +\description{ +spatial_join +} +\details{ +Possible \href{https://postgis.net/workshops/postgis-intro/spatial_relationships.html}{spatial joins} include:\tabular{ll}{ + Function \tab Description \cr + st_intersects \tab Geometry A intersects with geometry B \cr + st_disjoint \tab The complement of intersects \cr + st_within \tab Geometry A is within geometry B (complement of contains) \cr + st_dwithin \tab Geometries are within a specified distance, expressed in the same units as the coordinate reference system. 
\cr + st_touches \tab Two polygons touch if the that have at least one point in common, even if their interiors do not touch. \cr + st_contains \tab Geometry A entirely contains to geometry B. (complement of within) \cr + st_containsproperly \tab stricter version of \code{st_contains} (boundary counts as external) \cr + st_covers \tab geometry B is inside or on boundary of A. (A polygon covers a point on its boundary but does not contain it.) \cr + st_overlaps \tab geometry A intersects but does not completely contain geometry B \cr + st_equals \tab geometry A is equal to geometry B \cr + st_crosses \tab Lines or points in geometry A cross geometry B. \cr +} + + +All though SQL is not case sensitive, this function expects only +lower case names for "by" functions. +} +\examples{ +\dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} + +# note we can read in remote data in a variety of vector formats: +countries <- +paste0("/vsicurl/", + "https://github.com/cboettig/duckdbfs/", + "raw/spatial-read/inst/extdata/world.gpkg") |> +open_dataset(format = "sf") + +cities <- + paste0("/vsicurl/https://github.com/cboettig/duckdbfs/raw/", + "spatial-read/inst/extdata/metro.fgb") |> + open_dataset(format = "sf") + +countries |> + dplyr::filter(iso_a3 == "AUS") |> + spatial_join(cities) +\dontshow{\}) # examplesIf} +} diff --git a/tests/testthat/test-spatial.R b/tests/testthat/test-spatial.R index 8f66eb9..d83566d 100644 --- a/tests/testthat/test-spatial.R +++ b/tests/testthat/test-spatial.R @@ -37,3 +37,37 @@ test_that("spatial vector read", { }) + +test_that("spatial_join", { + + + skip_if_not_installed("sf") + skip_on_os("windows") # come on duckdb, support extensions on windows + skip_if_offline() # needs to be able to load the spatial module + skip_on_cran() + + countries <- + paste0("/vsicurl/", + "https://github.com/cboettig/duckdbfs/", + "raw/spatial-read/inst/extdata/world.gpkg") |> + open_dataset() + + cities <- + 
paste0("/vsicurl/https://github.com/cboettig/duckdbfs/raw/", + "spatial-read/inst/extdata/metro.fgb") |> + open_dataset() + + out <- + countries |> + dplyr::filter(iso_a3 == "AUS") |> + spatial_join(cities) + + expect_s3_class(out, "tbl_lazy") + + local <- to_sf(out) + expect_s3_class(local, "sf") + expect_true(all(local$iso_a3 == "AUS")) + + ## add examples of other types of spatial joins +}) +