Skip to content

Commit

Permalink
Merge pull request #15 from cboettig/spatial-read
Browse files Browse the repository at this point in the history
support st_read() in open_dataset()
  • Loading branch information
cboettig authored Nov 23, 2023
2 parents fca0150 + f5899fb commit 1a593ea
Show file tree
Hide file tree
Showing 11 changed files with 181 additions and 36 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: duckdbfs
Title: High Performance Remote File System Access Using 'duckdb'
Version: 0.0.3
Version: 0.0.3.99
Authors@R:
person("Carl", "Boettiger", , "[email protected]", c("aut", "cre"),
comment = c(ORCID = "0000-0002-1642-628X"))
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# duckdbfs 0.0.4

* `open_dataset()` gains option `sf` to format, allowing users to parse
spatial vector data in simple features standard (objects read by `sf`)
* default geometry column in `to_sf()` is now termed `geom`, to match the default
used in `duckdb`'s `st_read()` function.

# duckdbfs 0.0.3

* `write_dataset()` now understands lazy queries, not just lazy tables.
Expand Down
36 changes: 28 additions & 8 deletions R/open_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
#' column name across all files (NOTE: this can add considerably to
#' the initial execution time)
#' @param format The format of the dataset files. One of `"parquet"`, `"csv"`,
#' `"tsv"`, or `"text"`.
#' `"tsv"`, `"text"` or `"sf"` (for any Simple Features spatial dataset supported
#' by the sf package).
#' @param conn A connection to a database.
#' @param tblname The name of the table to create in the database.
#' @param mode The mode to create the table in. One of `"VIEW"` or `"TABLE"`.
Expand Down Expand Up @@ -59,7 +60,7 @@ open_dataset <- function(sources,
schema = NULL,
hive_style = TRUE,
unify_schemas = FALSE,
format = c("parquet", "csv", "tsv", "text"),
format = c("parquet", "csv", "tsv", "text", "sf"),
conn = cached_connection(),
tblname = tmp_tbl_name(),
mode = "VIEW",
Expand All @@ -78,6 +79,9 @@ open_dataset <- function(sources,


format <- match.arg(format)
if(format == "sf") {
load_spatial(conn = conn)
}
view_query <- query_string(tblname,
sources,
format = format,
Expand All @@ -102,7 +106,7 @@ vec_as_str <- function(x) {

query_string <- function(tblname,
sources,
format = c("parquet", "csv", "tsv", "text"),
format = c("parquet", "csv", "tsv", "text", "sf"),
mode = c("VIEW", "TABLE"),
hive_partitioning = TRUE,
union_by_name = FALSE,
Expand All @@ -118,13 +122,29 @@ query_string <- function(tblname,

scanner <- switch(format,
"parquet" = "parquet_scan(",
"read_csv_auto(")
"csv" = "read_csv_auto(",
"tsv" = "read_csv_auto(",
"text" = "read_csv_auto(",
"sf" = "st_read("
)

tabular_options <- paste0(
", HIVE_PARTITIONING=",hive_partitioning,
", UNION_BY_NAME=",union_by_name,
", FILENAME=",filename)

options <- switch(format,
"parquet" = tabular_options,
"csv" = tabular_options,
"tsv" = tabular_options,
"text" = tabular_options,
"sf" = ""
)


paste0(
paste("CREATE", mode, tblname, "AS SELECT * FROM "),
paste0(scanner, source_uris,
", HIVE_PARTITIONING=",hive_partitioning,
", UNION_BY_NAME=",union_by_name,
", FILENAME=",filename,
paste0(scanner, source_uris, options,
");")
)
}
Expand Down
15 changes: 9 additions & 6 deletions R/to_sf.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,28 +24,31 @@
#' # from lat/long columns using the `st_point` method.
#' sf <-
#' open_dataset(csv_file, format = "csv") |>
#' mutate(geometry = ST_Point(longitude, latitude)) |>
#' mutate(geom = ST_Point(longitude, latitude)) |>
#' to_sf()
#'
#' # We can use the full space of spatial operations, including spatial
#' # and normal dplyr filters. All operations are translated into a
#' # spatial SQL query by `to_sf`:
#' open_dataset(csv_file, format = "csv") |>
#' mutate(geometry = ST_Point(longitude, latitude)) |>
#' mutate(dist = ST_Distance(geometry, ST_Point(0,0))) |>
#' mutate(geom = ST_Point(longitude, latitude)) |>
#' mutate(dist = ST_Distance(geom, ST_Point(0,0))) |>
#' filter(site %in% c("a", "b", "e")) |>
#' to_sf()
#'
#'
#' @export
to_sf <- function(x, conn = cached_connection()) {
load_spatial(conn)
if("geometry" %in% colnames(x)) {
x <- x |> dplyr::rename(geom=geometry)
}
sql <- x |>
dplyr::mutate(geometry = ST_AsWKB(geometry)) |>
dplyr::mutate(geom = ST_AsWKB(geom)) |>
dbplyr::sql_render()

requireNamespace("sf", quietly = TRUE)
sf::st_read(conn, query=sql, geometry_column = "geometry")
sf::st_read(conn, query=sql, geometry_column = "geom")
}

utils::globalVariables(c("ST_AsWKB", "geometry"), package = "duckdbfs")
utils::globalVariables(c("ST_AsWKB", "geom", "geometry"), package = "duckdbfs")
68 changes: 52 additions & 16 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -86,46 +86,82 @@ efi <- open_dataset("s3://anonymous@neon4cast-scores/parquet/aquatics?endpoint_o
## Spatial data

`duckdb` can also understand a wide array of spatial data queries for spatial
vector data, similar to operations found in the popular `sf` package.
vector data, similar to operations found in the popular `sf` package.
See [the list of supported functions](https://github.com/duckdb/duckdb_spatial#supported-functions) for details.
Most spatial query operations require an geometry column that expresses the
simple feature geometry in `duckdb`'s internal geometry format
(nearly but not exactly WKB). A common pattern will first generate the
geometry column from raw columns, such as `latitude` and `lognitude` columns,
using the `duckdb` implementation of the a method familiar to postgis, `ST_Point`:
(nearly but not exactly WKB).

### Generating spatial data from tabular

A common pattern will first generate the
geometry column from raw columns, such as `latitude` and `lognitude` columns,
using the `duckdb` implementation of the a method familiar to postgis, `st_point`:

```{r}
spatial_ex <- paste0("https://raw.githubusercontent.com/cboettig/duckdbfs/",
"main/inst/extdata/spatial-test.csv") |>
open_dataset(format = "csv")
spatial_ex |>
mutate(geometry = ST_Point(longitude, latitude)) |>
mutate(geometry = st_point(longitude, latitude)) |>
mutate(dist = st_distance(geometry, st_point(0,0))) |>
to_sf()
```

Recall that when used against any sort of external database like `duckdb`,
most `dplyr` functions like `dplyr::mutate()` are being transcribed into SQL
by `dbplyr`, and not actually ever run in R. This allows us to seamlessly pass
along spatial functions like `ST_Point`, despite this not being an available R
function. The `to_sf()` coercion will parse its input into a SQL query that gets
along spatial functions like `st_point`, despite this not being an available R
function. (Also note that SQL is not case-sensitive, so this function is also
written as `ST_Point`).
Optionally, we can do additional operations on this geometry column, such as
computing distances (`st_distance` shown here), spatial filters, and so forth.
The `to_sf()` coercion will parse its input into a SQL query that gets
passed to `duckdb`, and the return object will be collected through
`sf::st_read`, returning an (in-memory) `sf` object.

For more details including a complete list of the dozens of spatial operations currently supported and notes on performance and current limitations, see the [duckdb spatial docs](https://github.com/duckdb/duckdb_spatial)

### Reading spatial vector files

Note that we can add arbitrary spatial functions that operate on this geometry,
provided we do so prior to our call to `to_sf`. For instance, here we first
create our geometry column from lat/lon columns, and then compute the distance
from each element to a spatial point:
The `duckdb` spatial package can also use GDAL to read large spatial vector files.
This includes support for the GDAL virtual filesystem. This means that we can
easily subset columns from a wide array of potentially remote file types and
filter on rows and columns, and perform many spatial operations without ever
reading the entire objects into memory in R.

To read spatial vector (simple feature) files, indicate `format="sf"`.
Use virtual filesystem prefixes to access range requests over http, S3, and other such systems.

```{r}
spatial_ex |>
mutate(geometry = ST_Point(longitude, latitude)) |>
mutate(dist = ST_Distance(geometry, ST_Point(0,0))) |>
to_sf()
url <- "https://github.com/cboettig/duckdbfs/raw/25744032021cc2b9bbc560f95b77b3eb088c9abb/inst/extdata/world.gpkg"
countries <-
paste0("/vsicurl/", url) |>
open_dataset(format="sf")
```

Which country polygon contains Melbourne? Note the result is still a lazy read,
we haven't downloaded or read in the full spatial data object.

```{r}
library(sf)
melbourne <- st_point(c(144.9633, -37.814)) |> st_as_text()
countries |>
filter(st_contains(geom, ST_GeomFromText({melbourne})))
```

As before, we use `to_sf()` to read in the query results as a native (in-memory) `sf` object:

```{r}
sf_obj <- countries |> filter(continent == "Africa") |> to_sf()
plot(sf_obj["name"])
```

For more details including a complete list of the dozens of spatial operations currently supported and notes on performance and current limitations, see the [duckdb spatial docs](https://github.com/duckdb/duckdb_spatial)

## Writing datasets

Expand Down
57 changes: 57 additions & 0 deletions inst/examples/more-spatial.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
library(duckdbfs)
library(dplyr)
library(DBI)

#load_spatial()
#con <- cached_connection()

devtools::install_github("cboettig/duckdbfs@spatial-read")
countries <- open_dataset("/vsicurl/https://github.com/cboettig/duckdbfs/raw/spatial-read/inst/extdata/world.gpkg",
format = "sf", tblname = "countries")

cities <- open_dataset("/vsicurl/https://github.com/cboettig/duckdbfs/raw/spatial-read/inst/extdata/metro.fgb",
format = "sf", tblname = "cities")

## We can count number of cities in each country with a bit of SQL
x <- DBI::dbGetQuery(con, "
SELECT countries.iso_a3, count(kbas.geom) AS total
FROM countries
LEFT JOIN kbas ON st_contains(countries.geom,kbas.geom)
GROUP BY countries.iso_a3;
")

# in dplyr this could be nice and pretty, but `join_by` refuses the syntax
countries |>
left_join(cities, join_by(st_contains(geom, geom))) |>
count(iso_a3, sort=TRUE)


# other dplyr functions have no difficulty passing on these arguments:
melbourne <- st_point(c(144.9633, -37.814)) |> st_as_text()
countries |> filter(st_contains(geom, ST_GeomFromText({melbourne})))


# Aside: left_join() without count() looks like this in SQL .. much more verbose than dplyr
x <- DBI::dbGetQuery(con,"
SELECT countries.iso_a3, cities.geom, countries.geom AS geometry
FROM countries
LEFT JOIN cities
ON st_contains(countries.geom, cities.geom)
") |> as_tibble()






## accessing secure data with credentials

KBAs <- "/vsis3/biodiversity/KBAsGlobal_2023_March_01_POL.shp"
kba_pts <- "/vsis3/biodiversity/KBAsGlobal_2023_March_01_PNT.shp"
Sys.setenv("AWS_ACCESS_KEY_ID"=Sys.getenv("NVME_KEY"))
Sys.setenv("AWS_SECRET_ACCESS_KEY"=Sys.getenv("NVME_SECRET"))
Sys.setenv("AWS_S3_ENDPOINT"="minio.carlboettiger.info")
Sys.setenv("AWS_VIRTUAL_HOSTING"=FALSE)
x <- sf::read_sf(kba_pts)
kbas <- open_dataset(kba_pts, format="sf", tblname="kbas")

Binary file added inst/extdata/metro.fgb
Binary file not shown.
Binary file added inst/extdata/world.gpkg
Binary file not shown.
5 changes: 3 additions & 2 deletions man/open_dataset.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/to_sf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions tests/testthat/test-spatial.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,24 @@ test_that("spatial", {
expect_true(TRUE)

})

test_that("spatial vector read", {

skip_if_not_installed("sf")
skip_on_os("windows") # come on duckdb, support extensions on windows
skip_if_offline() # needs to be able to load the spatial module
skip_on_cran()

# lazy-read external data (/vsicurl/ urls work too!)
path <- system.file("extdata/world.gpkg", package = "duckdbfs")
x <- open_dataset(path, format = "sf")

# read into R
y <- x |> to_sf()

expect_s3_class(x, "tbl_lazy")
expect_s3_class(x, "tbl")
expect_s3_class(y, "sf")

})

0 comments on commit 1a593ea

Please sign in to comment.