diff --git a/DESCRIPTION b/DESCRIPTION index 6434ac1..9a45811 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,15 +11,16 @@ Description: Provides friendly wrappers for creating 'duckdb'-backed connections License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 URL: https://github.com/cboettig/duckdbfs, https://cboettig.github.io/duckdbfs/ BugReports: https://github.com/cboettig/duckdbfs/issues Imports: DBI, dbplyr, dplyr, - duckdb (>= 0.8.1), - fs + duckdb (>= 0.9.2), + fs, + glue Suggests: curl, sf, diff --git a/R/open_dataset.R b/R/open_dataset.R index 9ba9dda..d91024a 100644 --- a/R/open_dataset.R +++ b/R/open_dataset.R @@ -107,7 +107,7 @@ select_format <- function(sources, format) { # format for vector sources always based on first element sources <- sources[[1]] - # default to parquet for S3 addresses + # default to parquet for S3 addresses if(grepl("^s3://", sources)) { return("parquet") } @@ -116,18 +116,19 @@ select_format <- function(sources, format) { sources <- fs::dir_ls(sources, recurse = TRUE, type="file") sources <- sources[[1]] } + format <- tools::file_ext(sources) + #if(grepl("^/vsi", sources)) { + # return("sf") + #} + # detect spatial types - if(grepl("^/vsi", sources)) { - return("sf") - } if(format %in% c("fgb", "shp", "json", "geojson", "gdb", "gpkg", "kml", "gmt")) { return("sf") } - # default if (format == "") { return("parquet") diff --git a/README.Rmd b/README.Rmd index 46c462b..5fb4b48 100644 --- a/README.Rmd +++ b/README.Rmd @@ -126,21 +126,15 @@ For more details including a complete list of the dozens of spatial operations c ### Reading spatial vector files The `duckdb` spatial package can also use GDAL to read large spatial vector files. -This includes support for the GDAL virtual filesystem. This means that we can +This includes support for remote files. This means that we can easily subset columns from a wide array of potentially remote file types and filter on rows and columns, and perform many spatial operations without ever reading the entire objects into memory in R. -To read spatial vector (simple feature) files, indicate `format="sf"`. -Use virtual filesystem prefixes to access range requests over http, S3, and other such systems. ```{r} -url <- "https://github.com/cboettig/duckdbfs/raw/25744032021cc2b9bbc560f95b77b3eb088c9abb/inst/extdata/world.gpkg" - -countries <- - paste0("/vsicurl/", url) |> - open_dataset(format="sf") - +url <- "https://github.com/cboettig/duckdbfs/raw/main/inst/extdata/world.fgb" +countries <- open_dataset(url, format = "sf") ``` Which country polygon contains Melbourne? Note the result is still a lazy read, @@ -168,10 +162,9 @@ One very common operation are spatial joins, which can be a very powerful way to For instance, we can return all points (cities) within a set of polygons ```{r} -cities <- - paste0("/vsicurl/https://github.com/cboettig/duckdbfs/raw/", - "spatial-read/inst/extdata/metro.fgb") |> - open_dataset(format = "sf") +cities <- open_dataset(paste0("https://github.com/cboettig/duckdbfs/raw/", + "spatial-read/inst/extdata/metro.fgb"), + format="sf") countries |> dplyr::filter(continent == "Oceania") |> diff --git a/README.md b/README.md index 65d6184..2b867c9 100644 --- a/README.md +++ b/README.md @@ -61,8 +61,8 @@ explicitly request `duckdb` join the two schemas. Leave this as default, ``` r ds <- open_dataset(urls, unify_schemas = TRUE) ds -#> # Source: table [3 x 4] -#> # Database: DuckDB v0.9.2 [unknown@Linux 6.5.6-76060506-generic:R 4.3.2/:memory:] +#> # Source: table [3 x 4] +#> # Database: DuckDB v0.9.2 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] #> i j x k #> #> 1 42 84 1 NA @@ -166,22 +166,14 @@ docs](https://github.com/duckdb/duckdb_spatial) ### Reading spatial vector files The `duckdb` spatial package can also use GDAL to read large spatial -vector files. This includes support for the GDAL virtual filesystem. -This means that we can easily subset columns from a wide array of -potentially remote file types and filter on rows and columns, and -perform many spatial operations without ever reading the entire objects -into memory in R. - -To read spatial vector (simple feature) files, indicate `format="sf"`. -Use virtual filesystem prefixes to access range requests over http, S3, -and other such systems. +vector files. This includes support for remote files. This means that we +can easily subset columns from a wide array of potentially remote file +types and filter on rows and columns, and perform many spatial +operations without ever reading the entire objects into memory in R. ``` r -url <- "https://github.com/cboettig/duckdbfs/raw/25744032021cc2b9bbc560f95b77b3eb088c9abb/inst/extdata/world.gpkg" - -countries <- - paste0("/vsicurl/", url) |> - open_dataset(format="sf") +url <- "https://github.com/cboettig/duckdbfs/raw/main/inst/extdata/world.fgb" +countries <- open_dataset(url, format = "sf") ``` Which country polygon contains Melbourne? Note the result is still a @@ -190,18 +182,17 @@ object. ``` r library(sf) -#> Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE +#> Linking to GEOS 3.12.1, GDAL 3.8.3, PROJ 9.3.1; sf_use_s2() is TRUE melbourne <- st_point(c(144.9633, -37.814)) |> st_as_text() countries |> filter(st_contains(geom, ST_GeomFromText({melbourne}))) #> # Source: SQL [1 x 16] -#> # Database: DuckDB v0.9.2 [unknown@Linux 6.5.6-76060506-generic:R 4.3.2/:memory:] -#> iso_a3 name sovereignt continent area pop_est pop_est_dens economy -#> -#> 1 AUS Australia Australia Oceania 7682300 21262641 2.77 2. Develo… -#> # ℹ 8 more variables: income_grp , gdp_cap_est , life_exp , -#> # well_being , footprint , inequality , HPI , geom +#> # Database: DuckDB v0.9.2 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] +#> iso_a3 name sovereignt continent area pop_est pop_est_dens economy income_grp gdp_cap_est life_exp well_being footprint inequality HPI +#> +#> 1 AUS Australia Australia Oceania 7682300 2.13e7 2.77 2. Dev… 1. High i… 37634. 82.1 7.2 9.31 0.0807 21.2 +#> # ℹ 1 more variable: geom ``` As before, we use `to_sf()` to read in the query results as a native @@ -221,17 +212,16 @@ powerful way to subset large data. For instance, we can return all points (cities) within a set of polygons ``` r -cities <- - paste0("/vsicurl/https://github.com/cboettig/duckdbfs/raw/", - "spatial-read/inst/extdata/metro.fgb") |> - open_dataset(format = "sf") +cities <- open_dataset(paste0("https://github.com/cboettig/duckdbfs/raw/", + "spatial-read/inst/extdata/metro.fgb"), + format="sf") countries |> dplyr::filter(continent == "Oceania") |> spatial_join(cities, by = "st_intersects", join="inner") |> select(name_long, sovereignt, pop2020) #> # Source: SQL [6 x 3] -#> # Database: DuckDB v0.9.2 [unknown@Linux 6.5.6-76060506-generic:R 4.3.2/:memory:] +#> # Database: DuckDB v0.9.2 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] #> name_long sovereignt pop2020 #> #> 1 Brisbane Australia 2388517 diff --git a/inst/WORDLIST b/inst/WORDLIST index f078908..35c6ea0 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,6 +1,8 @@ CMD GBIF GC +GDAL +Geospatial MINIO Quickstart README @@ -10,12 +12,14 @@ URIs WKB behaviour cachable +containsproperly csv dbpylr dplyr duckdb duckdb's duckdb’s +dwithin filesize finalizer gc @@ -24,7 +28,6 @@ globbing http httpfs https -lon md minio pipline diff --git a/man/figures/README-unnamed-chunk-8-1.png b/man/figures/README-unnamed-chunk-8-1.png index ba26f80..7aa3d17 100644 Binary files a/man/figures/README-unnamed-chunk-8-1.png and b/man/figures/README-unnamed-chunk-8-1.png differ diff --git a/man/st_read_meta.Rd b/man/st_read_meta.Rd new file mode 100644 index 0000000..f340f7f --- /dev/null +++ b/man/st_read_meta.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/spatial_meta.R +\name{st_read_meta} +\alias{st_read_meta} +\title{read spatial metadata} +\usage{ +st_read_meta( + path, + layer = 1L, + tblname = basename(tools::file_path_sans_ext(path)), + conn = cached_connection(), + ... +) +} +\arguments{ +\item{path}{URL or path to spatial data file} + +\item{layer}{layer number to read metadata for, defaults to first layer.} + +\item{tblname}{metadata will be stored as a view with this name, +by default this is based on the name of the file.} + +\item{conn}{A connection to a database.} + +\item{...}{optional additional arguments passed to \code{\link[=duckdb_s3_config]{duckdb_s3_config()}}. +Note these apply after those set by the URI notation and thus may be used +to override or provide settings not supported in that format.} +} +\value{ +A lazy \code{dplyr::tbl} object containing core spatial metadata such +as projection information. +} +\description{ +At this time, reads a subset of spatial metadata. +This is similar to what is reported by \code{ogrinfo -json} +} +\examples{ +\dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +st_read_meta("https://github.com/duckdb/duckdb_spatial/raw/main/test/data/amsterdam_roads.fgb") +\dontshow{\}) # examplesIf} +}