diff --git a/DESCRIPTION b/DESCRIPTION index 7879b774..6a5a9f68 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,7 +8,7 @@ Description: A programmatic interface to the Web Service methods retrieving information on data providers, getting species occurrence records, getting counts of occurrence records, and using the GBIF tile map service to make rasters summarizing huge amounts of data. -Version: 3.8.0.1 +Version: 3.8.0.2 License: MIT + file LICENSE Authors@R: c( person("Scott", "Chamberlain", role = "aut", comment = c(ORCID="0000-0003-1444-9135")), diff --git a/NAMESPACE b/NAMESPACE index f4665b9f..f62e0f2f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -102,6 +102,9 @@ export(occ_download_list) export(occ_download_meta) export(occ_download_prep) export(occ_download_queue) +export(occ_download_sql) +export(occ_download_sql_prep) +export(occ_download_sql_validate) export(occ_download_wait) export(occ_facet) export(occ_get) diff --git a/R/occ_download_describe.R b/R/occ_download_describe.R index 7ce95d7c..cd103a2f 100644 --- a/R/occ_download_describe.R +++ b/R/occ_download_describe.R @@ -16,7 +16,7 @@ #' occ_download_describe("simpleCsv")$fields #' } occ_download_describe <- function(x="dwca") { - acc_args <- c("dwca","simpleCsv","simpleAvro","simpleParquet","speciesList") + acc_args <- c("dwca","simpleCsv","simpleAvro","simpleParquet","speciesList","sql") stopifnot(x %in% acc_args) url <- paste0(gbif_base(),"/occurrence/download/describe/",x) out <- gbif_GET(url,args=NULL,parse=TRUE) diff --git a/R/occ_download_sql.R b/R/occ_download_sql.R new file mode 100644 index 00000000..c140f5d8 --- /dev/null +++ b/R/occ_download_sql.R @@ -0,0 +1,153 @@ +#' @title Download occurrence data using a SQL query +#' +#' @param q sql query +#' @param format only "SQL_TSV_ZIP" is supported right now +#' @param user your GBIF user name +#' @param pwd your GBIF password +#' @param email your email address +#' @param validate should the query be validated before submission. 
Default is +#' TRUE. +#' @param curlopts list of curl options +#' +#' @details +#' This is an experimental feature, and the implementation may change throughout +#' 2024. The feature is currently only available for preview by invited users. +#' Contact `helpdesk@gbif.org` to request access. +#' +#' Please see the article here for more information: +#' \url{https://docs.ropensci.org/rgbif/articles/getting_occurrence_data.html} +#' +#' @return an object of class 'occ_download_sql' +#' +#' @references +#' \url{https://techdocs.gbif.org/en/data-use/api-sql-downloads} +#' +#' @name occ_download_sql +#' @export +#' +#' @examples \dontrun{ +#' occ_download_sql("SELECT gbifid,countryCode FROM occurrence +#' WHERE genusKey = 2435098") +#' } +#' +occ_download_sql <- function(q = NULL, + format = "SQL_TSV_ZIP", + user = NULL, + pwd = NULL, + email = NULL, + validate = TRUE, + curlopts = list()) { + + z <- occ_download_sql_prep(q=q, + format=format, + user=user, + pwd=pwd, + email=email, + validate=validate, + curlopts=curlopts) + + out <- rg_POST(z$url, req = z$request, user = z$user, pwd = z$pwd, curlopts=curlopts) + md <- occ_download_meta(out) # get meta_data for printing + citation <- gbif_citation(md)$download # get citation + + structure(out, + class = "occ_download_sql", + user = z$user, + email = z$email, + format = z$format, + status = md$status, + created = md$created, + downloadLink = md$downloadLink, + doi = md$doi, + citation = citation + ) + +} + +#' @name occ_download_sql +#' @export +occ_download_sql_validate <- function(q = NULL, + user = NULL, + pwd = NULL) { + stopifnot(is.list(q)) + url <- "https://api.gbif.org/v1/occurrence/download/request/validate" + user <- check_user(user) + pwd <- check_pwd(pwd) + out <- rg_POST(url=url, req=q, user=user, pwd=pwd, curlopts=list()) + out +} + +#' @name occ_download_sql +#' @export +occ_download_sql_prep <- function(q=NULL, + format = "SQL_TSV_ZIP", + user = NULL, + pwd = NULL, + email = NULL, + validate = TRUE, + 
curlopts = list()) { + + url <- paste0(gbif_base(), '/occurrence/download/request') + assert(q,"character") + assert(format,"character") + if(!format == "SQL_TSV_ZIP") stop("Only format='SQL_TSV_ZIP' is supported at this time.") + + user <- check_user(user) + pwd <- check_pwd(pwd) + email <- check_email(email) + + req <- list( + sendNotification = TRUE, + notificationAddresses = email, + format = unbox(format), + sql = unbox(q) + ) + + if(validate) occ_download_sql_validate(q = req, user = user, pwd = pwd) + + structure(list( + url = url, + request = req, + json_request = jsonlite::prettify(check_inputs(req),indent = 1), + user = user, + pwd = pwd, + email = email, + format = format, + curlopts = curlopts), + class = "occ_download_sql_prep") + +} + +print.occ_download_sql <- function(x) { + stopifnot(inherits(x, 'occ_download_sql')) + cat_n("<>") + cat_n(" Your download is being processed by GBIF:") + cat_n(" https://www.gbif.org/occurrence/download/",x) + cat_n(" Check status with") + cat_n(" occ_download_wait('",x,"')") + cat_n(" After it finishes, use") + cat_n(" d <- occ_download_get('",x,"') %>%") + cat_n(" occ_download_import()") + cat_n(" to retrieve your download.") + cat_n("Download Info:") + cat_n(" Username: ", attr(x, "user")) + cat_n(" E-mail: ", attr(x, "email")) + cat_n(" Format: ", attr(x, "format")) + cat_n(" Download key: ", x) + cat_n(" Created: ",attr(x, "created")) + cat_n("Citation Info: ") + cat_n(" Please always cite the download DOI when using this data.") + cat_n(" https://www.gbif.org/citation-guidelines") + cat_n(" DOI: ", attr(x,"doi")) + cat_n(" Citation:") + cat_n(" ", attr(x,"citation")) +} + + +print.occ_download_sql_prep <- function(x) { + stopifnot(inherits(x, 'occ_download_sql_prep')) + cat_n("<>") + cat_n("Format: ", x$format) + cat_n("Email: ", x$email) + cat_n("Request: ", x$json_request) +} diff --git a/man/occ_download_sql.Rd b/man/occ_download_sql.Rd new file mode 100644 index 00000000..df0a835f --- /dev/null +++ 
b/man/occ_download_sql.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/occ_download_sql.R +\name{occ_download_sql} +\alias{occ_download_sql} +\alias{occ_download_sql_validate} +\alias{occ_download_sql_prep} +\title{Download occurrence data using a SQL query} +\usage{ +occ_download_sql( + q = NULL, + format = "SQL_TSV_ZIP", + user = NULL, + pwd = NULL, + email = NULL, + validate = TRUE, + curlopts = list() +) + +occ_download_sql_validate(q = NULL, user = NULL, pwd = NULL) + +occ_download_sql_prep( + q = NULL, + format = "SQL_TSV_ZIP", + user = NULL, + pwd = NULL, + email = NULL, + validate = TRUE, + curlopts = list() +) +} +\arguments{ +\item{q}{sql query} + +\item{format}{only "SQL_TSV_ZIP" is supported right now} + +\item{user}{your GBIF user name} + +\item{pwd}{your GBIF password} + +\item{email}{your email address} + +\item{validate}{should the query be validated before submission. Default is +TRUE.} + +\item{curlopts}{list of curl options} +} +\value{ +an object of class 'occ_download_sql' +} +\description{ +Download occurrence data using a SQL query +} +\details{ +This is an experimental feature, and the implementation may change throughout +2024. The feature is currently only available for preview by invited users. +Contact \code{helpdesk@gbif.org} to request access. 
+ +Please see the article here for more information: +\url{https://docs.ropensci.org/rgbif/articles/getting_occurrence_data.html} +} +\examples{ +\dontrun{ +occ_download_sql("SELECT gbifid,countryCode FROM occurrence + WHERE genusKey = 2435098") +} + +} +\references{ +\url{https://techdocs.gbif.org/en/data-use/api-sql-downloads} +} diff --git a/man/rgbif-package.Rd b/man/rgbif-package.Rd index a9e9b991..b8ac6c7b 100644 --- a/man/rgbif-package.Rd +++ b/man/rgbif-package.Rd @@ -28,7 +28,7 @@ names and metadata \item occurrences \url{https://www.gbif.org/developer/occurrence} - Occurrences \item maps \url{https://www.gbif.org/developer/maps} - Maps - these APIs -are not implemented in \pkg{rgbif}, and are meant more for intergration +are not implemented in \pkg{rgbif}, and are meant more for integration with web based maps. } } diff --git a/tests/fixtures/occ_download_sql_1.yml b/tests/fixtures/occ_download_sql_1.yml new file mode 100644 index 00000000..8e217852 --- /dev/null +++ b/tests/fixtures/occ_download_sql_1.yml @@ -0,0 +1,76 @@ +http_interactions: +- request: + method: post + uri: https://api.gbif.org/v1/occurrence/download/request/validate + body: + encoding: '' + string: '{"sendNotification":[true],"notificationAddresses":["@gbif.org"],"format":"SQL_TSV_ZIP","sql":"SELECT + gbifid,countryCode FROM occurrence WHERE genusKey = 2435098"}' + headers: + Accept-Encoding: gzip, deflate + Content-Type: application/json + Accept: application/json + response: + status: + status_code: '201' + message: Created + explanation: Document created, URL follows + headers: + status: HTTP/1.1 201 Created + content-type: application/json + body: + encoding: '' + file: no + string: '{"sql":"SELECT gbifid, countrycode\nFROM occurrence\nWHERE occurrence.genuskey + = 2435098","notificationAddresses":["@gbif.org"],"sendNotification":false,"type":"OCCURRENCE","format":"SQL_TSV_ZIP"}' + recorded_at: 2024-09-24 14:05:47 GMT + recorded_with: vcr/1.2.0, webmockr/0.9.0 +- request: + method: 
post + uri: https://api.gbif.org/v1/occurrence/download/request + body: + encoding: '' + string: '{"sendNotification":[true],"notificationAddresses":["@gbif.org"],"format":"SQL_TSV_ZIP","sql":"SELECT + gbifid,countryCode FROM occurrence WHERE genusKey = 2435098"}' + headers: + Accept-Encoding: gzip, deflate + Content-Type: application/json + Accept: application/json + response: + status: + status_code: '201' + message: Created + explanation: Document created, URL follows + headers: + status: HTTP/1.1 201 Created + content-type: application/json + body: + encoding: '' + file: no + string: 0028400-240906103802322 + recorded_at: 2024-09-24 14:05:47 GMT + recorded_with: vcr/1.2.0, webmockr/0.9.0 +- request: + method: get + uri: https://api.gbif.org/v1/occurrence/download/0028400-240906103802322 + body: + encoding: '' + string: '' + headers: + Accept-Encoding: gzip, deflate + Accept: application/json, text/xml, application/xml, */* + response: + status: + status_code: '200' + message: OK + explanation: Request fulfilled, document follows + headers: + status: HTTP/1.1 200 OK + content-type: application/json + body: + encoding: '' + file: no + string: '{"key":"0028400-240906103802322","doi":"10.15468/dl.fnrv3s","license":"unspecified","request":{"sql":"SELECT + gbifid,countryCode FROM occurrence WHERE genusKey = 2435098","creator":"","notificationAddresses":["@gbif.org"],"sendNotification":false,"type":"OCCURRENCE","format":"SQL_TSV_ZIP"},"created":"2024-09-24T14:05:47.599+00:00","modified":"2024-09-24T14:05:47.599+00:00","eraseAfter":"2025-03-24T14:05:47.552+00:00","status":"PREPARING","downloadLink":"https://api.gbif.org/v1/occurrence/download/request/0028400-240906103802322.zip","size":0,"totalRecords":0,"numberDatasets":0,"source":"rgbif"}' + recorded_at: 2024-09-24 14:05:47 GMT + recorded_with: vcr/1.2.0, webmockr/0.9.0 diff --git a/tests/testthat/test-occ_download_sql.R b/tests/testthat/test-occ_download_sql.R new file mode 100644 index 00000000..2652917b --- 
/dev/null +++ b/tests/testthat/test-occ_download_sql.R @@ -0,0 +1,30 @@ + +test_that("occ_download_sql : real requests work", { + skip_on_cran() + skip_on_ci() + + vcr::use_cassette("occ_download_sql_1", { + qqq <- occ_download_sql("SELECT gbifid,countryCode FROM occurrence WHERE genusKey = 2435098") + }, match_requests_on = c("method", "uri", "body")) + expect_is(qqq, "occ_download_sql") + expect_equal(attr(qqq, "status"), "PREPARING") + expect_equal(attr(qqq, "format"), "SQL_TSV_ZIP") + print(qqq) + +}) + +test_that("occ_download_sql : fails well", { + skip_on_cran() + skip_on_ci() + + expect_error(occ_download_sql("dog")) + expect_error(occ_download_sql("SELECT * FROM occurrence")) + expect_error(occ_download_sql("SELECT dog FROM occurrence")) +}) + + + + + + + diff --git a/vignettes/gbif_sql_downloads.Rmd b/vignettes/gbif_sql_downloads.Rmd new file mode 100644 index 00000000..76de9c3b --- /dev/null +++ b/vignettes/gbif_sql_downloads.Rmd @@ -0,0 +1,238 @@ +--- +title: GBIF SQL Downloads +author: John Waller +date: "2024-09-25" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{gbif_sql_download} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +> This is an experimental feature, and the implementation may change throughout 2024. The feature is currently only available for preview by invited users. Contact [**helpdesk\@gbif.org**](mailto:helpdesk@gbif.org){.email} to request access. + +> If your download can be formulated using the traditional predicate downloads, it is usually going to be **much** faster to use `occ_download()`. See article [Getting Occurrence Counts From GBIF](https://docs.ropensci.org/rgbif/articles/occ_counts.html). + +The experimental **Occurrence SQL Download API** allows users to query GBIF occurrences using SQL. 
In contrast to the [Predicate Download API](https://techdocs.gbif.org/en/data-use/api-sql-downloads), the SQL API allows selection of the columns of interest and generation of summary views of GBIF data. + +SQL downloads, like regular downloads, require you to set up your **GBIF credentials**. I suggest that you follow this [short tutorial](https://docs.ropensci.org/rgbif/articles/gbif_credentials.html) before continuing. For the time being, SQL downloads are only available for preview by invited users. Contact **helpdesk\@gbif.org** to request access. + +``` r +# test if your download is set up correctly +# occ_download_sql_prep("SELECT datasetKey, countryCode, COUNT(*) FROM occurrence WHERE continent = 'EUROPE' GROUP BY datasetKey, countryCode") +occ_download_sql("SELECT datasetKey, countryCode, COUNT(*) FROM occurrence WHERE continent = 'EUROPE' GROUP BY datasetKey, countryCode") +``` + +`occ_download_get()` and `occ_download_import()` still work with SQL downloads. + +``` r +occ_download_get("0000967-240425142415019") %>% + occ_download_import() +``` + +## Supported SQL + +Only `SELECT` queries are supported, and only queries against a single table named `occurrence`. `JOIN` queries and sub-queries are not allowed. Selecting `*` is also not allowed. One must explicitly select the columns needed. + +`GROUP BY` queries are supported, as are basic SQL window functions (`OVER` and `PARTITION BY`). `ORDER BY` is supported. + +Most common SQL operators and functions are supported, such as `AND`, `OR`, `NOT`, `IS NULL`, `RAND()`, `ROUND(…)`, `LOWER(…)`, etc. Case is ignored by the GBIF SQL parser, and all column names are returned as lowercase. + +You can use `occ_download_sql_prep()` to check if your query is valid. 
+ +``` r +occ_download_sql_prep("SELECT * FROM occurrence WHERE year = 2000") +# Should return an ERROR since "*" is not allowed +``` + +If you need **all** occurrence columns, you can use the regular [download interface](https://docs.ropensci.org/rgbif/articles/getting_occurrence_data.html) `occ_download(pred("year", "2000"))` instead of the SQL interface. + +> Note that if you are doing `GROUP BY` `COUNT(*)` type queries for a single dimension, then `occ_count(facet="x")` is usually going to be a much faster option. See article [Getting Occurrence Counts From GBIF](https://docs.ropensci.org/rgbif/articles/occ_counts.html). + +To get a list of all +400 columns and definitions in the **occurrence** table, you can use `occ_download_describe("sql")$fields`. + +## SQL examples - Species Counts + +One common query that is difficult to do with the regular download interface is to get a count of species by some other dimension. Keep in mind that if you only need species counts for one dimension, then `occ_count(facet="x")` is usually going to be a much faster option. + +**Countries with the most species published to GBIF.** + +``` r +sql <- +" +SELECT publishingcountry, specieskey, COUNT(*) as occurrence_count +FROM occurrence +WHERE publishingcountry IS NOT NULL AND specieskey IS NOT NULL +GROUP BY publishingcountry, specieskey +ORDER BY occurrence_count DESC; +" +occ_download_sql(sql) +``` + +## SQL examples - Time series + +Here is an example that retrieves the number of species published to GBIF grouped by year and basis of record. Keep in mind that `year` in this context is the collection/event date not the publication date to GBIF. 
+ +``` r +library(rgbif) +library(ggplot2) + +sql <- 'SELECT "year", basisofrecord, COUNT(DISTINCT specieskey) as unique_species_count FROM occurrence GROUP BY basisofrecord, "year"' + +occ_download_sql(sql) + +d <- occ_download_get('0001697-240626123714530') %>% + occ_download_import() %>% + filter(!year == 2024) %>% + mutate(date = as.Date(paste0(year, "-01-01"))) + +ggplot(d, aes(x = date, y = unique_species_count, fill = basisofrecord)) + + geom_bar(stat = "identity") + + labs(x = "Year", y = "Unique Species Count", fill = "Basis of Record") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +``` + +Note that `"year"` needs to be double quoted as it is a reserved word. This is true also for other reserved words like `"month"`, `"day"` etc. + +```{r, echo = FALSE,out.width="50%"} +knitr::include_graphics("img/ts.png") +``` + +## SQL examples - Grid Functions + +Making a global map of unique species counts per grid cell is a common task, but because it requires a spatial join with the chosen spatial grid, it can be difficult to do without working with sometimes extremely large amounts of occurrences. + +For this reason GBIF's SQL downloads provide support for a few pre-defined [grid functions](https://techdocs.gbif.org/en/data-use/api-sql-download-functions). These functions will return a **grid cell code** for each occurrence, which can then be used to aggregate or plot the data. + +- **EEA Reference Grid**, GBIF_EEARGCode +- **Military Grid Reference System**, GBIF_MGRSCode +- **Quarter degree cell code**, GBIF_QDGCCode +- **ISEA3H Grid cell code**, GBIF_ISEA3HCode + +Below is an example of working with the **Military Grid Reference System** (MGRS) grid. This example uses shapefiles from [this repository](https://github.com/klaukh/MGRS). The example shows how to produce a simple map of species counts per grid cell, using `occ_download_sql()` and the custom sql function `GBIF_MGRSCode()`. 
+ +The sql grid functions were originally designed to be used for creating [species occurrence cubes](https://b-cubed.eu/data-and-evidence). Therefore a randomization parameter is supported (the final `0` argument in the examples below). This should be set to 0 if you want to use the grid functions with **no randomization**. + +You can download the combined shapefile I used for this example here: + +``` r +library(sf) +library(dplyr) +library(rgbif) +library(ggplot2) +library(purrr) + +sql <- +" +SELECT + GBIF_MGRSCode( + 100000, + decimalLatitude, + decimalLongitude, + 0 + ) AS mgrs, + COUNT(DISTINCT speciesKey) AS unique_species_count +FROM + occurrence +GROUP BY + mgrs +" + +# uncomment to run +# occ_download_sql(sql) + +# change id 0029823-240906103802322 to your download id +d <- occ_download_get('0029823-240906103802322') %>% + occ_download_import() %>% + filter(!mgrs == "") %>% + rename(MGRS = mgrs) + +path <- "mgrs-grid/" + +mgrs_grid <- st_layers(path)$name %>% +map(~ st_read(path, layer = .) %>% st_transform(4326)) %>% +dplyr::bind_rows() %>% +left_join(d, by = "MGRS") + +p <- ggplot(mgrs_grid) + +geom_sf(aes(fill = log(unique_species_count))) + +scale_fill_viridis_c() + +theme_minimal() + +``` + + + + + +> Note, there is an open issue with the MGRS grid where certain codes are missing from the shapefile near Australia. + +```{r, echo = FALSE,out.width="50%"} +knitr::include_graphics("img/species_count_grid.png") +``` + + + +Below is another example working with the **EEA Reference Grid**, which only covers continental Europe. 
+ +The EEA reference shapefile for example below can be found here: + +``` r +library(sf) +library(dplyr) +library(rgbif) +library(ggplot2) +library(purrr) + +sql <- +" +SELECT + GBIF_EEARGCode( + 10000, + decimalLatitude, + decimalLongitude, + 0 + ) AS cellcode, + COUNT(DISTINCT speciesKey) AS unique_species_count +FROM + occurrence +GROUP BY + cellcode +" + +# uncomment to run +# occ_download_sql(sql) + +# change id 0030075-240906103802322 to your download id +d <- occ_download_get('0030075-240906103802322') %>% + occ_download_import() %>% + filter(!cellcode == "") + +eea_grid <- st_read("eea_v_3035_10_km_eea-ref-grid-gb_p_2013_v02_r00") %>% +mutate(cellcode = CELLCODE) %>% +left_join(d, by = "cellcode") + +p <- ggplot(eea_grid) + +geom_sf(aes(fill = log(unique_species_count))) + +scale_fill_viridis_c() + +theme_minimal() + +``` + + + + +```{r, echo = FALSE,out.width="50%"} +knitr::include_graphics("img/uk-grid.jpg") +``` + +## Further reading + +- +- +- diff --git a/vignettes/img/species_count_grid.png b/vignettes/img/species_count_grid.png new file mode 100644 index 00000000..78b2e2ad Binary files /dev/null and b/vignettes/img/species_count_grid.png differ diff --git a/vignettes/img/ts.png b/vignettes/img/ts.png new file mode 100644 index 00000000..c0b03d73 Binary files /dev/null and b/vignettes/img/ts.png differ diff --git a/vignettes/img/uk-grid.jpg b/vignettes/img/uk-grid.jpg new file mode 100644 index 00000000..270f650a Binary files /dev/null and b/vignettes/img/uk-grid.jpg differ