dataset interface improvements (#693)
* adding dataset_doi

* adding support for dataset_export and adding search terms to dataset_search

* fixing some tests

* ensure proper column types for dataset_export

* updating dataset_suggest

* deleting unnecessary template

* adding tests for dataset_suggest

* adding support for dataset endpoint

* adding support for dataset_list_funs

* updating docs

* fixing tests

* deprecating datasets

---------

Co-authored-by: John Waller <[email protected]>
jhnwllr and John Waller authored Jan 4, 2024
1 parent 64d4762 commit 84da7b4
Showing 45 changed files with 17,523 additions and 19,089 deletions.
15 changes: 15 additions & 0 deletions NAMESPACE
@@ -26,10 +26,25 @@ export(as.download)
export(blanktheme)
export(check_wkt)
export(count_facet)
export(dataset)
export(dataset_comment)
export(dataset_constituents)
export(dataset_contact)
export(dataset_doi)
export(dataset_duplicate)
export(dataset_endpoint)
export(dataset_export)
export(dataset_get)
export(dataset_gridded)
export(dataset_identifier)
export(dataset_machinetag)
export(dataset_metrics)
export(dataset_networks)
export(dataset_noendpoint)
export(dataset_process)
export(dataset_search)
export(dataset_suggest)
export(dataset_tag)
export(datasets)
export(derived_dataset)
export(derived_dataset_prep)
115 changes: 115 additions & 0 deletions R/dataset.R
@@ -0,0 +1,115 @@
#' Search for more obscure dataset metadata.
#'
#' @param country The 2-letter country code (as per ISO-3166-1) of the country
#' publishing the dataset.
#' @param type The primary type of the dataset.
#' Available values : OCCURRENCE, CHECKLIST, METADATA, SAMPLING_EVENT,
#' MATERIAL_ENTITY.
#' @param identifierType An identifier type for the identifier parameter.
#' Available values : URL, LSID, HANDLER, DOI, UUID, FTP, URI, UNKNOWN,
#' GBIF_PORTAL, GBIF_NODE, GBIF_PARTICIPANT, GRSCICOLL_ID, GRSCICOLL_URI,
#' IH_IRN, ROR, GRID, CITES, SYMBIOTA_UUID, WIKIDATA, NCBI_BIOCOLLECTION.
#' @param identifier An identifier of the type given by the identifierType
#' parameter.
#' @param machineTagNamespace Filters for entities with a machine tag in the
#' specified namespace.
#' @param machineTagName Filters for entities with a machine tag with the
#' specified name (use in combination with the machineTagNamespace parameter).
#' @param machineTagValue Filters for entities with a machine tag with the
#' specified value (use in combination with the machineTagNamespace and machineTagName parameters).
#' @param modified The modified date of the dataset. Accepts ranges, and a '*'
#' can be used as a wildcard, e.g. modified=2023-04-01,*
#' @param query Simple full text search parameter. The value for this parameter
#' can be a simple word or a phrase. Wildcards are not supported.
#' @param deleted Logical specifying whether to return only deleted datasets.
#' @param limit Controls the number of results in the page.
#' @param start Determines the start for the search results.
#' @param curlopts options passed on to [crul::HttpClient].
#'
#' @return A `list`.
#'
#' @details
#' This function allows you to search on some of the more obscure dataset
#' metadata that `dataset_search()` does not cover, for example registry
#' machine tags.
#'
#' @export
#'
#' @examples \dontrun{
#' dataset(limit=3)
#' dataset(country="US",limit=3)
#' dataset(type="CHECKLIST",limit=3)
#' dataset(identifierType = "URL",limit=3)
#' dataset(identifier = 168,limit=3)
#' dataset(machineTagNamespace = "metasync.gbif.org",limit=3)
#' dataset(machineTagName = "datasetTitle",limit=3)
#' dataset(machineTagValue = "Borkhart",limit=3)
#' dataset(modified = "2023-04-01", limit=3)
#' dataset(query = "dog", limit=3)
#' dataset(deleted=TRUE,limit=3)
#' }
dataset <- function(country = NULL,
type = NULL,
identifierType = NULL,
identifier = NULL,
machineTagNamespace = NULL,
machineTagName = NULL,
machineTagValue = NULL,
modified = NULL,
query = NULL,
deleted = FALSE,
limit = NULL,
start = NULL,
curlopts = list()) {

assert(country, "character")
assert(type, "character")
assert(identifierType, "character")
assert(machineTagNamespace, "character")
assert(machineTagName, "character")
assert(machineTagValue, "character")
assert(modified, "character")
assert(query, "character")

args <- as.list(
rgbif_compact(c(q=query,
limit=limit,
offset=start
)))

args <- as.list(
rgbif_compact(c(
args,
convmany(country),
convmany(type),
convmany(identifierType),
convmany(identifier),
convmany(machineTagNamespace),
convmany(machineTagName),
convmany(machineTagValue),
convmany(modified)
)))

if(deleted) {
url <- paste0(gbif_base(), '/dataset/deleted/')
} else {
url <- paste0(gbif_base(), '/dataset/')
}
tt <- gbif_GET(url, args, FALSE, curlopts)

meta <- tt[c('offset','limit','endOfRecords','count')]

if (length(tt$results) == 0) {
out <- NULL
} else {
# wrap multi-valued fields so each dataset still maps to one tibble row
nest_if_needed <- function(x) if (length(x) > 1) list(x) else x
out <- lapply(tt$results,function(x) tibble::as_tibble(lapply(x, nest_if_needed)))
out <- bind_rows(out)
}

list(meta = data.frame(meta), data = out)
}
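
A minimal usage sketch for the new function (assuming rgbif is loaded; the filter values are taken from the examples above and are just illustrative):

library(rgbif)
# search registry machine tags, something dataset_search() does not cover
res <- dataset(machineTagNamespace = "metasync.gbif.org", limit = 3)
res$meta  # one-row data.frame: offset, limit, endOfRecords, count
res$data  # tibble of matching datasets, or NULL if nothing matched
# fetch the next page by advancing start by the previous limit
res2 <- dataset(machineTagNamespace = "metasync.gbif.org", limit = 3, start = 3)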




27 changes: 27 additions & 0 deletions R/dataset_doi.R
@@ -0,0 +1,27 @@
#' Get a GBIF dataset from a DOI
#'
#' @param doi The DOI of the dataset you wish to look up.
#' @param limit Controls the number of results in the page.
#' @param start Determines the offset for the search results.
#' @param curlopts options passed on to [crul::HttpClient].
#'
#' @details This function allows for dataset lookup using a DOI. Be aware that
#' some DOIs have more than one dataset associated with them.
#'
#' @return A `list`.
#' @export
#'
#' @examples \dontrun{
#' dataset_doi('10.15468/igasai')
#' }
dataset_doi <- function(doi=NULL, limit = 20, start=NULL, curlopts = list()) {
assert(doi,"character")
is_doi <- grepl("^(10\\.\\d{4,9}/[-._;()/:A-Z0-9]+)$", doi, perl = TRUE,
ignore.case = TRUE)
if(!is_doi) warning("The DOI you supplied might not be valid.")
url <- paste0(gbif_base(), '/dataset/doi/',doi)
args <- rgbif_compact(list(limit = as.integer(limit),
offset = start))
res <- gbif_GET(url, args, TRUE, curlopts)
list(meta = get_meta(res), data = parse_results(res, NULL))
}
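
A quick sketch of the lookup, reusing the DOI from the example above:

res <- dataset_doi("10.15468/igasai")
res$meta  # paging metadata for the lookup
res$data  # tibble; may hold several rows, since one DOI can map to multiple datasets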
68 changes: 68 additions & 0 deletions R/dataset_export.R
@@ -0,0 +1,68 @@
#' @name dataset_search
#' @export
dataset_export <- function(query = NULL,
type = NULL,
publishingCountry= NULL,
subtype = NULL,
license = NULL,
keyword = NULL,
publishingOrg = NULL,
hostingOrg = NULL,
endorsingNodeKey = NULL,
decade = NULL,
projectId = NULL,
hostingCountry = NULL,
networkKey = NULL,
doi = NULL
) {

assert(query,"character")
assert(type,"character")
assert(subtype,"character")
assert(license,"character")
assert(keyword,"character")
assert(publishingOrg,"character")
assert(hostingOrg,"character")
assert(endorsingNodeKey,"character")
assert(publishingCountry,"character")
assert(projectId,"character")
assert(hostingCountry,"character")
assert(networkKey,"character")
assert(doi,"character")

# args with single value
args <- rgbif_compact(list(
format = "TSV",
q = query
))

args <- rgbif_compact(c(
args,
convmany(type),
convmany(subtype),
convmany(license),
convmany(keyword),
convmany(publishingOrg),
convmany(hostingOrg),
convmany(endorsingNodeKey),
convmany(decade),
convmany(publishingCountry),
convmany(projectId),
convmany(hostingCountry),
convmany(networkKey),
convmany(doi)
))

url_query <- paste0(names(args),"=",args,collapse="&")
url <- paste0(gbif_base(),"/dataset/search/export?",url_query)
url <- gsub("\\[|\\]","",url)
url <- utils::URLencode(url)
temp_file <- tempfile()
utils::download.file(url,destfile=temp_file,quiet=TRUE)
out <- tibble::as_tibble(data.table::fread(temp_file, showProgress=FALSE))
colnames(out) <- to_camel(colnames(out))
out[] <- lapply(out, as.character)
out$occurrenceRecordsCount <- as.numeric(out$occurrenceRecordsCount)
out$nameUsagesCount <- as.numeric(out$nameUsagesCount)
out
}
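
Unlike the paged search functions, dataset_export() downloads the full TSV export in one call; a short sketch (the filter values here are illustrative):

# every CHECKLIST dataset published from Denmark, as a single tibble, no paging
cl <- dataset_export(type = "CHECKLIST", publishingCountry = "DK")
# only the two count columns come back numeric; all other columns are character
summary(cl$occurrenceRecordsCount)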
52 changes: 52 additions & 0 deletions R/dataset_list_funs.R
@@ -0,0 +1,52 @@
#' List datasets that are duplicates or have no endpoint.
#'
#' @param limit Controls the number of results in the page.
#' @param start Determines the start for the search results.
#' @param curlopts options passed on to [crul::HttpClient].
#'
#' @return A `list`.
#'
#' @details
#' Get a list of duplicate datasets or datasets with no endpoint. You get the
#' full results back, and no parameters aside from `limit` and `start` are
#' accepted.
#'
#'
#' @examples \dontrun{
#' dataset_noendpoint(limit=3)
#' }
#' @name dataset_list_funs
#' @export
dataset_duplicate <- function(limit=20,start=NULL,curlopts=list()) {
dataset_list_get_(endpoint="duplicate/",limit=limit,start=start,
curlopts=curlopts,meta=TRUE)
}

#' @name dataset_list_funs
#' @export
dataset_noendpoint <- function(limit=20,start=NULL,curlopts=list()) {
dataset_list_get_(endpoint="withNoEndpoint/",limit=limit,start=start,
curlopts=curlopts,meta=TRUE)
}

dataset_list_get_ <- function(endpoint,limit=NULL,start=NULL,curlopts,meta) {
url <- paste0(gbif_base(),"/dataset/",endpoint)
if(!is.null(limit)) {
args <- rgbif_compact(c(limit=limit,offset=start))
tt <- gbif_GET(url, args, TRUE, curlopts)
} else {
tt <- gbif_GET(url, args = NULL, TRUE, curlopts)
}
if(meta) {
meta <- tt[c('offset','limit','endOfRecords','count')]
if (length(tt$results) == 0) {
out <- NULL
} else {
out <- tibble::as_tibble(tt$results)
}
list(meta = data.frame(meta), data = out)
} else {
tibble::as_tibble(tt)
}
}
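
Both helpers share the paging shape of dataset_list_get_(); a minimal sketch:

dup <- dataset_duplicate(limit = 3)
dup$meta   # offset, limit, endOfRecords, count
noep <- dataset_noendpoint(limit = 3, start = 3)  # second page of three
noep$data  # tibble of datasets with no endpoint, or NULL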
28 changes: 0 additions & 28 deletions R/dataset_metrics.r

This file was deleted.
