dataset interface improvements (#693)
* adding dataset_doi

* adding support for dataset_export and adding search terms to dataset_search

* fixing some tests

* ensure proper column types for dataset_export

* updating dataset_suggest

* deleting unnecessary template

* adding tests for dataset_suggest

* adding support for dataset endpoint

* adding support for dataset_list_funs

* updating docs

* fixing tests

* deprecating datasets

---------

Co-authored-by: John Waller <[email protected]>
jhnwllr and John Waller authored Jan 4, 2024
1 parent 64d4762 commit 84da7b4
Showing 45 changed files with 17,523 additions and 19,089 deletions.
15 changes: 15 additions & 0 deletions NAMESPACE
@@ -26,10 +26,25 @@ export(as.download)
export(blanktheme)
export(check_wkt)
export(count_facet)
export(dataset)
export(dataset_comment)
export(dataset_constituents)
export(dataset_contact)
export(dataset_doi)
export(dataset_duplicate)
export(dataset_endpoint)
export(dataset_export)
export(dataset_get)
export(dataset_gridded)
export(dataset_identifier)
export(dataset_machinetag)
export(dataset_metrics)
export(dataset_networks)
export(dataset_noendpoint)
export(dataset_process)
export(dataset_search)
export(dataset_suggest)
export(dataset_tag)
export(datasets)
export(derived_dataset)
export(derived_dataset_prep)
115 changes: 115 additions & 0 deletions R/dataset.R
@@ -0,0 +1,115 @@
#' Search for more obscure dataset metadata.
#'
#' @param country The 2-letter country code (as per ISO-3166-1) of the country
#' publishing the dataset.
#' @param type The primary type of the dataset.
#' Available values : OCCURRENCE, CHECKLIST, METADATA, SAMPLING_EVENT,
#' MATERIAL_ENTITY.
#' @param identifierType An identifier type for the identifier parameter.
#' Available values : URL, LSID, HANDLER, DOI, UUID, FTP, URI, UNKNOWN,
#' GBIF_PORTAL, GBIF_NODE, GBIF_PARTICIPANT, GRSCICOLL_ID, GRSCICOLL_URI,
#' IH_IRN, ROR, GRID, CITES, SYMBIOTA_UUID, WIKIDATA, NCBI_BIOCOLLECTION.
#' @param identifier An identifier of the type given by the identifierType
#' parameter.
#' @param machineTagNamespace Filters for entities with a machine tag in the
#' specified namespace.
#' @param machineTagName Filters for entities with a machine tag with the
#' specified name (use in combination with the machineTagNamespace parameter).
#' @param machineTagValue Filters for entities with a machine tag with the
#' specified value (use in combination with the machineTagNamespace and machineTagName parameters).
#' @param modified The modified date of the dataset. Accepts ranges, and a '*'
#' can be used as a wildcard, e.g. modified=2023-04-01,*
#' @param query Simple full text search parameter. The value for this parameter
#' can be a simple word or a phrase. Wildcards are not supported.
#' @param deleted Logical specifying whether to return only deleted datasets.
#' @param limit Controls the number of results in the page.
#' @param start Determines the start for the search results.
#' @param curlopts options passed on to [crul::HttpClient].
#'
#' @return A `list`.
#'
#' @details
#' This function allows you to search on some of the more obscure dataset
#' metadata that `dataset_search()` does not cover, for example registry
#' machine tags.
#'
#' @export
#'
#' @examples \dontrun{
#' dataset(limit=3)
#' dataset(country="US",limit=3)
#' dataset(type="CHECKLIST",limit=3)
#' dataset(identifierType = "URL",limit=3)
#' dataset(identifier = 168,limit=3)
#' dataset(machineTagNamespace = "metasync.gbif.org",limit=3)
#' dataset(machineTagName = "datasetTitle",limit=3)
#' dataset(machineTagValue = "Borkhart",limit=3)
#' dataset(modified = "2023-04-01", limit=3)
#' dataset(query = "dog", limit=3)
#' dataset(deleted=TRUE,limit=3)
#' }
dataset <- function(country = NULL,
type = NULL,
identifierType = NULL,
identifier = NULL,
machineTagNamespace = NULL,
machineTagName = NULL,
machineTagValue = NULL,
modified = NULL,
query = NULL,
deleted = FALSE,
limit = NULL,
start = NULL,
curlopts = list()) {

assert(country, "character")
assert(type, "character")
assert(identifierType, "character")
assert(machineTagNamespace, "character")
assert(machineTagName, "character")
assert(machineTagValue, "character")
assert(modified, "character")
assert(query, "character")

args <- as.list(
rgbif_compact(c(q=query,
limit=limit,
offset=start
)))

args <- as.list(
rgbif_compact(c(
args,
convmany(country),
convmany(type),
convmany(identifierType),
convmany(identifier),
convmany(machineTagNamespace),
convmany(machineTagName),
convmany(machineTagValue),
convmany(modified)
)))

if(deleted) {
url <- paste0(gbif_base(), '/dataset/deleted/')
} else {
url <- paste0(gbif_base(), '/dataset/')
}
tt <- gbif_GET(url, args, FALSE, curlopts)

meta <- tt[c('offset','limit','endOfRecords','count')]

if (length(tt$results) == 0) {
out <- NULL
} else {
# wrap multi-valued fields so each dataset still maps to one tibble row
nest_if_needed <- function(x) if (length(x) > 1) list(x) else x
out <- lapply(tt$results,function(x) tibble::as_tibble(lapply(x, nest_if_needed)))
out <- bind_rows(out)
}

list(meta = data.frame(meta), data = out)
}
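
A minimal usage sketch for the new function (assuming rgbif is loaded; the filter values are taken from the examples above and are just illustrative):

library(rgbif)
# search registry machine tags, something dataset_search() does not cover
res <- dataset(machineTagNamespace = "metasync.gbif.org", limit = 3)
res$meta  # one-row data.frame: offset, limit, endOfRecords, count
res$data  # tibble of matching datasets, or NULL if nothing matched
# fetch the next page by advancing start by the previous limit
res2 <- dataset(machineTagNamespace = "metasync.gbif.org", limit = 3, start = 3)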




27 changes: 27 additions & 0 deletions R/dataset_doi.R
@@ -0,0 +1,27 @@
#' Get a GBIF dataset from a DOI
#'
#' @param doi The DOI of the dataset you wish to look up.
#' @param limit Controls the number of results in the page.
#' @param start Determines the offset for the search results.
#' @param curlopts options passed on to [crul::HttpClient].
#'
#' @details This function allows for dataset lookup using a DOI. Be aware that
#' some DOIs have more than one dataset associated with them.
#'
#' @return A `list`.
#' @export
#'
#' @examples \dontrun{
#' dataset_doi('10.15468/igasai')
#' }
dataset_doi <- function(doi=NULL, limit = 20, start=NULL, curlopts = list()) {
assert(doi,"character")
is_doi <- grepl("^(10\\.\\d{4,9}/[-._;()/:A-Z0-9]+)$", doi, perl = TRUE,
ignore.case = TRUE)
if(!is_doi) warning("The DOI you supplied might not be valid.")
url <- paste0(gbif_base(), '/dataset/doi/',doi)
args <- rgbif_compact(list(limit = as.integer(limit),
offset = start))
res <- gbif_GET(url, args, TRUE, curlopts)
list(meta = get_meta(res), data = parse_results(res, NULL))
}
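
A quick sketch of the lookup, reusing the DOI from the example above:

res <- dataset_doi("10.15468/igasai")
res$meta  # paging metadata for the lookup
res$data  # tibble; may hold several rows, since one DOI can map to multiple datasets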
68 changes: 68 additions & 0 deletions R/dataset_export.R
@@ -0,0 +1,68 @@
#' @name dataset_search
#' @export
dataset_export <- function(query = NULL,
type = NULL,
publishingCountry= NULL,
subtype = NULL,
license = NULL,
keyword = NULL,
publishingOrg = NULL,
hostingOrg = NULL,
endorsingNodeKey = NULL,
decade = NULL,
projectId = NULL,
hostingCountry = NULL,
networkKey = NULL,
doi = NULL
) {

assert(query,"character")
assert(type,"character")
assert(subtype,"character")
assert(license,"character")
assert(keyword,"character")
assert(publishingOrg,"character")
assert(hostingOrg,"character")
assert(endorsingNodeKey,"character")
assert(publishingCountry,"character")
assert(projectId,"character")
assert(hostingCountry,"character")
assert(networkKey,"character")
assert(doi,"character")

# args with single value
args <- rgbif_compact(list(
format = "TSV",
q = query
))

args <- rgbif_compact(c(
args,
convmany(type),
convmany(subtype),
convmany(license),
convmany(keyword),
convmany(publishingOrg),
convmany(hostingOrg),
convmany(endorsingNodeKey),
convmany(decade),
convmany(publishingCountry),
convmany(projectId),
convmany(hostingCountry),
convmany(networkKey),
convmany(doi)
))

url_query <- paste0(names(args),"=",args,collapse="&")
url <- paste0(gbif_base(),"/dataset/search/export?",url_query)
url <- gsub("\\[|\\]","",url)
url <- utils::URLencode(url)
temp_file <- tempfile()
utils::download.file(url,destfile=temp_file,quiet=TRUE)
out <- tibble::as_tibble(data.table::fread(temp_file, showProgress=FALSE))
colnames(out) <- to_camel(colnames(out))
out[] <- lapply(out, as.character)
out$occurrenceRecordsCount <- as.numeric(out$occurrenceRecordsCount)
out$nameUsagesCount <- as.numeric(out$nameUsagesCount)
out
}
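
Unlike the paged search functions, dataset_export() downloads the full TSV export in one call; a short sketch (the filter values here are illustrative):

# every CHECKLIST dataset published from Denmark, as a single tibble, no paging
cl <- dataset_export(type = "CHECKLIST", publishingCountry = "DK")
# only the two count columns come back numeric; all other columns are character
summary(cl$occurrenceRecordsCount)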
52 changes: 52 additions & 0 deletions R/dataset_list_funs.R
@@ -0,0 +1,52 @@
#' List datasets that are duplicates or have no endpoint.
#'
#' @param limit Controls the number of results in the page.
#' @param start Determines the start for the search results.
#' @param curlopts options passed on to [crul::HttpClient].
#'
#' @return A `list`.
#'
#' @details
#' Get a list of duplicate datasets or datasets with no endpoint. You get the
#' full results back, and no parameters aside from `limit` and `start` are
#' accepted.
#'
#'
#' @examples \dontrun{
#' dataset_noendpoint(limit=3)
#' }
#' @name dataset_list_funs
#' @export
dataset_duplicate <- function(limit=20,start=NULL,curlopts=list()) {
dataset_list_get_(endpoint="duplicate/",limit=limit,start=start,
curlopts=curlopts,meta=TRUE)
}

#' @name dataset_list_funs
#' @export
dataset_noendpoint <- function(limit=20,start=NULL,curlopts=list()) {
dataset_list_get_(endpoint="withNoEndpoint/",limit=limit,start=start,
curlopts=curlopts,meta=TRUE)
}

dataset_list_get_ <- function(endpoint,limit=NULL,start=NULL,curlopts,meta) {
url <- paste0(gbif_base(),"/dataset/",endpoint)
if(!is.null(limit)) {
args <- rgbif_compact(c(limit=limit,offset=start))
tt <- gbif_GET(url, args, TRUE, curlopts)
} else {
tt <- gbif_GET(url, args = NULL, TRUE, curlopts)
}
if(meta) {
meta <- tt[c('offset','limit','endOfRecords','count')]
if (length(tt$results) == 0) {
out <- NULL
} else {
out <- tibble::as_tibble(tt$results)
}
list(meta = data.frame(meta), data = out)
} else {
tibble::as_tibble(tt)
}
}
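
Both helpers share the paging shape of dataset_list_get_(); a minimal sketch:

dup <- dataset_duplicate(limit = 3)
dup$meta   # offset, limit, endOfRecords, count
noep <- dataset_noendpoint(limit = 3, start = 3)  # second page of three
noep$data  # tibble of datasets with no endpoint, or NULL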
28 changes: 0 additions & 28 deletions R/dataset_metrics.r

This file was deleted.
