-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
dataset interface improvements (#693)
* adding dataset_doi * adding support for dataset_export and adding search terms to dataset_search * fixing some tests * ensure proper column types for dataset_export * updating dataset_suggest * deleting unecessary template * adding tests for dataset_suggest * adding support for dataset endpoint * adding support for datatset_list_funs * updating docs * fixing tests * deprecating datasets --------- Co-authored-by: John Waller <[email protected]>
- Loading branch information
Showing
45 changed files
with
17,523 additions
and
19,089 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
#' Search for more obscure dataset metadata.
#'
#' @param country The 2-letter country code (as per ISO-3166-1) of the country
#' publishing the dataset.
#' @param type The primary type of the dataset.
#' Available values : OCCURRENCE, CHECKLIST, METADATA, SAMPLING_EVENT,
#' MATERIAL_ENTITY.
#' @param identifierType An identifier type for the identifier parameter.
#' Available values : URL, LSID, HANDLER, DOI, UUID, FTP, URI, UNKNOWN,
#' GBIF_PORTAL, GBIF_NODE, GBIF_PARTICIPANT, GRSCICOLL_ID, GRSCICOLL_URI,
#' IH_IRN, ROR, GRID, CITES, SYMBIOTA_UUID, WIKIDATA, NCBI_BIOCOLLECTION.
#' @param identifier An identifier of the type given by the identifierType
#' parameter.
#' @param machineTagNamespace Filters for entities with a machine tag in the
#' specified namespace.
#' @param machineTagName Filters for entities with a machine tag with the
#' specified name (use in combination with the machineTagNamespace parameter).
#' @param machineTagValue Filters for entities with a machine tag with the
#' specified value (use in combination with the machineTagNamespace and
#' machineTagName parameters).
#' @param modified The modified date of the dataset. Accepts ranges, and a `*`
#' can be used as a wildcard, e.g. `modified=2023-04-01,*`.
#' @param query Simple full text search parameter. The value for this parameter
#' can be a simple word or a phrase. Wildcards are not supported.
#' @param deleted Logical specifying whether to return only deleted datasets.
#' @param limit Controls the number of results in the page.
#' @param start Determines the start for the search results.
#' @param curlopts options passed on to [crul::HttpClient].
#'
#' @return A `list` with two elements: `meta` (a `data.frame` of paging
#' metadata) and `data` (a tibble of results, or `NULL` when there are none).
#'
#' @details
#' This function allows you to search for some more obscure dataset metadata
#' that might not be possible with `dataset_search()`. For example, searching
#' through registry machinetags.
#'
#' @export
#'
#' @examples \dontrun{
#' dataset(limit=3)
#' dataset(country="US",limit=3)
#' dataset(type="CHECKLIST",limit=3)
#' dataset(identifierType = "URL",limit=3)
#' dataset(identifier = 168,limit=3)
#' dataset(machineTagNamespace = "metasync.gbif.org",limit=3)
#' dataset(machineTagName = "datasetTitle",limit=3)
#' dataset(machineTagValue = "Borkhart",limit=3)
#' dataset(modified = "2023-04-01", limit=3)
#' dataset(query = "dog", limit=3)
#' dataset(deleted=TRUE,limit=3)
#' }
dataset <- function(country = NULL,
                    type = NULL,
                    identifierType = NULL,
                    identifier = NULL,
                    machineTagNamespace = NULL,
                    machineTagName = NULL,
                    machineTagValue = NULL,
                    modified = NULL,
                    query = NULL,
                    deleted = FALSE,
                    limit = NULL,
                    start = NULL,
                    curlopts = list()) {

  assert(country, "character")
  assert(type, "character")
  assert(identifierType, "character")
  assert(machineTagNamespace, "character")
  assert(machineTagName, "character")
  assert(machineTagValue, "character")
  assert(modified, "character")
  assert(query, "character")
  assert(deleted, "logical")

  # single-valued query/paging args
  args <- as.list(
    rgbif_compact(c(q = query,
                    limit = limit,
                    offset = start
    )))

  # args that may take multiple values
  args <- as.list(
    rgbif_compact(c(
      args,
      convmany(country),
      convmany(type),
      convmany(identifierType),
      convmany(identifier),
      convmany(machineTagNamespace),
      convmany(machineTagName),
      convmany(machineTagValue),
      convmany(modified)
    )))

  # deleted datasets live under a separate registry endpoint
  if (deleted) {
    url <- paste0(gbif_base(), '/dataset/deleted/')
  } else {
    url <- paste0(gbif_base(), '/dataset/')
  }
  tt <- gbif_GET(url, args, FALSE, curlopts)

  meta <- tt[c('offset', 'limit', 'endOfRecords', 'count')]

  if (length(tt$results) == 0) {
    out <- NULL
  } else {
    # wrap multi-valued fields in a list so each one fits in a single
    # tibble cell (plain if/else: ifelse() on a scalar is fragile here)
    nest_if_needed <- function(x) if (length(x) > 1) list(x) else x
    out <- lapply(tt$results, function(x) {
      tibble::as_tibble(lapply(x, nest_if_needed))
    })
    out <- bind_rows(out)
  }

  list(meta = data.frame(meta), data = out)
}
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#' Get a GBIF dataset from a doi
#'
#' @param doi the DOI of the dataset you wish to look up.
#' @param limit Controls the number of results in the page.
#' @param start Determines the offset for the search results.
#' @param curlopts options passed on to [crul::HttpClient].
#'
#' @details This function allows for dataset lookup using a DOI. Be aware that
#' some DOIs have more than one dataset associated with them.
#'
#' @return A `list`.
#' @export
#'
#' @examples \dontrun{
#' dataset_doi('10.15468/igasai')
#' }
dataset_doi <- function(doi = NULL, limit = 20, start = NULL, curlopts = list()) {
  # fail early with a clear message: a NULL doi passes assert() but would
  # make the grepl() check below return logical(0) and crash the if()
  if (is.null(doi)) stop("You must supply a doi.", call. = FALSE)
  assert(doi, "character")
  # loose syntactic DOI check; warn rather than error since the registry
  # is authoritative about what actually resolves
  is_doi <- grepl("^(10\\.\\d{4,9}/[-._;()/:A-Z0-9]+)$", doi, perl = TRUE,
                  ignore.case = TRUE)
  if (!is_doi) warning("The doi you supplied might not be valid.")
  url <- paste0(gbif_base(), '/dataset/doi/', doi)
  args <- rgbif_compact(list(limit = as.integer(limit),
                             offset = start))
  res <- gbif_GET(url, args, TRUE, curlopts)
  # the former structure() wrapper added no attributes; return the list directly
  list(meta = get_meta(res), data = parse_results(res, NULL))
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#' @name dataset_search
#' @export
dataset_export <- function(query = NULL,
                           type = NULL,
                           publishingCountry = NULL,
                           subtype = NULL,
                           license = NULL,
                           keyword = NULL,
                           publishingOrg = NULL,
                           hostingOrg = NULL,
                           endorsingNodeKey = NULL,
                           decade = NULL,
                           projectId = NULL,
                           hostingCountry = NULL,
                           networkKey = NULL,
                           doi = NULL
                           ) {

  assert(query, "character")
  assert(type, "character")
  assert(subtype, "character")
  assert(license, "character")
  assert(keyword, "character")
  assert(publishingOrg, "character")
  assert(hostingOrg, "character")
  assert(endorsingNodeKey, "character")
  assert(publishingCountry, "character")
  assert(projectId, "character")
  assert(hostingCountry, "character")
  assert(networkKey, "character")
  assert(doi, "character")

  # args with single value
  args <- rgbif_compact(list(
    format = "TSV",
    q = query
  ))

  # args that may take multiple values
  args <- rgbif_compact(c(
    args,
    convmany(type),
    convmany(subtype),
    convmany(license),
    convmany(keyword),
    convmany(publishingOrg),
    convmany(hostingOrg),
    convmany(endorsingNodeKey),
    convmany(decade),
    convmany(publishingCountry),
    convmany(projectId),
    convmany(hostingCountry),
    convmany(networkKey),
    convmany(doi)
  ))

  # build the export URL by hand; brackets from multi-value args are stripped
  url_query <- paste0(names(args), "=", args, collapse = "&")
  url <- paste0(gbif_base(), "/dataset/search/export?", url_query)
  url <- gsub("\\[|\\]", "", url)
  url <- utils::URLencode(url)
  temp_file <- tempfile()
  # clean up the downloaded TSV even if fread() errors (was leaked before)
  on.exit(unlink(temp_file), add = TRUE)
  utils::download.file(url, destfile = temp_file, quiet = TRUE)
  out <- tibble::as_tibble(data.table::fread(temp_file, showProgress = FALSE))
  colnames(out) <- to_camel(colnames(out))
  # normalize every column to character, then restore the two known counts
  # to numeric so column types are stable across exports
  out[] <- lapply(out, as.character)
  out$occurrenceRecordsCount <- as.numeric(out$occurrenceRecordsCount)
  out$nameUsagesCount <- as.numeric(out$nameUsagesCount)
  out
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#' List datasets that are deleted or have no endpoint. | ||
#' | ||
#' | ||
#' @param limit Controls the number of results in the page. | ||
#' @param start Determines the start for the search results. | ||
#' @param curlopts options passed on to [crul::HttpClient]. | ||
#' | ||
#' @return A `list`. | ||
#' | ||
#' @details | ||
#' Get a list of deleted datasets or datasets with no endpoint. You get the
#' full list back; no parameters aside from `limit` and `start` are accepted.
#' | ||
#' | ||
#' @examples \dontrun{ | ||
#' dataset_noendpoint(limit=3) | ||
#' } | ||
|
||
#' @name dataset_list_funs
#' @export
dataset_duplicate <- function(limit = 20, start = NULL, curlopts = list()) {
  # datasets flagged as duplicates by the registry
  dataset_list_get_(endpoint = "duplicate/",
                    limit = limit,
                    start = start,
                    curlopts = curlopts,
                    meta = TRUE)
}
|
||
#' @name dataset_list_funs
#' @export
dataset_noendpoint <- function(limit = 20, start = NULL, curlopts = list()) {
  # datasets registered without any endpoint
  dataset_list_get_(endpoint = "withNoEndpoint/",
                    limit = limit,
                    start = start,
                    curlopts = curlopts,
                    meta = TRUE)
}
|
||
# Shared GET helper for the dataset list endpoints (duplicate, withNoEndpoint,
# ...). Returns list(meta, data) when `meta` is TRUE, else a bare tibble.
dataset_list_get_ <- function(endpoint, limit = NULL, start = NULL, curlopts, meta) {
  url <- paste0(gbif_base(), "/dataset/", endpoint)
  # build paging args in one place; previously `start` was silently dropped
  # whenever `limit` was NULL
  args <- rgbif_compact(c(limit = limit, offset = start))
  if (length(args) == 0) args <- NULL
  tt <- gbif_GET(url, args, TRUE, curlopts)
  if (meta) {
    meta <- tt[c('offset', 'limit', 'endOfRecords', 'count')]
    if (length(tt$results) == 0) {
      out <- NULL
    } else {
      out <- tibble::as_tibble(tt$results)
    }
    list(meta = data.frame(meta), data = out)
  } else {
    tibble::as_tibble(tt)
  }
}
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.