diff --git a/NAMESPACE b/NAMESPACE index 33e0412..98c7e65 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,7 @@ export(filter_media) export(filter_observations) export(locations) export(media) +export(merge_camtrapdp) export(observations) export(read_camtrapdp) export(round_coordinates) diff --git a/NEWS.md b/NEWS.md index 4960dc5..5afc36f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # camtrapdp (development version) * New function `write_camtrapdp()` writes a Camera Trap Data Package to disk as a `datapackage.json` and CSV files (#137). +* New function `merge_camtrapdp()` allows to merge two datasets (#112). * New function `write_eml()` transforms Camtrap DP metadata to EML (#99). * New function `round_coordinates()` allows to fuzzy/generalize location information by rounding deployment `latitude` and `longitude`. It also updates `coordinateUncertainty` in the deployments and `coordinatePrecision` and spatial scope in the metadata (#106). * New function `shift_time()` allows to shift/correct date-times in data and metadata for specified deploymentIDs and duration (#108). diff --git a/R/merge_camtrapdp.R b/R/merge_camtrapdp.R new file mode 100644 index 0000000..3aa499d --- /dev/null +++ b/R/merge_camtrapdp.R @@ -0,0 +1,159 @@ +#' Merge Camera Trap Data packages +#' +#' @param x,y Camera Trap Data Package objects (as returned by +#' `read_camtrapdp()`), to be coerced to one. +#' @param prefix If there are duplicate IDs between x and y, prefixes will be +#' added to all the values of each identifier with duplicates, to disambiguate +#' them. Should be a character vector of length 2. By default, the prefixes are +#' the ID's of the Data Package. +#' @return `xy_merged` Merged Camera Trap Data Package +#' @family transformation functions +#' @export +#' @section Merging details: +#' Deployments, media and observations are combined. If there are duplicate IDs +#' between x and y, prefixes will be added to all the values of each identifier +#' with duplicates, to disambiguate them. +#' Additional resources are added, but not combined. If additional resources +#' have the same name, prefixes will be added to the resource name. +#' The following properties are set: +#' - **name**: Set to NA. +#' - **id**: Set to NULL. +#' - **created**: Set to current timestamp. +#' - **title**: Set to NA. +#' - **contributors**: A combination is made and duplicates are removed. +#' - **description**: A combination is made. +#' - **version**: Set to 1.0. +#' - **keywords**: A combination is made and duplicates are removed. +#' - **image**: Set to NULL. +#' - **homepage**: Set to NULL. +#' - **sources**: A combination is made and duplicates are removed. +#' - **licenses**: A combination is made and duplicates are removed. +#' - **bibliographicCitation**: Set to NULL. +#' - **project**: List of the projects. +#' - **coordinatePrecision**: Set to the least precise `coordinatePrecision`. +#' - **spatial**: Reset based on the new deployments. +#' - **temporal**: Reset based on the new deployments. +#' - **taxonomic**: A combination is made and duplicates are removed. +#' - **relatedIdentifiers**: A combination is made and duplicates are removed. +#' - **references**: A combination is made and duplicates are removed. +#' @section Merging multiple Camera Trap Data Packages: +#' `merge_camtrapdp()` can be used in a pipe to merge multiple camtrap DP. +#' - x %>% merge_camtrapdp(y) %>% merge_camtrapdp(z) +#' @examples +#' x <- example_dataset() %>% +#' filter_deployments(deploymentID %in% c("00a2c20d", "29b7d356")) +#' y <- example_dataset() %>% +#' filter_deployments(deploymentID %in% c("577b543a", "62c200a9")) +#' x$id <- "1" +#' y$id <- "2" +#' xy_merged <- merge_camtrapdp(x, y) +merge_camtrapdp <- function(x, y, prefix = c(x$id, y$id)) { + check_camtrapdp(x) + check_camtrapdp(y) + + if (!is.null(x$id) & !is.null(y$id)) { + if (x$id == y$id) { + cli::cli_abort( + c( + paste0( + "{.arg x} and {.arg y} should be different Camera Trap Data", + "Package objects with unique identifiers." + ), + x = "{.arg x} and {.arg y} have the same id: {.value x$id}" + ), + class = "camtrapdp_error_camtrapdpid_duplicated" + ) + } + } + + # check if identifiers have duplicates + results_duplicate_ids <- check_duplicate_ids(x, y) + + # Add prefix to identifiers with duplicates + if (TRUE %in% results_duplicate_ids) { + + if (!is.character(prefix) || length(prefix) != 2) { + cli::cli_abort( + c( + paste( + "{.arg prefix} must be a character vector of length 2, not", + "a {class(prefix)} object of length {length(prefix)}." + ) + ), + class = "camtrapdp_error_prefix_invalid" + ) + } + + if (any(is.na(prefix))) { + cli::cli_abort( + "{.arg prefix} can't be 'NA'.", + class = "camtrapdp_error_prefix_NA" + ) + } + + x <- add_prefix(x, results_duplicate_ids, paste0(prefix[1], "_")) + y <- add_prefix(y, results_duplicate_ids, paste0(prefix[2], "_")) + } + + # Merge Camera Trap DP resources + xy_merged <- x + deployments(xy_merged) <- dplyr::bind_rows(deployments(x), deployments(y)) + media(xy_merged) <- dplyr::bind_rows(media(x), media(y)) + observations(xy_merged) <- dplyr::bind_rows(observations(x), observations(y)) + + # Merge additional resources + xy_merged <- merge_additional_resources(xy_merged, x, y, prefix) + + # Merge/update metadata + xy_merged$name <- NA + xy_merged$id <- NULL + xy_merged$created <- format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ") + xy_merged$title <- NA + xy_merged$contributors <- remove_duplicates(c(x$contributors, y$contributors)) + xy_merged$description <- paste(x$description, y$description, sep = "/n") + xy_merged$version <- "1.0" + xy_merged$keywords <- unique(c(x$keywords, y$keywords)) + xy_merged$image <- NULL + xy_merged$homepage <- NULL + xy_merged$sources <- remove_duplicates(c(x$sources, y$sources)) + xy_merged$licenses <- remove_duplicates(c(x$licenses, y$licenses)) + xy_merged$project <- list(x$project, y$project) + xy_merged$bibliographicCitation <- NULL + xy_merged$coordinatePrecision <- + max(x$coordinatePrecision, y$coordinatePrecision, na.rm = TRUE) + + if (!is.null(x$id)) { + relatedIdentifiers_x <- list( + relationType = "IsDerivedFrom", + relatedIdentifier = as.character(x$id), + resourceTypeGeneral = "Data package", + relatedIdentifierType = "id" + ) + } else { + relatedIdentifiers_x <- list() + } + if (!is.null(y$id)) { + relatedIdentifiers_y <- list( + relationType = "IsDerivedFrom", + relatedIdentifier = as.character(y$id), + resourceTypeGeneral = "Data package", + relatedIdentifierType = "id" + ) + } else { + relatedIdentifiers_y <- list() + } + new_relatedIdentifiers <- list(relatedIdentifiers_x, relatedIdentifiers_y) + xy_merged$relatedIdentifiers <- remove_duplicates( + c(x$relatedIdentifiers, y$relatedIdentifiers, new_relatedIdentifiers) + ) + + xy_merged$references <- unique(c(x$references, y$references)) + xy_merged$directory <- "." + + xy_merged <- xy_merged %>% + update_spatial() %>% + update_temporal() %>% + update_taxonomic() + + return(xy_merged) +} diff --git a/R/taxa.R b/R/taxa.R index 156dec9..8288e0e 100644 --- a/R/taxa.R +++ b/R/taxa.R @@ -23,7 +23,7 @@ taxa <- function(x) { dplyr::select("scientificName", dplyr::starts_with("taxon.")) %>% dplyr::distinct() %>% dplyr::rename_with(~ sub("^taxon.", "", .x)) %>% - dplyr::arrange(scientificName) + dplyr::arrange(.data$scientificName) # Remove duplicates without taxonID if ("taxonID" %in% names(taxa)) { diff --git a/R/taxonomic.R b/R/taxonomic.R index 1a23ae0..2829467 100644 --- a/R/taxonomic.R +++ b/R/taxonomic.R @@ -15,6 +15,9 @@ taxonomic <- function(x) { return(NULL) } + # Replace NULL with NA + taxonomic_list <- replace_null_recursive(taxonomic_list) + # Convert list into a data.frame taxa <- purrr::map( diff --git a/R/utils.R b/R/utils.R index 246af43..415351f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -37,7 +37,355 @@ expand_cols <- function(df, colnames) { return(df) } -#' Creates list of contributors in EML format +#' Check for duplicated IDs +#' +#' Checks for duplicated IDs (deploymentID, mediaID, observationID and eventID) +#' in two Camera Trap Data Package objects combined. +#' +#' @param x,y Camera Trap Data Package objects (as returned by +#' `read_camtrapdp()`), to be coerced to one. +#' @return List with logical for each type of ID, that indicates whether that +#' ID type has duplicates between x and y. +#' @family helper functions +#' @noRd +check_duplicate_ids <- function(x, y) { + result = list( + deploymentID = FALSE, mediaID = FALSE, observationID = FALSE, + eventID = FALSE) + + deploymentIDs <- c( + unique(purrr::pluck(deployments(x), "deploymentID")), + unique(purrr::pluck(deployments(y), "deploymentID")) + ) + mediaIDs <- c( + unique(purrr::pluck(media(x), "mediaID")), + unique(purrr::pluck(media(y), "mediaID")) + ) + observationIDs <- c( + unique(purrr::pluck(observations(x), "observationID")), + unique(purrr::pluck(observations(y), "observationID")) + ) + eventIDs <- c( + unique(purrr::pluck(media(x), "eventID")), + unique(purrr::pluck(media(y), "eventID")) + ) + + # Check for duplicates + if (any(duplicated(deploymentIDs))) {result$deploymentID <- TRUE} + if (any(duplicated(mediaIDs))) {result$mediaID <- TRUE} + if (any(duplicated(observationIDs))) {result$observationID <- TRUE} + if (any(duplicated(eventIDs))) {result$eventID <- TRUE} + + return(result) +} + +#' Add prefix to identifiers with duplicates +#' +#' Adds prefix to all values of each identifier (deploymentID, mediaID, +#' observationID and eventID) that has duplicates. +#' +#' @inheritParams print.camtrapdp +#' @param prefix The prefix to add to the IDs. +#' @param results_duplicate_ids Output generated with `check_duplicate_ids()`. +#' List with logical for each type of ID, that indicates whether that ID type +#' has duplicates. +#' @return `x` +#' @family helper functions +#' @noRd +#' @examples +#' results_duplicate_ids <- list(deploymentID = TRUE, mediaID = TRUE, +#' observationID = TRUE, eventID = TRUE) +#' x <- add_prefix(example_dataset(), results_duplicate_ids, prefix = ".x") +add_prefix <- function(x, results_duplicate_ids, prefix) { + + # deploymentID + if (results_duplicate_ids$deploymentID) { + # Add prefix to deploymentIDs in deployments + deployments(x) <- + deployments(x) %>% + dplyr::mutate(deploymentID = paste0(prefix, .data$deploymentID)) + + # Add prefix to deploymentIDs in observations + observations(x) <- + observations(x) %>% + dplyr::mutate(deploymentID = paste0(prefix, .data$deploymentID)) + + # Add prefix to deploymentIDs in media + media(x) <- + media(x) %>% + dplyr::mutate(deploymentID = paste0(prefix, .data$deploymentID)) + } + + # mediaID + if (results_duplicate_ids$mediaID) { + # Add prefix to mediaIDs in media + media(x) <- + media(x) %>% + dplyr::mutate( + mediaID = ifelse( + !is.na(.data$mediaID), paste0(prefix, .data$mediaID), NA + ) + ) + + # Add prefix to mediaIDs in observations + observations(x) <- + observations(x) %>% + dplyr::mutate( + mediaID = ifelse( + !is.na(.data$mediaID), paste0(prefix, .data$mediaID), NA + ) + ) + } + + # observationID + if (results_duplicate_ids$observationID) { + # Add prefix to observationIDs in observations + observations(x) <- + observations(x) %>% + dplyr::mutate(observationID = paste0(prefix, .data$observationID)) + } + + # eventID + if (results_duplicate_ids$eventID) { + # Add prefix to eventIDs in media + media(x) <- + media(x) %>% + dplyr::mutate( + eventID = ifelse( + !is.na(.data$eventID), paste0(prefix, .data$eventID), NA + ) + ) + + # Add prefix to eventIDs in observations + observations(x) <- + observations(x) %>% + dplyr::mutate( + eventID = ifelse( + !is.na(.data$eventID), paste0(prefix, .data$eventID), NA + ) + ) + } + + return(x) +} + +#' Merge additional resources +#' +#' Merges resources that are different from the required Camera Trap Data +#' Package resources (deployments, media and observations). Resources with the +#' same name are not combined, but prefixes are added to the resource names. +#' +#' @param xy_merged Merged Camera Trap Data Package +#' @inheritParams merge_camtrapdp +#' +#' @return `xy_merged` Merged Camera Trap Data Package +#' @family helper functions +#' @noRd +merge_additional_resources <- function(xy_merged, x, y, prefix) { + camtrapdp_resources <- c("deployments", "media", "observations") + x_resource_names <- frictionless::resources(x) + y_resource_names <- frictionless::resources(y) + x_additional_resources <- + x_resource_names[!x_resource_names %in% camtrapdp_resources] + y_additional_resources <- + y_resource_names[!y_resource_names %in% camtrapdp_resources] + + all_additional_resources <- c(x_additional_resources, y_additional_resources) + + if (length(all_additional_resources) > 0) { + duplicated_resources <- duplicated(all_additional_resources) + duplicated_names <- all_additional_resources[duplicated_resources] + + # Add prefixes to resource names that are not unique + if (any(duplicated_resources)) { + purrr::map(duplicated_names, function(duplicated_name) { + xy_index <- + which(purrr::map(xy_merged$resources, "name") == duplicated_name) + y_index <- which(purrr::map(y$resources, "name") == duplicated_name) + xy_merged$resources[[xy_index]]$name <- + paste0(prefix[1], "_", duplicated_name) + y$resources[[y_index]]$name <- paste0(prefix[2], "_", duplicated_name) + xy_merged$resources <<- append(xy_merged$resources, y$resources[y_index]) + }) + } + + # Add unique resources from y + y_unique_resources <- + y_additional_resources[!y_additional_resources %in% duplicated_names] + purrr::map(y_unique_resources, function(resource_name) { + index <- which(purrr::map(y$resources, "name") == resource_name) + resource <- y$resources[index] + xy_merged$resources <<- append(xy_merged$resources, resource) + }) + } + + return(xy_merged) +} + +#' Normalize list elements +#' +#' Converts each list element to a named vector with consistent handling of +#' missing values (NA), using determined `unique_names`. +#' +#' @param data_list list to be normalized. +#' @param unique_names the names that the list must have. +#' @return named vector with all `unique_names` present. +#' @family helper functions +#' @noRd +#' @examples +#' data_list <- list( +#' title = "Peter Desmet", +#' email = "peter.desmet@inbo.be", +#' organization = "Research Institute for Nature and Forest (INBO)" +#' ) +#' unique_names <- c("title", "email", "path", "role", "organization") +#' normalize_list(data_list, unique_names) +normalize_list <- function(data_list, unique_names) { + vector <- purrr::map_vec( + unique_names, + ~ ifelse(!is.null(data_list[[.x]]), data_list[[.x]], NA) + ) + names(vector) <- unique_names + return(vector) +} + +#' Check if one element is equal to or a subset of another and vice versa +#' +#' @param element1,element2 elements to compare. +#' @return logical. +#' @family helper functions +#' @noRd +#' @examples +#' element1 <- list( +#' title = "Peter Desmet", +#' email = "peter.desmet@inbo.be", +#' organization = "Research Institute for Nature and Forest (INBO)" +#' ) +#' element2 <- list( +#' title = "Peter Desmet", +#' email = "peter.desmet@inbo.be", +#' path = "https://orcid.org/0000-0002-8442-8025", +#' role = "principalInvestigator", +#' organization = "Research Institute for Nature and Forest (INBO)" +#' ) +#' is_subset(element1, element2) +is_subset <- function(element1, element2) { + all( + purrr::map_vec(names(element1), function(field) { + if (is.na(element1[[field]])) { + TRUE + } else if (is.na(element2[[field]])) { + TRUE + } else { + element1[[field]] == element2[[field]] + } + }) + ) +} + +#' Update a list of unique elements +#' +#' Updates a list of unique elements by adding a new element if it is not a +#' subset of any existing element in the list. It also removes any elements that +#' are subsets of the new element. +#' +#' @param unique_data A list of elements. Each element must be a vector or +#' list. +#' @param current_element A vector or list representing the current element to +#' be added to the list. +#' @return `unique_data`, a list of unique elements updated with the current +#' element, ensuring no element is a subset of another. +#' @family helper functions +#' @noRd +#' @examples +#' unique_data <- list(c(1, 2, 3), c(4, 5), c(1, 2, 3, 4, 5)) +#' current_element <- c(2, 3) +#' update_unique(unique_data, current_element) +update_unique <- function(unique_data, current_element) { + # Check if current element is already a subset of any element in unique_data + is_already_present <- + any( + purrr::map_lgl(unique_data, ~ is_subset(current_element, .x)) + ) + if (!is_already_present) { + # Remove subsets from unique_data + subsets_to_remove <- + purrr::map_lgl(unique_data, ~ is_subset(.x, current_element)) + unique_data <- + unique_data[!subsets_to_remove] %>% + c(list(current_element)) + } + return(unique_data) +} + +#' Remove duplicates and subsets +#' +#' Removes duplicate and subset elements from a list of lists. Elements are +#' considered subsets if all their non-NA fields match. +#' +#' @param data_list List of lists, where each inner list represents an element +#' with named fields. +#' @return List of lists with duplicates and subsets removed. +#' @family helper functions +#' @noRd +#' @examples +#' data_list <- list( +#' list( +#' title = "Axel Neukermans", +#' email = "axel.neukermans@inbo.be", +#' path = "https://orcid.org/0000-0003-0272-9180", +#' role = "contributor", +#' organization = "Research Institute for Nature and Forest (INBO)" +#' ), +#' list( +#' title = "Peter Desmet", +#' email = "peter.desmet@inbo.be", +#' path = "https://orcid.org/0000-0002-8442-8025", +#' role = "principalInvestigator", +#' organization = "Research Institute for Nature and Forest (INBO)" +#' ), +#' list( +#' title = "Research Institute for Nature and Forest (INBO)", +#' path = "https://inbo.be", +#' role = "rightsHolder" +#' ), +#' list( +#' title = "Peter Desmet", +#' email = "peter.desmet@inbo.be", +#' organization = "Research Institute for Nature and Forest (INBO)" +#' ), +#' list( +#' title = "Research Institute for Nature and Forest (INBO)", +#' path = "https://inbo.be", +#' role = "rightsHolder" +#' ) +#' ) +#' remove_duplicates(data_list) +remove_duplicates <- function(data_list) { + # Find all unique field names + unique_names <- + purrr::map(data_list, names) %>% + unlist() %>% + unique() + + # Normalize all elements + normalized_data <- + purrr::map(data_list, ~ normalize_list(.x, unique_names)) + + # Reduce the list to unique elements using update_unique() + unique_data <- Reduce(update_unique, normalized_data, init = list()) + + # Convert back to original list format and remove NA's + unique_data_list <- + purrr::map(unique_data, function(x) { + x <- as.list(x) + x[!sapply(x, is.na)] + }) + + return(unique_data_list) +} + +#' Create list of contributors in EML format #' #' @param contributor_list List of contributors #' @return List of contributors as emld responsibleParty objects. @@ -57,3 +405,23 @@ create_eml_contributors <- function(contributor_list) { onlineUrl = .$path )) } + +#' Replace NULL values recursively +#' +#' Replaces NULL values with NA by recursively iterating through each element of +#' the input list. +#' +#' @param x A nested list. +#' @return A nested list identical to the input x, but with all NULL values +#' replaced by NA. +#' @family helper functions +#' @noRd +replace_null_recursive <- function(x) { + purrr::map(x, function(element) { + if (is.list(element) && !is.null(element)) { + replace_null_recursive(element) + } else { + ifelse(is.null(element), NA, element) + } + }) +} diff --git a/man/merge_camtrapdp.Rd b/man/merge_camtrapdp.Rd new file mode 100644 index 0000000..6763591 --- /dev/null +++ b/man/merge_camtrapdp.Rd @@ -0,0 +1,80 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/merge_camtrapdp.R +\name{merge_camtrapdp} +\alias{merge_camtrapdp} +\title{Merge Camera Trap Data packages} +\usage{ +merge_camtrapdp(x, y, prefix = c(x$id, y$id)) +} +\arguments{ +\item{x, y}{Camera Trap Data Package objects (as returned by +\code{read_camtrapdp()}), to be coerced to one.} + +\item{prefix}{If there are duplicate IDs between x and y, prefixes will be +added to all the values of each identifier with duplicates, to disambiguate +them. Should be a character vector of length 2. By default, the prefixes are +the ID's of the Data Package.} +} +\value{ +\code{xy_merged} Merged Camera Trap Data Package +} +\description{ +Merge Camera Trap Data packages +} +\section{Merging details}{ + +Deployments, media and observations are combined. If there are duplicate IDs +between x and y, prefixes will be added to all the values of each identifier +with duplicates, to disambiguate them. +Additional resources are added, but not combined. If additional resources +have the same name, prefixes will be added to the resource name. +The following properties are set: +\itemize{ +\item \strong{name}: Set to NA. +\item \strong{id}: Set to NULL. +\item \strong{created}: Set to current timestamp. +\item \strong{title}: Set to NA. +\item \strong{contributors}: A combination is made and duplicates are removed. +\item \strong{description}: A combination is made. +\item \strong{version}: Set to 1.0. +\item \strong{keywords}: A combination is made and duplicates are removed. +\item \strong{image}: Set to NULL. +\item \strong{homepage}: Set to NULL. +\item \strong{sources}: A combination is made and duplicates are removed. +\item \strong{licenses}: A combination is made and duplicates are removed. +\item \strong{bibliographicCitation}: Set to NULL. +\item \strong{project}: List of the projects. +\item \strong{coordinatePrecision}: Set to the least precise \code{coordinatePrecision}. +\item \strong{spatial}: Reset based on the new deployments. +\item \strong{temporal}: Reset based on the new deployments. +\item \strong{taxonomic}: A combination is made and duplicates are removed. +\item \strong{relatedIdentifiers}: A combination is made and duplicates are removed. +\item \strong{references}: A combination is made and duplicates are removed. +} +} + +\section{Merging multiple Camera Trap Data Packages}{ + +\code{merge_camtrapdp()} can be used in a pipe to merge multiple camtrap DP. +\itemize{ +\item x \%>\% merge_camtrapdp(y) \%>\% merge_camtrapdp(z) +} +} + +\examples{ +x <- example_dataset() \%>\% + filter_deployments(deploymentID \%in\% c("00a2c20d", "29b7d356")) +y <- example_dataset() \%>\% + filter_deployments(deploymentID \%in\% c("577b543a", "62c200a9")) +x$id <- "1" +y$id <- "2" +xy_merged <- merge_camtrapdp(x, y) +} +\seealso{ +Other transformation functions: +\code{\link{round_coordinates}()}, +\code{\link{shift_time}()}, +\code{\link{write_dwc}()}, +\code{\link{write_eml}()} +} +\concept{transformation functions} diff --git a/man/round_coordinates.Rd b/man/round_coordinates.Rd index abf2785..496e6c2 100644 --- a/man/round_coordinates.Rd +++ b/man/round_coordinates.Rd @@ -88,6 +88,7 @@ deployments(x_rounded)[c("latitude", "longitude", "coordinateUncertainty")] } \seealso{ Other transformation functions: +\code{\link{merge_camtrapdp}()}, \code{\link{shift_time}()}, \code{\link{write_dwc}()}, \code{\link{write_eml}()} diff --git a/man/shift_time.Rd b/man/shift_time.Rd index db3d5b1..7aa9e11 100644 --- a/man/shift_time.Rd +++ b/man/shift_time.Rd @@ -50,6 +50,7 @@ deployments(x_shifted)[, c("deploymentID", "deploymentStart", "deploymentEnd")] } \seealso{ Other transformation functions: +\code{\link{merge_camtrapdp}()}, \code{\link{round_coordinates}()}, \code{\link{write_dwc}()}, \code{\link{write_eml}()} diff --git a/man/write_dwc.Rd b/man/write_dwc.Rd index 040a9b4..dadea2d 100644 --- a/man/write_dwc.Rd +++ b/man/write_dwc.Rd @@ -69,6 +69,7 @@ unlink("my_directory", recursive = TRUE) } \seealso{ Other transformation functions: +\code{\link{merge_camtrapdp}()}, \code{\link{round_coordinates}()}, \code{\link{shift_time}()}, \code{\link{write_eml}()} diff --git a/man/write_eml.Rd b/man/write_eml.Rd index e35f857..eead93e 100644 --- a/man/write_eml.Rd +++ b/man/write_eml.Rd @@ -73,6 +73,7 @@ unlink("my_directory", recursive = TRUE) } \seealso{ Other transformation functions: +\code{\link{merge_camtrapdp}()}, \code{\link{round_coordinates}()}, \code{\link{shift_time}()}, \code{\link{write_dwc}()} diff --git a/tests/testthat/test-merge_camtrapdp.R b/tests/testthat/test-merge_camtrapdp.R new file mode 100644 index 0000000..1f5992d --- /dev/null +++ b/tests/testthat/test-merge_camtrapdp.R @@ -0,0 +1,551 @@ +test_that("merge_camtrapdp() returns a valid camtrapdp object", { + skip_if_offline() + x <- example_dataset() + y <- example_dataset() + x$id <- "1" + y$id <- "2" + + expect_no_error(check_camtrapdp(merge_camtrapdp(x, y))) +}) + +test_that("merge_camtrapdp() returns error on duplicate Data Package id", { + skip_if_offline() + x <- example_dataset() + expect_error( + merge_camtrapdp(x, x), + class = "camtrapdp_error_camtrapdpid_duplicated" + ) +}) + +test_that("merge_camtrapdp() returns error on invalid prefix", { + skip_if_offline() + x <- example_dataset() + y <- example_dataset() + x$id <- "1" + y$id <- "2" + expect_error( + merge_camtrapdp(x, y, prefix = c(1, 2)), + class = "camtrapdp_error_prefix_invalid" + ) + expect_error( + merge_camtrapdp(x, y, prefix = c("one", "two", "three")), + class = "camtrapdp_error_prefix_invalid" + ) + expect_error( + merge_camtrapdp(x, y, prefix = c("one", NA)), + class = "camtrapdp_error_prefix_NA" + ) + + expect_no_error(merge_camtrapdp(x, y)) + expect_no_error(merge_camtrapdp(x, y, prefix = c("this", "works"))) + + x$id <- NULL + y$id <- NULL + expect_error( + merge_camtrapdp(x, y), + class = "camtrapdp_error_prefix_invalid" + ) +}) + +test_that("merge_camtrapdp() returns unique deploymentIDs, mediaIDs and + observationIDs", { + skip_if_offline() + x <- example_dataset() %>% + filter_deployments(deploymentID %in% c("00a2c20d", "29b7d356")) + y <- example_dataset() %>% + filter_deployments(deploymentID %in% c("577b543a", "62c200a9")) + x$id <- "1" + y$id <- "2" + xy_merged <- merge_camtrapdp(x, y) + + deploymentIDs <- purrr::pluck(deployments(xy_merged), "deploymentID") + mediaIDs <- purrr::pluck(media(xy_merged), "mediaID") + observationIDs <- purrr::pluck(observations(xy_merged), "observationID") + + expect_false(any(duplicated(deploymentIDs))) + expect_false(any(duplicated(mediaIDs))) + expect_false(any(duplicated(observationIDs))) + + expect_true("00a2c20d" %in% deployments(xy_merged)$deploymentID) + expect_true("577b543a" %in% deployments(xy_merged)$deploymentID) +}) + +test_that("merge_camtrapdp() adds default prefixes to all values of identifiers + (deploymentID, mediaID, observationID and eventID) with duplicates + between packages, but not for mediaID = NA", { + skip_if_offline() + x <- example_dataset() + y <- example_dataset() + x$id <- "1" + y$id <- "2" + xy_merged <- merge_camtrapdp(x, y) + + # deploymentID + expect_true("1_00a2c20d" %in% deployments(xy_merged)$deploymentID) + expect_true("2_00a2c20d" %in% deployments(xy_merged)$deploymentID) + expect_true("1_00a2c20d" %in% media(xy_merged)$deploymentID) + expect_true("2_00a2c20d" %in% media(xy_merged)$deploymentID) + expect_true("1_00a2c20d" %in% observations(xy_merged)$deploymentID) + expect_true("2_00a2c20d" %in% observations(xy_merged)$deploymentID) + + # mediaID + expect_true("1_07840dcc" %in% media(xy_merged)$mediaID) + expect_true("2_07840dcc" %in% media(xy_merged)$mediaID) + expect_true("1_07840dcc" %in% observations(xy_merged)$mediaID) + expect_true("2_07840dcc" %in% observations(xy_merged)$mediaID) + expect_false("1_NA" %in% observations(xy_merged)$mediaID) + expect_true(NA %in% observations(xy_merged)$mediaID) + + # observationID + expect_true("1_705e6036" %in% observations(xy_merged)$observationID) + expect_true("2_705e6036" %in% observations(xy_merged)$observationID) + + # eventID + expect_true("1_4bb69c45" %in% media(xy_merged)$eventID) + expect_true("2_4bb69c45" %in% media(xy_merged)$eventID) + expect_true("1_4bb69c45" %in% observations(xy_merged)$eventID) + expect_true("2_4bb69c45" %in% observations(xy_merged)$eventID) +}) + +test_that("merge_camtrapdp() adds custom prefixes to all values of identifiers + (deploymentID, mediaID, observationID and eventID) with duplicates + between packages, but not for mediaID = NA", { + skip_if_offline() + x <- example_dataset() + y <- example_dataset() + x$id <- NULL + y$id <- NULL + xy_merged <- merge_camtrapdp(x, y, prefix = c("x", "y")) + + # deploymentID + expect_true("x_00a2c20d" %in% deployments(xy_merged)$deploymentID) + expect_true("y_00a2c20d" %in% deployments(xy_merged)$deploymentID) + expect_true("x_00a2c20d" %in% media(xy_merged)$deploymentID) + expect_true("y_00a2c20d" %in% media(xy_merged)$deploymentID) + expect_true("x_00a2c20d" %in% observations(xy_merged)$deploymentID) + expect_true("y_00a2c20d" %in% observations(xy_merged)$deploymentID) + + # mediaID + expect_true("x_07840dcc" %in% media(xy_merged)$mediaID) + expect_true("y_07840dcc" %in% media(xy_merged)$mediaID) + expect_true("x_07840dcc" %in% observations(xy_merged)$mediaID) + expect_true("y_07840dcc" %in% observations(xy_merged)$mediaID) + expect_false("x_NA" %in% observations(xy_merged)$mediaID) + expect_true(NA %in% observations(xy_merged)$mediaID) + + # observationID + expect_true("x_705e6036" %in% observations(xy_merged)$observationID) + expect_true("y_705e6036" %in% observations(xy_merged)$observationID) + + # eventID + expect_true("x_4bb69c45" %in% media(xy_merged)$eventID) + expect_true("y_4bb69c45" %in% media(xy_merged)$eventID) + expect_true("x_4bb69c45" %in% observations(xy_merged)$eventID) + expect_true("y_4bb69c45" %in% observations(xy_merged)$eventID) +}) + +test_that("merge_camtrapdp() adds default prefixes to the names of + additional resources that are not unique and not required by Camera + Trap Data Package standard", { + skip_if_offline() + x <- example_dataset() + y <- example_dataset() + x$id <- "1" + y$id <- "2" + x$resources <- append( + y$resources, + list(list( + name = "annotations", + data = list(id = 1L, comment = "albino fox")) + )) + y$resources <- append( + y$resources, + list(list(name = "foo", description = "blabla"))) + xy_merged <- merge_camtrapdp(x, y) + + resource_names <- purrr::map(xy_merged$resources, ~ .[["name"]]) %>% unlist() + expected_names <- c( + "deployments", "media", "observations", "1_individuals", "annotations", + "2_individuals", "foo") + + expect_identical(resource_names, expected_names) +}) + +test_that("merge_camtrapdp() returns the expected metadata ", { + skip_if_offline() + x <- example_dataset() + y <- example_dataset() + x$id <- "1" + y$id <- "2" + xy_merged <- merge_camtrapdp(x, y) + + # Can't compare with x$licenses because remove_duplicates switches order of + # subelements + licenses <- list( + list(name = "CC0-1.0", scope = "data"), + list(scope = "media", path = "http://creativecommons.org/licenses/by/4.0/")) + + # Check metadata + expect_equal(length(xy_merged$resources), 5) + expect_identical(xy_merged$profile, "https://raw.githubusercontent.com/tdwg/camtrap-dp/1.0.1/camtrap-dp-profile.json") + expect_identical(xy_merged$name, NA) + expect_identical(xy_merged$id, NULL) + expect_identical(xy_merged$title, NA) + expect_identical(xy_merged$contributors, x$contributors) + expect_identical( + xy_merged$description, + paste(x$description, y$description, sep = "/n") + ) + expect_identical(xy_merged$version, "1.0") + expect_identical(xy_merged$keywords, x$keywords) + expect_identical(xy_merged$image, NULL) + expect_identical(xy_merged$homepage, NULL) + expect_identical(xy_merged$sources, x$sources) + expect_identical(xy_merged$licenses, licenses) + expect_identical(xy_merged$bibliographicCitation, NULL) + expect_identical(xy_merged$project, list(x$project, y$project)) + expect_identical(xy_merged$coordinatePrecision, x$coordinatePrecision) + expect_identical(xy_merged$spatial, x$spatial) + expect_identical(xy_merged$temporal, x$temporal) + expect_identical(xy_merged$taxonomic, x$taxonomic) + expect_identical(xy_merged$references, x$references) + expect_identical(xy_merged$directory, ".") + + relatedIdentifiers_merged <- list( + list( + relationType = "IsDerivedFrom", + relatedIdentifier = "https://doi.org/10.15468/5tb6ze", + resourceTypeGeneral = "Dataset", + relatedIdentifierType = "DOI" + ), + list( + relationType = "IsSupplementTo", + relatedIdentifier = "https://inbo.github.io/camtraptor/", + resourceTypeGeneral = "Software", + relatedIdentifierType = "URL" + ), + list( + relationType = "IsDerivedFrom", + relatedIdentifier = "1", + resourceTypeGeneral = "Data package", + relatedIdentifierType = "id" + ), + list( + relationType = "IsDerivedFrom", + relatedIdentifier = "2", + resourceTypeGeneral = "Data package", + relatedIdentifierType = "id" + ) + ) + + expect_identical(xy_merged$relatedIdentifiers, relatedIdentifiers_merged) + + # Check data +}) + +test_that("merge_camtrapdp() returns the expected metadata when merging two + different Data Packages", { + skip_if_offline() + x <- example_dataset() + + # Download second Camera Trap Data package + temp_dir <- tempdir() + on.exit(unlink(temp_dir, recursive = TRUE)) + zip_file <- file.path(temp_dir, "dataset.zip") + datapackage_file <- file.path(temp_dir, "datapackage.json") + url <- "https://ipt.nlbif.nl/archive.do?r=awd_pilot2" + download.file(url, zip_file, mode = 'wb') + unzip(zip_file, exdir = temp_dir) + y <- read_camtrapdp(datapackage_file) + + # Merge + xy_merged <- merge_camtrapdp(x, y) + + # Check metadata + profile <- "https://raw.githubusercontent.com/tdwg/camtrap-dp/1.0.1/camtrap-dp-profile.json" + contributors <- list( + list( + title = "Axel Neukermans", + email = "axel.neukermans@inbo.be", + path = "https://orcid.org/0000-0003-0272-9180", + role = "contributor", + organization = "Research Institute for Nature and Forest (INBO)" + ), + list( + title = "Danny Van der beeck", + email = "daniel.vanderbeeck@gmail.com" + ), + list( + title = "Emma Cartuyvels", + email = "emma.cartuyvels@inbo.be", + role = "principalInvestigator", + organization = "Research Institute for Nature and Forest (INBO)" + ), + list( + title = "Peter Desmet", + email = "peter.desmet@inbo.be", + path = "https://orcid.org/0000-0002-8442-8025", + role = "contact", + organization = "Research Institute for Nature and Forest (INBO)" + ), + list( + title = "Research Institute for Nature and Forest (INBO)", + path = "https://inbo.be", + role = "rightsHolder" + ), + list( + title = "Research Institute for Nature and Forest (INBO)", + path = "https://inbo.be", + role = "publisher" + ), + list( + title = "Julian Evans", + email = "jevansbio@gmail.com", + role = "principalInvestigator", + organization = "University of Amsterdam", + firstName = "Julian", + lastName = "Evans" + ), + list( + title = "Rotem Zilber", + email = "r.kadanzilber@uva.nl", + role = "principalInvestigator", + organization = "University of Amsterdam", + firstName = "Rotem", + lastName = "Zilber" + ), + list( + title = "W. Daniel Kissling", + email = "wdkissling@gmail.com", + path = "https://www.danielkissling.de/", + role = "principalInvestigator", + organization = "University of Amsterdam", + firstName = "W. Daniel ", + lastName = "Kissling" + ) + ) + + description <- "MICA - Muskrat and coypu camera trap observations in Belgium, the Netherlands and Germany is an occurrence dataset published by the Research Institute of Nature and Forest (INBO). It is part of the LIFE project MICA, in which innovative techniques are tested for a more efficient control of muskrat and coypu populations, both invasive species. This dataset is a sample of the original dataset and serves as an example of a Camera Trap Data Package (Camtrap DP)./nCamera trap pilot 2 was a test of the difference in species detection and data accumulation between a Snyper Commander camera with a regular lens (52°) and one with a wide lens (100°). The cameras were deployed at 30 cm above the ground within the herbivore exclosure Zeeveld Noord in the Amsterdam Water Supply Dunes from 14th of August 2021 to 24th of September 2021. During this pilot, a solar panel failure caused the cameras to stop recording data from the 24th of August 2021 to the 6th of September (14 days). During annotation, only days in which both cameras were operational were annotated. This led to a total of 1,113 images over 28 days from the two cameras. A detailed description of the dataset can be found in a data paper published in the journal Data in Brief (Evans et al. 2024, https://doi.org/10.1016/j.dib.2024.110544)." + + sources <- list( + list( + title = "Agouti", + path = "https://www.agouti.eu", + email = "agouti@wur.nl", + version = "v3.21" + ), + list( + title = "Agouti", + path = "https://www.agouti.eu", + email = "agouti@wur.nl", + version = "v4" + ) + ) + + licenses <- list( + list(name = "CC0-1.0", scope = "data"), + list(scope = "media", path = "http://creativecommons.org/licenses/by/4.0/"), + list(name = "CC-BY-4.0", scope = "data") + ) + + coordinatePrecision <- 0.001 + + spatial <- list( + type = "Polygon", + coordinates = structure( + c( + 4.013, 5.659, 5.659, 4.013, 4.013, + 50.699, 50.699, 52.35604, 52.35604, 50.699 + ), + dim = c(1L, 5L, 2L) + ) + ) + + temporal <- list(start = "2020-05-30", end = "2022-03-18") + + taxonomic <- list( + list( + scientificName = "Anas platyrhynchos", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/DGP6", + taxonRank = "species", + family = NA_character_, + order. = NA_character_, + vernacularNames = list(eng = "mallard", nld = "wilde eend") + ), + list( + scientificName = "Anas strepera", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/DGPL", + taxonRank = "species", + family = NA_character_, + order. = NA_character_, + vernacularNames = list(eng = "gadwall", nld = "krakeend") + ), + list( + scientificName = "Apodemus sylvaticus", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/FRJJ", + taxonRank = "species", + family = "Muridae", + order. = "Rodentia", + vernacularNames = list(eng = "wood mouse", nld = "bosmuis") + ), + list( + scientificName = "Ardea", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/32FH", + taxonRank = "genus", + family = NA_character_, + order. = NA_character_, + vernacularNames = list(eng = "great herons", nld = "reigers") + ), + list( + scientificName = "Ardea cinerea", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/GCHS", + taxonRank = "species", + family = NA_character_, + order. = NA_character_, + vernacularNames = list(eng = "grey heron", nld = "blauwe reiger") + ), + list( + scientificName = "Aves", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/V2", + taxonRank = "class", + family = NA_character_, + order. = NA_character_, + vernacularNames = list(eng = "bird sp.", nld = "vogel") + ), + list( + scientificName = "Corvus corone", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/YNHJ", + taxonRank = "species", + family = "Corvidae", + order. = "Passeriformes", + vernacularNames = list(eng = "carrion crow", nld = "zwarte kraai") + ), + list( + scientificName = "Homo sapiens", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/6MB3T", + taxonRank = "species", + family = "Hominidae", + order. = "Primates", + vernacularNames = list(eng = "human", nld = "mens") + ), + list( + scientificName = "Martes foina", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/3Y9VW", + taxonRank = "species", + family = NA_character_, + order. = NA_character_, + vernacularNames = list(eng = "beech marten", nld = "steenmarter") + ), + list( + scientificName = "Mustela putorius", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/44QYC", + taxonRank = "species", + family = NA_character_, + order. = NA_character_, + vernacularNames = list(eng = "European polecat", nld = "bunzing") + ), + list( + scientificName = "Oryctolagus cuniculus", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/74ZBP", + taxonRank = "species", + family = "Leporidae", + order. = "Lagomorpha", + vernacularNames = list(eng = "European rabbit", nld = "Europees konijn") + ), + list( + scientificName = "Rattus norvegicus", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/4RM67", + taxonRank = "species", + family = NA_character_, + order. = NA_character_, + vernacularNames = list(eng = "brown rat", nld = "bruine rat") + ), + list( + scientificName = "Vulpes vulpes", + taxonID = "https://www.checklistbank.org/dataset/COL2023/taxon/5BSG3", + taxonRank = "species", + family = "Canidae", + order. = "Carnivora", + vernacularNames = list(eng = "red fox", nld = "vos") + ) + ) + + references <- list("Evans, J.C., Zilber, R., & Kissling, W.D. (2024). Data from three camera trapping pilots in the Amsterdam Water Supply Dunes of the Netherlands. Data in Brief, 54, 110544. https://doi.org/10.1016/j.dib.2024.110544") + + relatedIdentifiers_merged <- list( + list( + relationType = "IsDerivedFrom", + relatedIdentifier = "https://doi.org/10.15468/5tb6ze", + resourceTypeGeneral = "Dataset", + relatedIdentifierType = "DOI" + ), + list( + relationType = "IsSupplementTo", + relatedIdentifier = "https://inbo.github.io/camtraptor/", + resourceTypeGeneral = "Software", + relatedIdentifierType = "URL" + ), + list( + relationType = "IsPublishedIn", + relatedIdentifier = "https://doi.org/10.1016/j.dib.2024.110544", + resourceTypeGeneral = "DataPaper", + relatedIdentifierType = "DOI" + ), + list( + relationType = "IsDerivedFrom", + relatedIdentifier = "7cca70f5-ef8c-4f86-85fb-8f070937d7ab", + resourceTypeGeneral = "Data package", + relatedIdentifierType = "id" + ) + ) + + expect_identical(xy_merged$resources, x$resources) + expect_identical(xy_merged$profile, profile) + expect_identical(xy_merged$name, NA) + expect_identical(xy_merged$id, NULL) + expect_identical(xy_merged$title, NA) + expect_identical(xy_merged$contributors, contributors) + expect_identical( + xy_merged$description, + paste(x$description, y$description, sep = "/n") + ) + expect_identical(xy_merged$version, "1.0") + expect_identical(xy_merged$keywords, c(x$keywords, y$keywords)) + expect_identical(xy_merged$image, NULL) + expect_identical(xy_merged$homepage, NULL) + expect_identical(xy_merged$sources, sources) + expect_identical(xy_merged$licenses, licenses) + expect_identical(xy_merged$bibliographicCitation, NULL) + expect_identical(xy_merged$project, list(x$project, y$project)) + expect_identical(xy_merged$coordinatePrecision, coordinatePrecision) + expect_identical(xy_merged$spatial, spatial) + expect_identical(xy_merged$temporal, temporal) + expect_identical(xy_merged$taxonomic, taxonomic) + expect_identical(xy_merged$references, references) + expect_identical(xy_merged$directory, ".") + expect_identical(xy_merged$relatedIdentifiers, relatedIdentifiers_merged) +}) + +test_that("merge_camtrapdp() can be used in a pipe to merge multiple + camtrap DP", { + skip_if_offline() + + temp_dir <- tempdir() + on.exit(unlink(temp_dir, recursive = TRUE)) + zip_file <- file.path(temp_dir, "dataset.zip") + datapackage_file <- file.path(temp_dir, "datapackage.json") + url <- "https://ipt.nlbif.nl/archive.do?r=awd_pilot2" + + download.file(url, zip_file, mode = 'wb') + unzip(zip_file, exdir = temp_dir) + + x <- read_camtrapdp(datapackage_file) + y <- example_dataset() %>% + filter_deployments(deploymentID %in% c("00a2c20d", "29b7d356")) + z <- example_dataset() %>% + filter_deployments(deploymentID %in% c("577b543a", "62c200a9")) + y$id <- "y" + z$id <- "z" + + expect_no_error(x %>% merge_camtrapdp(y) %>% merge_camtrapdp(z)) +}) diff --git a/tests/testthat/test-write_camtrapdp.R b/tests/testthat/test-write_camtrapdp.R index d35ce67..cd3bd68 100644 --- a/tests/testthat/test-write_camtrapdp.R +++ b/tests/testthat/test-write_camtrapdp.R @@ -29,6 +29,31 @@ test_that("write_camtrapdp() writes a (filtered) dataset that can be read", { expect_lt(nrow(observations(x_written)), nrow(observations(x))) }) +test_that("write_camtrapdp() writes a merged dataset that can be read", { + skip_if_offline() + x <- example_dataset() + + # Download second Camera Trap Data package + temp_dir <- tempdir() + on.exit(unlink(temp_dir, recursive = TRUE)) + zip_file <- file.path(temp_dir, "dataset.zip") + datapackage_file <- file.path(temp_dir, "datapackage.json") + url <- "https://ipt.nlbif.nl/archive.do?r=awd_pilot2" + download.file(url, zip_file, mode = "wb") + unzip(zip_file, exdir = temp_dir) + y <- read_camtrapdp(datapackage_file) + + # Merge + xy_merged <- merge_camtrapdp(x, y) + + # Write + write_camtrapdp(xy_merged, file.path(temp_dir, "processed"), compress = TRUE) + + expect_no_error( + read_camtrapdp(file.path(temp_dir, "processed", "datapackage.json")) + ) +}) + test_that("write_camtrapdp() writes the unaltered example dataset as is", { skip_if_offline() x <- example_dataset()