From e3a70e39f885644fa5220c4220ccb8712f444be0 Mon Sep 17 00:00:00 2001 From: carlesmila Date: Fri, 17 Nov 2023 11:27:18 +0100 Subject: [PATCH] added preddata arg --- R/geodist.R | 31 ++++++++++++++++++++++++++----- man/geodist.Rd | 10 +++++++--- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/R/geodist.R b/R/geodist.R index ab6d2bdc..5631c3f7 100644 --- a/R/geodist.R +++ b/R/geodist.R @@ -8,15 +8,17 @@ #' @param cvfolds optional. list or vector. Either a list where each element contains the data points used for testing during the cross validation iteration (i.e. held back data). #' Or a vector that contains the ID of the fold for each training point. See e.g. ?createFolds or ?CreateSpacetimeFolds or ?nndm #' @param cvtrain optional. List of row indices of x to fit the model to in each CV iteration. If cvtrain is null but cvfolds is not, all samples but those included in cvfolds are used as training data -#' @param testdata optional. object of class sf: Data used for independent validation +#' @param testdata optional. object of class sf: Point data used for independent validation +#' @param preddata optional. object of class sf: Point data indicating the locations within the modeldomain to be used as target prediction points. Useful when the prediction objective is a subset of +#' locations within the modeldomain rather than the whole area. #' @param samplesize numeric. How many prediction samples should be used? #' @param sampling character. How to draw prediction samples? See \link[sp]{spsample}. Use sampling = "Fibonacci" for global applications. #' @param variables character vector defining the predictor variables used if type="feature. If not provided all variables included in modeldomain are used. #' @return A data.frame containing the distances. Unit of returned geographic distances is meters. attributes contain W statistic between prediction area and either sample data, CV folds or test data. See details. #' @details The modeldomain is a sf polygon or a raster that defines the prediction area. The function takes a regular point sample (amount defined by samplesize) from the spatial extent. -#' If type = "feature", the argument modeldomain (and if provided then also the testdata) has to include predictors. Predictor values for x are optional if modeldomain is a raster. +#' If type = "feature", the argument modeldomain (and if provided then also the testdata and/or preddata) has to include predictors. Predictor values for x, testdata and preddata are optional if modeldomain is a raster. #' If not provided they are extracted from the modeldomain rasterStack. -#' W statistic describes the match between the distributions. See Mila et al (2023) and Linnenbrink et al (2023) for further details. +#' W statistic describes the match between the distributions. See Linnenbrink et al (2023) for further details. #' @note See Meyer and Pebesma (2022) for an application of this plotting function #' @seealso \code{\link{nndm}} \code{\link{knndm}} #' @import ggplot2 @@ -98,6 +100,7 @@ geodist <- function(x, cvfolds=NULL, cvtrain=NULL, testdata=NULL, + preddata=NULL, samplesize=2000, sampling = "regular", variables=NULL){ @@ -150,13 +153,31 @@ geodist <- function(x, testdata <- sf::st_transform(testdata,4326) } } + if(!is.null(preddata)){ + if(any(!variables%in%names(preddata))){# extract variable values of raster: + preddata <- sf::st_transform(preddata,sf::st_crs(modeldomain)) + #preddata <- sf::st_as_sf(raster::extract(modeldomain, preddata, df = TRUE, sp = TRUE)) + preddata <- sf::st_as_sf(terra::extract(modeldomain, preddata, na.rm=FALSE,bind=TRUE)) + + if(any(is.na(preddata))){ + preddata <- na.omit(preddata) + message("some prediction data were removed because of NA in extracted predictor values") + } + + preddata <- sf::st_transform(preddata,4326) + } + } } # required steps ---- - ## Sample prediction location from the study area: - modeldomain <- sampleFromArea(modeldomain, samplesize, type,variables,sampling) + ## Sample prediction location from the study area if preddata not available: + if(is.null(preddata)){ + modeldomain <- sampleFromArea(modeldomain, samplesize, type,variables,sampling)} + else{ + modeldomain <- preddata + } # always do sample-to-sample and sample-to-prediction s2s <- sample2sample(x, type,variables) diff --git a/man/geodist.Rd b/man/geodist.Rd index e76b0550..97561706 100644 --- a/man/geodist.Rd +++ b/man/geodist.Rd @@ -11,6 +11,7 @@ geodist( cvfolds = NULL, cvtrain = NULL, testdata = NULL, + preddata = NULL, samplesize = 2000, sampling = "regular", variables = NULL @@ -28,7 +29,10 @@ Or a vector that contains the ID of the fold for each training point. See e.g. ? \item{cvtrain}{optional. List of row indices of x to fit the model to in each CV iteration. If cvtrain is null but cvfolds is not, all samples but those included in cvfolds are used as training data} -\item{testdata}{optional. object of class sf: Data used for independent validation} +\item{testdata}{optional. object of class sf: Point data used for independent validation} + +\item{preddata}{optional. object of class sf: Point data indicating the locations within the modeldomain to be used as target prediction points. Useful when the prediction objective is a subset of +locations within the modeldomain rather than the whole area.} \item{samplesize}{numeric. How many prediction samples should be used?} @@ -45,9 +49,9 @@ Optional, the nearest neighbor distances between training data and test data or } \details{ The modeldomain is a sf polygon or a raster that defines the prediction area. The function takes a regular point sample (amount defined by samplesize) from the spatial extent. - If type = "feature", the argument modeldomain (and if provided then also the testdata) has to include predictors. Predictor values for x are optional if modeldomain is a raster. + If type = "feature", the argument modeldomain (and if provided then also the testdata and/or preddata) has to include predictors. Predictor values for x, testdata and preddata are optional if modeldomain is a raster. If not provided they are extracted from the modeldomain rasterStack. - W statistic describes the match between the distributions. See Mila et al (2023) and Linnenbrink et al (2023) for further details. + W statistic describes the match between the distributions. See Linnenbrink et al (2023) for further details. } \note{ See Meyer and Pebesma (2022) for an application of this plotting function