Skip to content

Commit

Permalink
Merge branch 'master' of github.com:Ludwigm6/CAST
Browse files Browse the repository at this point in the history
  • Loading branch information
Ludwigm6 committed Mar 11, 2024
2 parents 5d6d9ec + 5952016 commit ca87153
Show file tree
Hide file tree
Showing 25 changed files with 1,035 additions and 118 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: CAST
Type: Package
Title: 'caret' Applications for Spatial-Temporal Models
Version: 0.9.1
Version: 0.9.2
Authors@R: c(person("Hanna", "Meyer", email = "[email protected]", role = c("cre", "aut")),
person("Carles", "Milà", role = c("aut")),
person("Marvin", "Ludwig", role = c("aut")),
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,17 @@ S3method(print,knndm)
S3method(print,nndm)
S3method(print,trainDI)
export(CreateSpacetimeFolds)
export(DItoErrormetric)
export(aoa)
export(bss)
export(calibrate_aoa)
export(clustered_sample)
export(errorProfiles)
export(ffs)
export(geodist)
export(global_validation)
export(knndm)
export(nndm)
export(normalize_DI)
export(plot_ffs)
export(plot_geodist)
export(show.aoa)
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# `CAST` 0.9.2
* new features:
* normalize_DI for a more intuitive interpretation
* modifications:
* function DItoErrormetric renamed to errorProfiles and allows for other dissimilarity measures

# `CAST` 0.9.1
* new features:
* calculate local point density within AOA
Expand Down
45 changes: 31 additions & 14 deletions R/aoa.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model?
#' @param LPD Logical. Indicates whether the local point density should be calculated or not.
#' @param maxLPD numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}})
#' @param verbose Logical. Print progress or not?
#' @details The Dissimilarity Index (DI), the Local Data Point Density (LPD) and the corresponding Area of Applicability (AOA) are calculated.
#' If variables are factors, dummy variables are created prior to weighting and distance calculation.
#'
Expand Down Expand Up @@ -146,7 +147,8 @@ aoa <- function(newdata,
method="L2",
useWeight=TRUE,
LPD = FALSE,
maxLPD = 1) {
maxLPD = 1,
verbose = TRUE) {

# handling of different raster formats
as_stars <- FALSE
Expand Down Expand Up @@ -200,8 +202,10 @@ aoa <- function(newdata,

# if not provided, compute train DI
if(!inherits(trainDI, "trainDI")) {
message("No trainDI provided.")
trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, LPD)
if (verbose) {
message("No trainDI provided.")
}
trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, LPD, verbose)
}

if (calc_LPD == TRUE) {
Expand Down Expand Up @@ -302,11 +306,15 @@ aoa <- function(newdata,
}

if (calc_LPD == TRUE) {
message("Computing DI and LPD of new data...")
if (verbose) {
message("Computing DI and LPD of new data...")
}

pb <- txtProgressBar(min = 0,
max = nrow(newdataCC),
style = 3)
if (verbose) {
pb <- txtProgressBar(min = 0,
max = nrow(newdataCC),
style = 3)
}

DI_out <- rep(NA, nrow(newdata))
LPD_out <- rep(NA, nrow(newdata))
Expand All @@ -317,24 +325,31 @@ aoa <- function(newdata,

DI_out[okrows[i]] <- knnDI[1]
LPD_out[okrows[i]] <- sum(knnDI < trainDI$threshold)
setTxtProgressBar(pb, i)
if (verbose) {
setTxtProgressBar(pb, i)
}
}

close(pb)
if (verbose) {
close(pb)
}

# set maxLPD to max of LPD_out if
realMaxLPD <- max(LPD_out, na.rm = T)
if (maxLPD > realMaxLPD) {
if (inherits(maxLPD, c("numeric", "integer"))) {
if (inherits(maxLPD, c("numeric", "integer")) && verbose) {
message("Your specified maxLPD is bigger than the real maxLPD of you predictor data.")
}
message(paste("maxLPD is set to", realMaxLPD))
if (verbose) {
message(paste("maxLPD is set to", realMaxLPD))
}
trainDI$maxLPD <- realMaxLPD
}
}


message("Computing AOA...")
if (verbose) {
message("Computing AOA...")
}

#### Create Mask for AOA and return statistics
if (inherits(out, "SpatRaster")) {
Expand Down Expand Up @@ -389,7 +404,9 @@ aoa <- function(newdata,
result$LPD <- LPD
}

message("Finished!")
if (verbose) {
message("Finished!")
}

class(result) <- "aoa"
return(result)
Expand Down
33 changes: 21 additions & 12 deletions R/DItoErrormetric.R → R/errorProfiles.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#' Model the relationship between the DI and the prediction error
#' @description Performance metrics are calculated for moving windows of DI/LPD values of cross-validated training data
#' Model and inspect the relationship between the prediction error and measures of dissimilarities and distances
#' @description Performance metrics are calculated for moving windows of dissimilarity values based on cross-validated training data
#' @param model the model used to get the AOA
#' @param trainDI the result of \code{\link{trainDI}} or aoa object \code{\link{aoa}}
#' @param variable Character. Which dissimilarity or distance measure to use for the error metric. Current options are "DI" or "LPD"
#' @param multiCV Logical. Re-run model fitting and validation with different CV strategies. See details.
#' @param window.size Numeric. Size of the moving window. See \code{\link{rollapply}}.
#' @param calib Character. Function to model the DI/LPD~performance relationship. Currently lm and scam are supported
Expand All @@ -10,9 +11,8 @@
#' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model?
#' @param k Numeric. See mgcv::s
#' @param m Numeric. See mgcv::s
#' @param variable Character. Which measure to use for the error metric. "DI" or "LPD"
#' @details If multiCV=TRUE the model is re-fitted and validated by length.out new cross-validations where the cross-validation folds are defined by clusters in the predictor space,
#' ranging from three clusters to LOOCV. Hence, a large range of DI/LPD values is created during cross-validation.
#' ranging from three clusters to LOOCV. Hence, a large range of dissimilarity values is created during cross-validation.
#' If the AOA threshold based on the calibration data from multiple CV is larger than the original AOA threshold (which is likely if extrapolation situations are created during CV),
#' the AOA threshold changes accordingly. See Meyer and Pebesma (2021) for the full documentation of the methodology.
#' @return A scam, linear model or exponential model
Expand All @@ -22,16 +22,25 @@
#' Estimating the area of applicability of spatial prediction models.
#' \doi{10.1111/2041-210X.13650}
#' @seealso \code{\link{aoa}}
#' @example inst/examples/ex_DItoErrormetric.R
#' @example inst/examples/ex_errorProfiles.R
#'
#'
#' @export


DItoErrormetric <- function(model, trainDI, multiCV=FALSE,
length.out = 10, window.size = 5, calib = "scam",
method= "L2", useWeight=TRUE,
k = 6, m = 2, variable = "DI"){
#' @export errorProfiles
#' @aliases errorProfiles DItoErrormetric



errorProfiles <- function(model,
trainDI,
variable = "DI",
multiCV=FALSE,
length.out = 10,
window.size = 5,
calib = "scam",
method= "L2",
useWeight=TRUE,
k = 6,
m = 2){


if(inherits(trainDI,"aoa")){
Expand Down
5 changes: 3 additions & 2 deletions R/geodist.R
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ geodist <- function(x,
if(any(!variables%in%names(testdata))){# extract variable values of raster:
testdata <- sf::st_transform(testdata,sf::st_crs(modeldomain))
#testdata <- sf::st_as_sf(raster::extract(modeldomain, testdata, df = TRUE, sp = TRUE))
testdata <- sf::st_as_sf(terra::extract(modeldomain, testdata, na.rm=FALSE,bind=TRUE))
testdata <- sf::st_as_sf(terra::extract(modeldomain, terra::vect(testdata), na.rm=FALSE,bind=TRUE))

if(any(is.na(testdata))){
testdata <- na.omit(testdata)
Expand All @@ -157,7 +157,7 @@ geodist <- function(x,
if(any(!variables%in%names(preddata))){# extract variable values of raster:
preddata <- sf::st_transform(preddata,sf::st_crs(modeldomain))
#preddata <- sf::st_as_sf(raster::extract(modeldomain, preddata, df = TRUE, sp = TRUE))
preddata <- sf::st_as_sf(terra::extract(modeldomain, preddata, na.rm=FALSE,bind=TRUE))
preddata <- sf::st_as_sf(terra::extract(modeldomain, terra::vect(preddata), na.rm=FALSE,bind=TRUE))

if(any(is.na(preddata))){
preddata <- na.omit(preddata)
Expand Down Expand Up @@ -194,6 +194,7 @@ geodist <- function(x,

##### Distance to CV data:
if(!is.null(cvfolds)){

cvd <- cvdistance(x, cvfolds, cvtrain, type, variables)
dists <- rbind(dists, cvd)
}
Expand Down
57 changes: 39 additions & 18 deletions R/nndm.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#' indices to perform a NNDM LOO CV for map validation.
#' @author Carles Milà
#' @param tpoints sf or sfc point object. Contains the training points samples.
#' @param modeldomain sf polygon object defining the prediction area (see Details).
#' @param modeldomain sf polygon object or SpatRaster defining the prediction area (see Details).
#' @param ppoints sf or sfc point object. Contains the target prediction points.
#' Optional. Alternative to modeldomain (see Details).
#' @param samplesize numeric. How many points in the modeldomain should be sampled as prediction points?
Expand Down Expand Up @@ -34,12 +34,13 @@
#' When \emph{phi} is set to "max", nearest neighbor distance matching is performed for the entire prediction area. Euclidean distances are used for projected
#' and non-defined CRS, great circle distances are used for geographic CRS (units in meters).
#'
#' The \emph{modeldomain} is a sf polygon that defines the prediction area. The function takes a regular point sample (amount defined by \emph{samplesize)} from the spatial extent.
#' The \emph{modeldomain} is either a sf polygon that defines the prediction area, or alternatively a SpatRaster out of which a polygon,
#' transformed into the CRS of the training points, is defined as the outline of all non-NA cells.
#' Then, the function takes a regular point sample (amount defined by \emph{samplesize)} from the spatial extent.
#' As an alternative use \emph{ppoints} instead of \emph{modeldomain}, if you have already defined the prediction locations (e.g. raster pixel centroids).
#' When using either \emph{modeldomain} or \emph{ppoints}, we advise to plot the study area polygon and the training/prediction points as a previous step to ensure they are aligned.
#'
#' @note NNDM is a variation of LOOCV and therefore may take a long time for large training data sets.
#' A k-fold variant will be implemented shortly.
#' @note NNDM is a variation of LOOCV and therefore may take a long time for large training data sets. See \code{\link{knndm}} for a more efficient k-fold variant of the method.
#' @seealso \code{\link{geodist}}, \code{\link{knndm}}
#' @references
#' \itemize{
Expand Down Expand Up @@ -97,8 +98,8 @@
#' plot(nndm_pred)
#'
#' ########################################################################
#' # Example 3: Real- world example; using a modeldomain instead of previously
#' # sampled prediction locations
#' # Example 3: Real- world example; using a SpatRast modeldomain instead
#' # of previously sampled prediction locations
#' ########################################################################
#' \dontrun{
#' library(sf)
Expand All @@ -112,15 +113,11 @@
#' pts <- st_as_sf(pts,coords=c("Easting","Northing"))
#' st_crs(pts) <- 26911
#' studyArea <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST"))
#' studyArea[!is.na(studyArea)] <- 1
#' studyArea <- as.polygons(studyArea, values = FALSE, na.all = TRUE) |>
#' st_as_sf() |>
#' st_union()
#' pts <- st_transform(pts, crs = st_crs(studyArea))
#' plot(studyArea)
#' plot(st_geometry(pts), add = TRUE, col = "red")
#' terra::plot(studyArea[["DEM"]])
#' terra::plot(vect(pts), add = T)
#'
#' nndm_folds <- nndm(pts, modeldomain= studyArea)
#' nndm_folds <- nndm(pts, modeldomain = studyArea)
#' plot(nndm_folds)
#'
#' #use for cross-validation:
Expand All @@ -143,12 +140,36 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000,

# create sample points from modeldomain
if(is.null(ppoints)&!is.null(modeldomain)){

# Check modeldomain is indeed a sf/SpatRaster
if(!any(c("sfc", "sf", "SpatRaster") %in% class(modeldomain))){
stop("modeldomain must be a sf/sfc object or a 'SpatRaster' object.")
}

# If modeldomain is a SpatRaster, transform into polygon
if(any(class(modeldomain) == "SpatRaster")){
modeldomain[!is.na(modeldomain)] <- 1
modeldomain <- terra::as.polygons(modeldomain, values = FALSE, na.all = TRUE) |>
sf::st_as_sf() |>
sf::st_union()
modeldomain <- sf::st_transform(modeldomain, crs = sf::st_crs(tpoints))
}

# Check modeldomain is indeed a polygon sf
if(!any(class(sf::st_geometry(modeldomain)) %in% c("sfc_POLYGON", "sfc_MULTIPOLYGON"))){
stop("modeldomain must be a sf/sfc polygon object.")
}

# Check whether modeldomain has the same crs as tpoints
if(!identical(sf::st_crs(tpoints), sf::st_crs(modeldomain))){
stop("tpoints and modeldomain must have the same CRS")
}

# We sample
message(paste0(samplesize, " prediction points are sampled from the modeldomain"))
ppoints <- sf::st_sample(x = modeldomain, size = samplesize, type = sampling)
sf::st_crs(ppoints) <- sf::st_crs(modeldomain)

}else if(!is.null(ppoints)){
if(!identical(sf::st_crs(tpoints), sf::st_crs(ppoints))){
stop("tpoints and ppoints must have the same CRS")
Expand All @@ -165,6 +186,9 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000,
ppoints <- sf::st_sf(geom=ppoints)
}

# Input data checks
nndm_checks(tpoints, ppoints, phi, min_train)

# if phi==max calculate the range of the size area
if(phi=="max"){
xmin <- min(sf::st_coordinates(ppoints)[,1])
Expand All @@ -176,9 +200,6 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000,
phi <- as.numeric(max(sf::st_distance(p)))
}

# Input data checks
nndm_checks(tpoints, ppoints, phi, min_train)

# Compute nearest neighbour distances between training and prediction points
Gij <- sf::st_distance(ppoints, tpoints)
units(Gij) <- NULL
Expand Down Expand Up @@ -238,8 +259,8 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000,
nndm_checks <- function(tpoints, ppoints, phi, min_train){

# Check for valid range of phi
if(phi < 0 | !is.numeric(phi)){
stop("phi must be positive.")
if(phi < 0 | (!is.numeric(phi) & phi!= "max")){
stop("phi must be positive or set to 'max'.")
}

# min_train must be a single positive numeric
Expand Down
54 changes: 54 additions & 0 deletions R/normalize_DI.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#' Normalize DI values
#' @description
#' The DI is normalized by the DI threshold to allow for a more straightforwrd interpretation.
#' A value in the resulting DI larger 1 means that the data are more dissimilar than what has been observed during cross-validation.
#' The returned threshold is adjusted accordingly and is, as a consequence, 1.
#' @param AOA An AOA object
#' @return An object of class \code{aoa}
#' @seealso \code{\link{aoa}}
#' @examples
#' \dontrun{
#' library(sf)
#' library(terra)
#' library(caret)
#'
#' # prepare sample data:
#' dat <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST"))
#' dat <- aggregate(dat[,c("VW","Easting","Northing")],by=list(as.character(dat$SOURCEID)),mean)
#' pts <- st_as_sf(dat,coords=c("Easting","Northing"))
#' pts$ID <- 1:nrow(pts)
#' set.seed(100)
#' pts <- pts[1:30,]
#' studyArea <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST"))[[1:8]]
#' trainDat <- extract(studyArea,pts,na.rm=FALSE)
#' trainDat <- merge(trainDat,pts,by.x="ID",by.y="ID")
#'
#' # train a model:
#' set.seed(100)
#' variables <- c("DEM","NDRE.Sd","TWI")
#' model <- train(trainDat[,which(names(trainDat)%in%variables)],
#' trainDat$VW, method="rf", importance=TRUE, tuneLength=1,
#' trControl=trainControl(method="cv",number=5,savePredictions=T))
#'
#' #...then calculate the AOA of the trained model for the study area:
#' AOA <- aoa(studyArea, model)
#' plot(AOA)
#' plot(AOA$DI)
#'
#' #... then normalize the DI
#' DI_norm <- normalize_DI(AOA)
#' plot(DI_norm)
#' plot(DI_norm$DI)
#'
#' }
#' @export normalize_DI
#' @aliases normalize_DI


normalize_DI <- function(AOA) {
AOA$DI <- AOA$DI/AOA$parameters$threshold
AOA$parameters$trainDI <- AOA$parameters$trainDI/AOA$parameters$threshold
AOA$parameters$threshold <- AOA$parameters$threshold/AOA$parameters$threshold
return(AOA)
}

Loading

0 comments on commit ca87153

Please sign in to comment.