Merge branch 'master' of github.com:Ludwigm6/CAST

HannaMeyer · Mar 11, 2024 · ca87153 · ca87153
2 parents 5d6d9ec + 5952016
commit ca87153
Show file tree

Hide file tree

Showing 25 changed files with 1,035 additions and 118 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: CAST
 Type: Package
 Title: 'caret' Applications for Spatial-Temporal Models
-Version: 0.9.1
+Version: 0.9.2
 Authors@R: c(person("Hanna", "Meyer", email = "[email protected]", role = c("cre", "aut")),
              person("Carles", "Milà", role = c("aut")),
              person("Marvin", "Ludwig", role = c("aut")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -13,16 +13,17 @@ S3method(print,knndm)
 S3method(print,nndm)
 S3method(print,trainDI)
 export(CreateSpacetimeFolds)
-export(DItoErrormetric)
 export(aoa)
 export(bss)
 export(calibrate_aoa)
 export(clustered_sample)
+export(errorProfiles)
 export(ffs)
 export(geodist)
 export(global_validation)
 export(knndm)
 export(nndm)
+export(normalize_DI)
 export(plot_ffs)
 export(plot_geodist)
 export(show.aoa)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# `CAST` 0.9.2
+* new features:
+  * normalize_DI for a more intuitive interpretation
+* modifications:
+  * function DItoErrormetric renamed to errorProfiles and allows for other dissimilarity measures
+
 # `CAST` 0.9.1
 * new features:
   * calculate local point density within AOA

diff --git a/R/aoa.R b/R/aoa.R
@@ -28,6 +28,7 @@
 #' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model?
 #' @param LPD Logical. Indicates whether the local point density should be calculated or not.
 #' @param maxLPD numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}})
+#' @param verbose Logical. Print progress or not?
 #' @details The Dissimilarity Index (DI), the Local Data Point Density (LPD) and the corresponding Area of Applicability (AOA) are calculated.
 #' If variables are factors, dummy variables are created prior to weighting and distance calculation.
 #'
@@ -146,7 +147,8 @@ aoa <- function(newdata,
                 method="L2",
                 useWeight=TRUE,
                 LPD = FALSE,
-                maxLPD = 1) {
+                maxLPD = 1,
+                verbose = TRUE) {
 
   # handling of different raster formats
   as_stars <- FALSE
@@ -200,8 +202,10 @@ aoa <- function(newdata,
 
   # if not provided, compute train DI
   if(!inherits(trainDI, "trainDI")) {
-    message("No trainDI provided.")
-    trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, LPD)
+    if (verbose) {
+      message("No trainDI provided.")
+    }
+    trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, LPD, verbose)
   }
 
   if (calc_LPD == TRUE) {
@@ -302,11 +306,15 @@ aoa <- function(newdata,
   }
 
   if (calc_LPD == TRUE) {
-    message("Computing DI and LPD of new data...")
+    if (verbose) {
+      message("Computing DI and LPD of new data...")
+    }
 
-    pb <- txtProgressBar(min = 0,
-                         max = nrow(newdataCC),
-                         style = 3)
+    if (verbose) {
+      pb <- txtProgressBar(min = 0,
+                           max = nrow(newdataCC),
+                           style = 3)
+    }
 
     DI_out <- rep(NA, nrow(newdata))
     LPD_out <- rep(NA, nrow(newdata))
@@ -317,24 +325,31 @@ aoa <- function(newdata,
 
       DI_out[okrows[i]] <- knnDI[1]
       LPD_out[okrows[i]] <- sum(knnDI < trainDI$threshold)
-      setTxtProgressBar(pb, i)
+      if (verbose) {
+        setTxtProgressBar(pb, i)
+      }
     }
 
-    close(pb)
+    if (verbose) {
+      close(pb)
+    }
 
     # set maxLPD to max of LPD_out if
     realMaxLPD <- max(LPD_out, na.rm = T)
     if (maxLPD > realMaxLPD) {
-      if (inherits(maxLPD, c("numeric", "integer"))) {
+      if (inherits(maxLPD, c("numeric", "integer")) && verbose) {
         message("Your specified maxLPD is bigger than the real maxLPD of you predictor data.")
       }
-      message(paste("maxLPD is set to", realMaxLPD))
+      if (verbose) {
+        message(paste("maxLPD is set to", realMaxLPD))
+      }
       trainDI$maxLPD <- realMaxLPD
     }
   }
 
-
-  message("Computing AOA...")
+  if (verbose) {
+    message("Computing AOA...")
+  }
 
   #### Create Mask for AOA and return statistics
   if (inherits(out, "SpatRaster")) {
@@ -389,7 +404,9 @@ aoa <- function(newdata,
     result$LPD <- LPD
   }
 
-  message("Finished!")
+  if (verbose) {
+    message("Finished!")
+  }
 
   class(result) <- "aoa"
   return(result)

diff --git a/R/DItoErrormetric.R → R/errorProfiles.R b/R/DItoErrormetric.R → R/errorProfiles.R
@@ -1,7 +1,8 @@
-#' Model the relationship between the DI and the prediction error
-#' @description Performance metrics are calculated for moving windows of DI/LPD values of cross-validated training data
+#' Model and inspect the relationship between the prediction error and measures of dissimilarities and distances
+#' @description Performance metrics are calculated for moving windows of dissimilarity values based on cross-validated training data
 #' @param model the model used to get the AOA
 #' @param trainDI the result of \code{\link{trainDI}} or aoa object \code{\link{aoa}}
+#' @param variable Character. Which dissimilarity or distance measure to use for the error metric. Current options are "DI" or "LPD"
 #' @param multiCV Logical. Re-run model fitting and validation with different CV strategies. See details.
 #' @param window.size Numeric. Size of the moving window. See \code{\link{rollapply}}.
 #' @param calib Character. Function to model the DI/LPD~performance relationship. Currently lm and scam are supported
@@ -10,9 +11,8 @@
 #' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model?
 #' @param k Numeric. See mgcv::s
 #' @param m Numeric. See mgcv::s
-#' @param variable Character. Which measure to use for the error metric. "DI" or "LPD"
 #' @details If multiCV=TRUE the model is re-fitted and validated by length.out new cross-validations where the cross-validation folds are defined by clusters in the predictor space,
-#' ranging from three clusters to LOOCV. Hence, a large range of DI/LPD values is created during cross-validation.
+#' ranging from three clusters to LOOCV. Hence, a large range of dissimilarity values is created during cross-validation.
 #' If the AOA threshold based on the calibration data from multiple CV is larger than the original AOA threshold (which is likely if extrapolation situations are created during CV),
 #' the AOA threshold changes accordingly. See Meyer and Pebesma (2021) for the full documentation of the methodology.
 #' @return A scam, linear model or exponential model
@@ -22,16 +22,25 @@
 #' Estimating the area of applicability of spatial prediction models.
 #' \doi{10.1111/2041-210X.13650}
 #' @seealso \code{\link{aoa}}
-#' @example inst/examples/ex_DItoErrormetric.R
+#' @example inst/examples/ex_errorProfiles.R
 #'
 #'
-#' @export
-
-
-DItoErrormetric <- function(model, trainDI, multiCV=FALSE,
-                            length.out = 10, window.size = 5, calib = "scam",
-                            method= "L2", useWeight=TRUE,
-                            k = 6, m = 2, variable = "DI"){
+#' @export errorProfiles
+#' @aliases errorProfiles DItoErrormetric
+
+
+
+errorProfiles <- function(model,
+                          trainDI,
+                          variable = "DI",
+                          multiCV=FALSE,
+                          length.out = 10,
+                          window.size = 5,
+                          calib = "scam",
+                          method= "L2",
+                          useWeight=TRUE,
+                          k = 6,
+                          m = 2){
 
 
   if(inherits(trainDI,"aoa")){

diff --git a/R/geodist.R b/R/geodist.R
@@ -143,7 +143,7 @@ geodist <- function(x,
       if(any(!variables%in%names(testdata))){# extract variable values of raster:
         testdata <- sf::st_transform(testdata,sf::st_crs(modeldomain))
         #testdata <- sf::st_as_sf(raster::extract(modeldomain, testdata, df = TRUE, sp = TRUE))
-        testdata <- sf::st_as_sf(terra::extract(modeldomain, testdata, na.rm=FALSE,bind=TRUE))
+        testdata <- sf::st_as_sf(terra::extract(modeldomain, terra::vect(testdata), na.rm=FALSE,bind=TRUE))
 
         if(any(is.na(testdata))){
           testdata <- na.omit(testdata)
@@ -157,7 +157,7 @@ geodist <- function(x,
       if(any(!variables%in%names(preddata))){# extract variable values of raster:
         preddata <- sf::st_transform(preddata,sf::st_crs(modeldomain))
         #preddata <- sf::st_as_sf(raster::extract(modeldomain, preddata, df = TRUE, sp = TRUE))
-        preddata <- sf::st_as_sf(terra::extract(modeldomain, preddata, na.rm=FALSE,bind=TRUE))
+        preddata <- sf::st_as_sf(terra::extract(modeldomain, terra::vect(preddata), na.rm=FALSE,bind=TRUE))
 
         if(any(is.na(preddata))){
           preddata <- na.omit(preddata)
@@ -194,6 +194,7 @@ geodist <- function(x,
 
   ##### Distance to CV data:
   if(!is.null(cvfolds)){
+
     cvd <- cvdistance(x, cvfolds, cvtrain, type, variables)
     dists <- rbind(dists, cvd)
   }

diff --git a/R/nndm.R b/R/nndm.R
@@ -4,7 +4,7 @@
 #' indices to perform a NNDM LOO CV for map validation.
 #' @author Carles Milà
 #' @param tpoints sf or sfc point object. Contains the training points samples.
-#' @param modeldomain sf polygon object defining the prediction area (see Details).
+#' @param modeldomain sf polygon object or SpatRaster defining the prediction area (see Details).
 #' @param ppoints sf or sfc point object. Contains the target prediction points.
 #' Optional. Alternative to modeldomain (see Details).
 #' @param samplesize numeric. How many points in the modeldomain should be sampled as prediction points?
@@ -34,12 +34,13 @@
 #' When \emph{phi} is set to "max", nearest neighbor distance matching is performed for the entire prediction area. Euclidean distances are used for projected
 #' and non-defined CRS, great circle distances are used for geographic CRS (units in meters).
 #'
-#' The \emph{modeldomain} is a sf polygon that defines the prediction area. The function takes a regular point sample (amount defined by \emph{samplesize)} from the spatial extent.
+#' The \emph{modeldomain} is either a sf polygon that defines the prediction area, or alternatively a SpatRaster out of which a polygon,
+#' transformed into the CRS of the training points, is defined as the outline of all non-NA cells.
+#' Then, the function takes a regular point sample (amount defined by \emph{samplesize)} from the spatial extent.
 #' As an alternative use \emph{ppoints} instead of \emph{modeldomain}, if you have already defined the prediction locations (e.g. raster pixel centroids).
 #' When using either \emph{modeldomain} or \emph{ppoints}, we advise to plot the study area polygon and the training/prediction points as a previous step to ensure they are aligned.
 #'
-#' @note NNDM is a variation of LOOCV and therefore may take a long time for large training data sets.
-#' A k-fold variant will be implemented shortly.
+#' @note NNDM is a variation of LOOCV and therefore may take a long time for large training data sets. See \code{\link{knndm}} for a more efficient k-fold variant of the method.
 #' @seealso \code{\link{geodist}}, \code{\link{knndm}}
 #' @references
 #' \itemize{
@@ -97,8 +98,8 @@
 #' plot(nndm_pred)
 #'
 #' ########################################################################
-#' # Example 3: Real- world example; using a modeldomain instead of previously
-#' # sampled prediction locations
+#' # Example 3: Real- world example; using a SpatRast modeldomain instead
+#' # of previously sampled prediction locations
 #' ########################################################################
 #' \dontrun{
 #' library(sf)
@@ -112,15 +113,11 @@
 #' pts <- st_as_sf(pts,coords=c("Easting","Northing"))
 #' st_crs(pts) <- 26911
 #' studyArea <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST"))
-#' studyArea[!is.na(studyArea)] <- 1
-#' studyArea <- as.polygons(studyArea, values = FALSE, na.all = TRUE) |>
-#'     st_as_sf() |>
-#'     st_union()
 #' pts <- st_transform(pts, crs = st_crs(studyArea))
-#' plot(studyArea)
-#' plot(st_geometry(pts), add = TRUE, col = "red")
+#' terra::plot(studyArea[["DEM"]])
+#' terra::plot(vect(pts), add = T)
 #'
-#' nndm_folds <- nndm(pts, modeldomain= studyArea)
+#' nndm_folds <- nndm(pts, modeldomain = studyArea)
 #' plot(nndm_folds)
 #'
 #' #use for cross-validation:
@@ -143,12 +140,36 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000,
 
   # create sample points from modeldomain
   if(is.null(ppoints)&!is.null(modeldomain)){
+
+    # Check modeldomain is indeed a sf/SpatRaster
+    if(!any(c("sfc", "sf", "SpatRaster") %in% class(modeldomain))){
+      stop("modeldomain must be a sf/sfc object or a 'SpatRaster' object.")
+    }
+
+    # If modeldomain is a SpatRaster, transform into polygon
+    if(any(class(modeldomain) == "SpatRaster")){
+      modeldomain[!is.na(modeldomain)] <- 1
+      modeldomain <- terra::as.polygons(modeldomain, values = FALSE, na.all = TRUE) |>
+        sf::st_as_sf() |>
+        sf::st_union()
+      modeldomain <- sf::st_transform(modeldomain, crs = sf::st_crs(tpoints))
+    }
+
+    # Check modeldomain is indeed a polygon sf
+    if(!any(class(sf::st_geometry(modeldomain)) %in% c("sfc_POLYGON", "sfc_MULTIPOLYGON"))){
+      stop("modeldomain must be a sf/sfc polygon object.")
+    }
+
+    # Check whether modeldomain has the same crs as tpoints
     if(!identical(sf::st_crs(tpoints), sf::st_crs(modeldomain))){
       stop("tpoints and modeldomain must have the same CRS")
     }
+
+    # We sample
     message(paste0(samplesize, " prediction points are sampled from the modeldomain"))
     ppoints <- sf::st_sample(x = modeldomain, size = samplesize, type = sampling)
     sf::st_crs(ppoints) <- sf::st_crs(modeldomain)
+
   }else if(!is.null(ppoints)){
     if(!identical(sf::st_crs(tpoints), sf::st_crs(ppoints))){
       stop("tpoints and ppoints must have the same CRS")
@@ -165,6 +186,9 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000,
     ppoints <- sf::st_sf(geom=ppoints)
   }
 
+  # Input data checks
+  nndm_checks(tpoints, ppoints, phi, min_train)
+
   # if phi==max calculate the range of the size area
   if(phi=="max"){
     xmin <- min(sf::st_coordinates(ppoints)[,1])
@@ -176,9 +200,6 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000,
     phi <- as.numeric(max(sf::st_distance(p)))
   }
 
-  # Input data checks
-  nndm_checks(tpoints, ppoints, phi, min_train)
-
   # Compute nearest neighbour distances between training and prediction points
   Gij <- sf::st_distance(ppoints, tpoints)
   units(Gij) <- NULL
@@ -238,8 +259,8 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000,
 nndm_checks <- function(tpoints, ppoints, phi, min_train){
 
   # Check for valid range of phi
-  if(phi < 0 | !is.numeric(phi)){
-    stop("phi must be positive.")
+  if(phi < 0 | (!is.numeric(phi) & phi!= "max")){
+    stop("phi must be positive or set to 'max'.")
   }
 
   # min_train must be a single positive numeric

diff --git a/R/normalize_DI.R b/R/normalize_DI.R
@@ -0,0 +1,54 @@
+#' Normalize DI values
+#' @description
+#' The DI is normalized by the DI threshold to allow for a more straightforwrd interpretation.
+#' A value in the resulting DI larger 1 means that the data are more dissimilar than what has been observed during cross-validation.
+#' The returned threshold is adjusted accordingly and is, as a consequence, 1.
+#' @param AOA An AOA object
+#' @return An object of class \code{aoa}
+#' @seealso \code{\link{aoa}}
+#' @examples
+#' \dontrun{
+#' library(sf)
+#' library(terra)
+#' library(caret)
+#'
+#' # prepare sample data:
+#' dat <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST"))
+#' dat <- aggregate(dat[,c("VW","Easting","Northing")],by=list(as.character(dat$SOURCEID)),mean)
+#' pts <- st_as_sf(dat,coords=c("Easting","Northing"))
+#' pts$ID <- 1:nrow(pts)
+#' set.seed(100)
+#' pts <- pts[1:30,]
+#' studyArea <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST"))[[1:8]]
+#' trainDat <- extract(studyArea,pts,na.rm=FALSE)
+#' trainDat <- merge(trainDat,pts,by.x="ID",by.y="ID")
+#'
+#' # train a model:
+#' set.seed(100)
+#' variables <- c("DEM","NDRE.Sd","TWI")
+#' model <- train(trainDat[,which(names(trainDat)%in%variables)],
+#' trainDat$VW, method="rf", importance=TRUE, tuneLength=1,
+#' trControl=trainControl(method="cv",number=5,savePredictions=T))
+#'
+#' #...then calculate the AOA of the trained model for the study area:
+#' AOA <- aoa(studyArea, model)
+#' plot(AOA)
+#' plot(AOA$DI)
+#'
+#' #... then normalize the DI
+#' DI_norm <- normalize_DI(AOA)
+#' plot(DI_norm)
+#' plot(DI_norm$DI)
+#'
+#' }
+#' @export normalize_DI
+#' @aliases normalize_DI
+
+
+normalize_DI <- function(AOA) {
+  AOA$DI <- AOA$DI/AOA$parameters$threshold
+  AOA$parameters$trainDI <- AOA$parameters$trainDI/AOA$parameters$threshold
+  AOA$parameters$threshold <- AOA$parameters$threshold/AOA$parameters$threshold
+  return(AOA)
+}
+