From 6f68412abc3bf86598991e8f6c4fce7ed6977646 Mon Sep 17 00:00:00 2001 From: carlesmila Date: Mon, 11 Mar 2024 12:07:15 +0100 Subject: [PATCH 01/14] tests global_validation --- tests/testthat/test-global_validation.R | 50 +++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/testthat/test-global_validation.R b/tests/testthat/test-global_validation.R index e69de29b..95165f9a 100644 --- a/tests/testthat/test-global_validation.R +++ b/tests/testthat/test-global_validation.R @@ -0,0 +1,50 @@ +test_that("global_validation correctly handles missing predictions", { + + data("iris") + set.seed(123) + ctrl <- caret::trainControl(method="cv") + model <- caret::train(iris[,c("Sepal.Width", "Petal.Length", "Petal.Width")], + iris[,c("Sepal.Length")], + method="rf", trControl=ctrl, ntree=10) + expect_error(global_validation(model)) +}) + +test_that("global_validation works with caret regression", { + + data("iris") + set.seed(123) + ctrl <- caret::trainControl(method="cv", savePredictions="final") + model <- caret::train(iris[,c("Sepal.Width", "Petal.Length", "Petal.Width")], + iris[,c("Sepal.Length")], + method="rf", trControl=ctrl, ntree=10) + expect_equal(global_validation(model), + c("RMSE"=0.3307870, "Rsquared"=0.8400544, "MAE"=0.2621827)) + +}) + +test_that("global_validation works with caret classification", { + + data("iris") + set.seed(123) + ctrl <- caret::trainControl(method="cv", savePredictions="final") + model <- caret::train(iris[,c("Sepal.Width", "Petal.Length", "Petal.Width", "Sepal.Length")], + iris[,c("Species")], + method="rf", trControl=ctrl, ntree=10) + expect_equal(global_validation(model)[1:2], + c("Accuracy"=0.96, "Kappa"=0.94)) + +}) + +test_that("global_validation works with CreateSpacetimeFolds", { + + data("iris") + set.seed(123) + iris$folds <- sample(rep(1:10, ceiling(nrow(iris)/10)), nrow(iris)) + indices <- CreateSpacetimeFolds(iris, "folds") + ctrl <- caret::trainControl(method="cv", savePredictions="final", index = indices$index) + model <- caret::train(iris[,c("Sepal.Width", "Petal.Length", "Petal.Width", "Sepal.Length")], + iris[,c("Species")], + method="rf", trControl=ctrl, ntree=10) + expect_equal(global_validation(model)[1:2], + c("Accuracy"=0.96, "Kappa"=0.94)) +}) From a8630559aab827d092d94f90980bce6e4bcf234b Mon Sep 17 00:00:00 2001 From: HannaMeyer Date: Mon, 11 Mar 2024 13:14:03 +0100 Subject: [PATCH 02/14] add missing packages --- tests/testthat/test-DItoErrormetric.R | 10 +++++----- tests/testthat/test-aoa.R | 16 ++++++++-------- tests/testthat/test_trainDI.R | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/testthat/test-DItoErrormetric.R b/tests/testthat/test-DItoErrormetric.R index b1bfc2ff..b7a6c15f 100644 --- a/tests/testthat/test-DItoErrormetric.R +++ b/tests/testthat/test-DItoErrormetric.R @@ -11,7 +11,7 @@ test_that("DItoErrormetric works in default settings", { AOA <- CAST::aoa(predictors, model) # DI ~ error - errormodel_DI <- DItoErrormetric(model, AOA, variable = "DI") + errormodel_DI <- CAST::DItoErrormetric(model, AOA, variable = "DI") expected_error_DI = terra::predict(AOA$DI, errormodel_DI) @@ -21,7 +21,7 @@ test_that("DItoErrormetric works in default settings", { expect_equal(round(as.numeric(summary(errormodel_DI$fitted.values)),2), c(14.25, 14.34, 15.21, 17.23, 18.70, 27.46)) # test model predictions - expect_equal(as.vector( summary(values(expected_error_DI))), + expect_equal(as.vector( summary(terra::values(expected_error_DI))), c("Min. 
:14.26 ", "1st Qu.:27.46 ", "Median :27.46 ", "Mean :26.81 ", "3rd Qu.:27.46 ","Max. :27.47 ", "NA's :17678 ")) @@ -38,7 +38,7 @@ test_that("DItoErrormetric works in with LPD", { trControl = caret::trainControl(method = "cv", savePredictions = TRUE)) AOA <- CAST::aoa(predictors, model, LPD = TRUE, maxLPD = 1) - errormodel_LPD <- DItoErrormetric(model, AOA, variable = "LPD") + errormodel_LPD <- CAST::DItoErrormetric(model, AOA, variable = "LPD") expected_error_LPD = terra::predict(AOA$LPD, errormodel_LPD) @@ -46,7 +46,7 @@ test_that("DItoErrormetric works in with LPD", { expect_equal(round(as.numeric(summary(errormodel_LPD$fitted.values)),2), c(16.36, 16.36, 16.36, 16.36, 16.36, 16.36)) # test model predictions - expect_equal(as.vector(summary(values(expected_error_LPD))), + expect_equal(as.vector(summary(terra::values(expected_error_LPD))), c("Min. :16.36 ", "1st Qu.:16.36 ", "Median :16.36 ", "Mean :16.36 ", "3rd Qu.:16.36 ", "Max. :16.36 ", "NA's :17678 ")) @@ -74,7 +74,7 @@ test_that("DItoErrormetric works for multiCV", { expect_equal(round(as.numeric(summary(errormodel_DI$fitted.values)),2), c(12.53, 17.21, 26.80, 26.19, 35.28, 35.30)) # test model predictions - expect_equal(as.vector( summary(values(expected_error_DI))), + expect_equal(as.vector( summary(terra::values(expected_error_DI))), c("Min. :13.11 ", "1st Qu.:32.58 ", "Median :35.05 ", "Mean :32.54 ", "3rd Qu.:35.30 ", "Max. :35.30 ", "NA's :17678 ")) diff --git a/tests/testthat/test-aoa.R b/tests/testthat/test-aoa.R index 4ce51728..783df669 100644 --- a/tests/testthat/test-aoa.R +++ b/tests/testthat/test-aoa.R @@ -37,7 +37,7 @@ test_that("AOA works in default: used with raster data and a trained model", { #test threshold: expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.38986) #test number of pixels within AOA: - expect_equal(sum(values(AOA$AOA)==1,na.rm=TRUE), 2936) + expect_equal(sum(terra::values(AOA$AOA)==1,na.rm=TRUE), 2936) # test trainDI expect_equal(AOA$parameters$trainDI, c(0.09043580, 0.14046341, 0.16584582, 0.57617177, 0.26840303, 0.14353894, 0.19768329, 0.24022059, 0.06832037, 0.29150668, @@ -46,7 +46,7 @@ test_that("AOA works in default: used with raster data and a trained model", { 0.23604264, 0.20388725, 0.91513568, 0.09558666, 0.14046341, 0.16214832, 0.37107762, 0.16214832, 0.18471625, 0.12344463)) # test summary statistics of the DI - expect_equal(as.vector(summary(values(AOA$DI))), + expect_equal(as.vector(summary(terra::values(AOA$DI))), c("Min. :0.0000 ", "1st Qu.:0.1329 ", "Median :0.2052 ", "Mean :0.2858 ", "3rd Qu.:0.3815 ", "Max. :4.4485 ", "NA's :1993 ")) @@ -60,9 +60,9 @@ test_that("AOA works without a trained model", { #test threshold: expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.52872) #test number of pixels within AOA: - expect_equal(sum(values(AOA$AOA)==1,na.rm=TRUE), 3377) + expect_equal(sum(terra::values(AOA$AOA)==1,na.rm=TRUE), 3377) # test summary statistics of the DI - expect_equal(as.vector(summary(values(AOA$DI))), + expect_equal(as.vector(summary(terra::values(AOA$DI))), c("Min. :0.0000 ", "1st Qu.:0.1759 ", "Median :0.2642 ", "Mean :0.3109 ", "3rd Qu.:0.4051 ", "Max. 
:2.6631 ", "NA's :1993 ")) @@ -76,7 +76,7 @@ test_that("AOA (including LPD) works with raster data and a trained model", { #test threshold: expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.38986) #test number of pixels within AOA: - expect_equal(sum(values(AOA$AOA)==1,na.rm=TRUE), 2936) + expect_equal(sum(terra::values(AOA$AOA)==1,na.rm=TRUE), 2936) #test trainLPD expect_equal(AOA$parameters$trainLPD, c(3, 4, 6, 0, 7, 6, 2, 1, 5, 3, @@ -85,7 +85,7 @@ test_that("AOA (including LPD) works with raster data and a trained model", { 3, 4, 0, 2, 3, 6, 1, 7, 3, 2)) # test summary statistics of the DI - expect_equal(as.vector(summary(values(AOA$DI))), + expect_equal(as.vector(summary(terra::values(AOA$DI))), c("Min. :0.0000 ", "1st Qu.:0.1329 ", "Median :0.2052 ", "Mean :0.2858 ", "3rd Qu.:0.3815 ", "Max. :4.4485 ", "NA's :1993 ")) @@ -99,7 +99,7 @@ test_that("AOA (inluding LPD) works without a trained model", { #test threshold: expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.52872) #test number of pixels within AOA: - expect_equal(sum(values(AOA$AOA)==1,na.rm=TRUE), 3377) + expect_equal(sum(terra::values(AOA$AOA)==1,na.rm=TRUE), 3377) # test trainLPD expect_equal(AOA$parameters$trainLPD, c(7, 9, 12, 1, 12, 12, 4, 2, 8, 10, @@ -108,7 +108,7 @@ test_that("AOA (inluding LPD) works without a trained model", { 6, 5, 0, 5, 9, 8, 4, 11, 3,2)) # test summary statistics of the DI - expect_equal(as.vector(summary(values(AOA$DI))), + expect_equal(as.vector(summary(terra::values(AOA$DI))), c("Min. :0.0000 ", "1st Qu.:0.1759 ", "Median :0.2642 ", "Mean :0.3109 ", "3rd Qu.:0.4051 ", "Max. :2.6631 ", "NA's :1993 ")) diff --git a/tests/testthat/test_trainDI.R b/tests/testthat/test_trainDI.R index 8a50b0c9..990906c8 100644 --- a/tests/testthat/test_trainDI.R +++ b/tests/testthat/test_trainDI.R @@ -13,9 +13,9 @@ loaddata <- function() { # train a model: set.seed(100) variables <- c("DEM","NDRE.Sd","TWI") - model <- train(trainDat[,which(names(trainDat)%in%variables)], + model <- caret::train(trainDat[,which(names(trainDat)%in%variables)], trainDat$VW, method="rf", importance=TRUE, tuneLength=1, - trControl=trainControl(method="cv",number=5,savePredictions=T)) + trControl=caret::trainControl(method="cv",number=5,savePredictions=T)) data <- list( From 5d3876b3a0d26cc1ed9a3dc3ae1adfeacb84302f Mon Sep 17 00:00:00 2001 From: HannaMeyer Date: Mon, 11 Mar 2024 13:22:24 +0100 Subject: [PATCH 03/14] remove duplicated print of LPD --- R/print.R | 5 ----- 1 file changed, 5 deletions(-) diff --git a/R/print.R b/R/print.R index 187ac7cb..b123d3ba 100644 --- a/R/print.R +++ b/R/print.R @@ -37,11 +37,6 @@ show.trainDI = function(x, ...){ print.aoa = function(x, ...){ cat("DI:\n") print(x$DI) - - if ("LPD" %in% names(x)) { - cat("LPD:\n") - print(x$LPD) - } if ("LPD" %in% names(x)) { cat("LPD:\n") From 30f90eb2bd41bba759cc96a70af8299bb13bb8e8 Mon Sep 17 00:00:00 2001 From: fab-scm Date: Mon, 11 Mar 2024 13:38:42 +0100 Subject: [PATCH 04/14] add `verbose` argument to `aoa` and `trainLPD` --- R/aoa.R | 44 ++++++++++++++++++++++++++++++-------------- R/trainDI.R | 40 +++++++++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 27 deletions(-) diff --git a/R/aoa.R b/R/aoa.R index ac2482e1..a95d3ee7 100644 --- a/R/aoa.R +++ b/R/aoa.R @@ -146,7 +146,8 @@ aoa <- function(newdata, method="L2", useWeight=TRUE, LPD = FALSE, - maxLPD = 1) { + maxLPD = 1, + verbose = TRUE) { # handling of different raster formats as_stars <- FALSE @@ -200,8 +201,10 @@ aoa <- function(newdata, # if not provided, 
compute train DI if(!inherits(trainDI, "trainDI")) { - message("No trainDI provided.") - trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, LPD) + if (verbose) { + message("No trainDI provided.") + } + trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, LPD, verbose) } if (calc_LPD == TRUE) { @@ -302,11 +305,15 @@ aoa <- function(newdata, } if (calc_LPD == TRUE) { - message("Computing DI and LPD of new data...") + if (verbose) { + message("Computing DI and LPD of new data...") + } - pb <- txtProgressBar(min = 0, - max = nrow(newdataCC), - style = 3) + if (verbose) { + pb <- txtProgressBar(min = 0, + max = nrow(newdataCC), + style = 3) + } DI_out <- rep(NA, nrow(newdata)) LPD_out <- rep(NA, nrow(newdata)) @@ -317,24 +324,31 @@ aoa <- function(newdata, DI_out[okrows[i]] <- knnDI[1] LPD_out[okrows[i]] <- sum(knnDI < trainDI$threshold) - setTxtProgressBar(pb, i) + if (verbose) { + setTxtProgressBar(pb, i) + } } - close(pb) + if (verbose) { + close(pb) + } # set maxLPD to max of LPD_out if realMaxLPD <- max(LPD_out, na.rm = T) if (maxLPD > realMaxLPD) { - if (inherits(maxLPD, c("numeric", "integer"))) { + if (inherits(maxLPD, c("numeric", "integer")) && verbose) { message("Your specified maxLPD is bigger than the real maxLPD of you predictor data.") } - message(paste("maxLPD is set to", realMaxLPD)) + if (verbose) { + message(paste("maxLPD is set to", realMaxLPD)) + } trainDI$maxLPD <- realMaxLPD } } - - message("Computing AOA...") + if (verbose) { + message("Computing AOA...") + } #### Create Mask for AOA and return statistics if (inherits(out, "SpatRaster")) { @@ -389,7 +403,9 @@ aoa <- function(newdata, result$LPD <- LPD } - message("Finished!") + if (verbose) { + message("Finished!") + } class(result) <- "aoa" return(result) diff --git a/R/trainDI.R b/R/trainDI.R index ae8bc2b8..103e9ebd 100644 --- a/R/trainDI.R +++ b/R/trainDI.R @@ -111,7 +111,8 @@ trainDI <- function(model = NA, CVtrain = NULL, method="L2", useWeight = TRUE, - LPD = FALSE){ + LPD = FALSE, + verbose = TRUE){ # get parameters if they are not provided in function call----- if(is.null(train)){train = aoa_get_train(model)} @@ -192,10 +193,12 @@ trainDI <- function(model = NA, S_inv <- MASS::ginv(S) } - message("Computing DI of training data...") - pb <- txtProgressBar(min = 0, - max = nrow(train), - style = 3) + if (verbose) { + message("Computing DI of training data...") + pb <- txtProgressBar(min = 0, + max = nrow(train), + style = 3) + } for(i in seq(nrow(train))){ @@ -228,9 +231,14 @@ trainDI <- function(model = NA, }else{ trainDist_min <- append(trainDist_min, min(trainDist, na.rm = TRUE)) } - setTxtProgressBar(pb, i) + if (verbose) { + setTxtProgressBar(pb, i) + } + } + + if (verbose) { + close(pb) } - close(pb) trainDist_avrgmean <- mean(trainDist_avrg,na.rm=TRUE) @@ -254,10 +262,12 @@ trainDI <- function(model = NA, # calculate trainLPD and avrgLPD according to the CV folds if (LPD == TRUE) { - message("Computing LPD of training data...") - pb <- txtProgressBar(min = 0, - max = nrow(train), - style = 3) + if (verbose) { + message("Computing LPD of training data...") + pb <- txtProgressBar(min = 0, + max = nrow(train), + style = 3) + } trainLPD <- c() for (j in seq(nrow(train))) { @@ -287,10 +297,14 @@ trainDI <- function(model = NA, } else { trainLPD <- append(trainLPD, sum(DItrainDist[,1] < thres, na.rm = TRUE)) } - setTxtProgressBar(pb, j) + if (verbose) { + setTxtProgressBar(pb, j) + } } - close(pb) + if (verbose) { + close(pb) + } # Average 
LPD in trainData avrgLPD <- round(mean(trainLPD)) From e793c24369ef3dec0bcf327cb9bbc9489b7f0f32 Mon Sep 17 00:00:00 2001 From: carlesmila Date: Mon, 11 Mar 2024 14:03:32 +0100 Subject: [PATCH 05/14] NNDM tests --- R/nndm.R | 22 ++++++-- tests/testthat/test-nndm.R | 110 +++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 5 deletions(-) diff --git a/R/nndm.R b/R/nndm.R index 6c48b7a6..91867198 100644 --- a/R/nndm.R +++ b/R/nndm.R @@ -143,12 +143,24 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000, # create sample points from modeldomain if(is.null(ppoints)&!is.null(modeldomain)){ + + # Check modeldomain is indeed a polygon + if(!any(c("sfc", "sf") %in% class(modeldomain))){ + stop("modeldomain must be a sf/sfc object.") + }else if(!any(class(sf::st_geometry(modeldomain)) %in% c("sfc_POLYGON", "sfc_MULTIPOLYGON"))){ + stop("modeldomain must be a sf/sfc polygon object.") + } + + # Check whether modeldomain has the same crs as tpoints if(!identical(sf::st_crs(tpoints), sf::st_crs(modeldomain))){ stop("tpoints and modeldomain must have the same CRS") } + + # We sample message(paste0(samplesize, " prediction points are sampled from the modeldomain")) ppoints <- sf::st_sample(x = modeldomain, size = samplesize, type = sampling) sf::st_crs(ppoints) <- sf::st_crs(modeldomain) + }else if(!is.null(ppoints)){ if(!identical(sf::st_crs(tpoints), sf::st_crs(ppoints))){ stop("tpoints and ppoints must have the same CRS") @@ -165,6 +177,9 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000, ppoints <- sf::st_sf(geom=ppoints) } + # Input data checks + nndm_checks(tpoints, ppoints, phi, min_train) + # if phi==max calculate the range of the size area if(phi=="max"){ xmin <- min(sf::st_coordinates(ppoints)[,1]) @@ -176,9 +191,6 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000, phi <- as.numeric(max(sf::st_distance(p))) } - # Input data checks - nndm_checks(tpoints, ppoints, phi, min_train) - # Compute nearest neighbour distances between training and prediction points Gij <- sf::st_distance(ppoints, tpoints) units(Gij) <- NULL @@ -238,8 +250,8 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000, nndm_checks <- function(tpoints, ppoints, phi, min_train){ # Check for valid range of phi - if(phi < 0 | !is.numeric(phi)){ - stop("phi must be positive.") + if(phi < 0 | (!is.numeric(phi) & phi!= "max")){ + stop("phi must be positive or set to 'max'.") } # min_train must be a single positive numeric diff --git a/tests/testthat/test-nndm.R b/tests/testthat/test-nndm.R index e69de29b..3b69d4de 100644 --- a/tests/testthat/test-nndm.R +++ b/tests/testthat/test-nndm.R @@ -0,0 +1,110 @@ +test_that("Valid range of phi", { + + set.seed(1234) + poly <- sf::st_polygon(list(matrix(c(0,0,0,50,50,50,50,0,0,0), ncol=2, + byrow=TRUE))) + poly_sfc <- sf::st_sfc(poly) + tpoints_sfc <- sf::st_sample(poly_sfc, 50, type = "random") + ppoints_sfc <- sf::st_sample(poly_sfc, 50, type = "regular") + + expect_error(nndm(tpoints_sfc, ppoints = ppoints_sfc, phi = -1), + "phi must be positive or set to 'max'.") +}) + +test_that("NNDM detects wrong data and geometry types", { + + set.seed(1234) + poly <- sf::st_polygon(list(matrix(c(0,0,0,50,50,50,50,0,0,0), ncol=2, + byrow=TRUE))) + poly_sfc <- sf::st_sfc(poly) + tpoints_sfc <- sf::st_sample(poly_sfc, 50, type = "random") + ppoints_sfc <- sf::st_sample(poly_sfc, 50, type = "regular") + + # tpoints + expect_error(suppressWarnings(nndm(1, ppoints = 
ppoints_sfc)), + "tpoints must be a sf/sfc object.") + expect_error(nndm(poly, ppoints = ppoints_sfc), + "tpoints must be a sf/sfc object.") + expect_error(nndm(sf::st_sfc(poly), ppoints = ppoints_sfc), + "tpoints must be a sf/sfc point object.") + # ppoints + expect_error(suppressWarnings(nndm(tpoints_sfc, ppoints = 1)), + "ppoints must be a sf/sfc object.") + expect_error(nndm(tpoints_sfc, ppoints = poly), + "ppoints must be a sf/sfc object.") + expect_error(nndm(tpoints_sfc, ppoints = poly_sfc), + "ppoints must be a sf/sfc point object.") + + # model domain + expect_error(suppressWarnings(nndm(tpoints_sfc, modeldomain = 1)), + "modeldomain must be a sf/sfc object.") + expect_error(nndm(tpoints_sfc, modeldomain = ppoints_sfc), + "modeldomain must be a sf/sfc polygon object.") +}) + +test_that("NNDM detects different CRS in inputs", { + + set.seed(1234) + poly <- sf::st_polygon(list(matrix(c(0,0,0,50,50,50,50,0,0,0), ncol=2, + byrow=TRUE))) + poly_sfc <- sf::st_sfc(poly) + tpoints_sfc <- sf::st_sample(poly_sfc, 50, type = "random") + ppoints_sfc <- sf::st_sample(poly_sfc, 50, type = "regular") + + tpoints_sfc_4326 <- sf::st_set_crs(tpoints_sfc, 4326) + tpoints_sfc_3857 <- sf::st_set_crs(tpoints_sfc, 3857) + ppoints_sfc_4326 <- sf::st_set_crs(ppoints_sfc, 4326) + ppoints_sfc_3857 <- sf::st_set_crs(ppoints_sfc, 3857) + poly_sfc_4326 <- sf::st_set_crs(poly_sfc, 4326) + + # tests + expect_error(nndm(tpoints_sfc_3857, ppoints = ppoints_sfc), + "tpoints and ppoints must have the same CRS") + expect_error(nndm(tpoints_sfc_3857, modeldomain = poly_sfc_4326), + "tpoints and modeldomain must have the same CRS") +}) + + + +test_that("NNDM yields the expected results for all data types", { + + set.seed(1234) + poly <- sf::st_polygon(list(matrix(c(0,0,0,50,50,50,50,0,0,0), ncol=2, + byrow=TRUE))) + poly_sfc <- sf::st_sfc(poly) + tpoints_sfc <- sf::st_sample(poly_sfc, 50, type = "random") + ppoints_sfc <- sf::st_sample(poly_sfc, 50, type = "regular") + + # tpoints, ppoints + expect_equal(as.numeric(nndm(tpoints_sfc, ppoints = tpoints_sfc)$Gjstar[1]), 3.7265881) + # tpoints, modeldomain + expect_equal(as.numeric(nndm(tpoints_sfc, modeldomain = poly_sfc)$Gjstar[5]), 4.9417614) + # change phi + expect_equal(as.numeric(nndm(tpoints_sfc, ppoints = tpoints_sfc, phi = 10)$Gjstar[10]), 4.8651321) + # change min_train + expect_equal(as.numeric(nndm(tpoints_sfc, ppoints = tpoints_sfc, phi = 20, min_train = 0.2)$Gjstar[15]), 3.466861) + # length checks + expect_equal(length(nndm(tpoints_sfc, ppoints = tpoints_sfc)$Gjstar), length(tpoints_sfc)) + expect_equal(length(nndm(tpoints_sfc, ppoints = tpoints_sfc)$Gi), length(tpoints_sfc)) + expect_gt(length(nndm(tpoints_sfc, modeldomain = poly_sfc)$Gij), 900) +}) + +test_that("NNDM yields the expected results for all CRS", { + + set.seed(1234) + poly <- sf::st_polygon(list(matrix(c(0,0,0,50,50,50,50,0,0,0), ncol=2, + byrow=TRUE))) + poly_sfc <- sf::st_sfc(poly) + tpoints_sfc <- sf::st_sample(poly_sfc, 50, type = "random") + ppoints_sfc <- sf::st_sample(poly_sfc, 50, type = "regular") + + # Projected + tpoints_3857 <- sf::st_set_crs(tpoints_sfc, 3857) + ppoints_3857 <- sf::st_set_crs(ppoints_sfc, 3857) + expect_equal(as.numeric(nndm(tpoints_3857, ppoints = ppoints_3857, phi = 10)$Gjstar[20]), 3.2921498) + + # Geographic + tpoints_sf_4326 <- sf::st_set_crs(tpoints_sfc, 4326) + ppoints_sf_4326 <- sf::st_set_crs(ppoints_sfc, 4326) + expect_equal(as.numeric(nndm(tpoints_sf_4326, ppoints = ppoints_sf_4326, phi = 1000000)$Gjstar[20], 4), 355614.94) +}) From 
525a876c19160e8e9a6a02b3a0ba7cdb33f06540 Mon Sep 17 00:00:00 2001 From: HannaMeyer Date: Mon, 11 Mar 2024 14:23:02 +0100 Subject: [PATCH 06/14] new feature: normalize_DI --- R/normalize_DI.R | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 R/normalize_DI.R diff --git a/R/normalize_DI.R b/R/normalize_DI.R new file mode 100644 index 00000000..7afcbb6d --- /dev/null +++ b/R/normalize_DI.R @@ -0,0 +1,54 @@ +#' Normalize DI values +#' @description +#' The DI is normalized by the DI threshold to allow for a more straightforwrd interpretation. +#' A value in the resulting DI larger 1 means that the data are more dissimilar than what has been observed during cross-validation. +#' The returned threshold is adjusted accordingly and is, as a consequence, 1. +#' @param AOA An AOA object +#' @return An object of class \code{aoa} +#' @seealso \code{\link{aoa}} +#' @examples +#' \dontrun{ +#' library(sf) +#' library(terra) +#' library(caret) +#' +#' # prepare sample data: +#' dat <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST")) +#' dat <- aggregate(dat[,c("VW","Easting","Northing")],by=list(as.character(dat$SOURCEID)),mean) +#' pts <- st_as_sf(dat,coords=c("Easting","Northing")) +#' pts$ID <- 1:nrow(pts) +#' set.seed(100) +#' pts <- pts[1:30,] +#' studyArea <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST"))[[1:8]] +#' trainDat <- extract(studyArea,pts,na.rm=FALSE) +#' trainDat <- merge(trainDat,pts,by.x="ID",by.y="ID") +#' +#' # train a model: +#' set.seed(100) +#' variables <- c("DEM","NDRE.Sd","TWI") +#' model <- train(trainDat[,which(names(trainDat)%in%variables)], +#' trainDat$VW, method="rf", importance=TRUE, tuneLength=1, +#' trControl=trainControl(method="cv",number=5,savePredictions=T)) +#' +#' #...then calculate the AOA of the trained model for the study area: +#' AOA <- aoa(studyArea, model) +#' plot(AOA) +#' plot(AOA$DI) +#' +#' #... then normalize the DI +#' DI_norm <- normalize_DI(AOA) +#' plot(DI_norm) +#' plot(DI_norm$DI) +#' +#' } +#' @export normalize_DI +#' @aliases normalize_DI + + +normalize_DI <- function(AOA) { + AOA$DI <- AOA$DI/AOA$parameters$threshold + AOA$parameters$trainDI <- AOA$parameters$trainDI/AOA$parameters$threshold + AOA$parameters$threshold <- AOA$parameters$threshold/AOA$parameters$threshold + return(AOA) +} + From 47e06d38981b2a73b4fea0abbc06fdf098470df6 Mon Sep 17 00:00:00 2001 From: HannaMeyer Date: Mon, 11 Mar 2024 14:38:56 +0100 Subject: [PATCH 07/14] documentation of parameter verbose --- R/aoa.R | 1 + R/trainDI.R | 1 + man/aoa.Rd | 3 ++- man/trainDI.Rd | 3 ++- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/aoa.R b/R/aoa.R index a95d3ee7..8c3fc606 100644 --- a/R/aoa.R +++ b/R/aoa.R @@ -28,6 +28,7 @@ #' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model? #' @param LPD Logical. Indicates whether the local point density should be calculated or not. #' @param maxLPD numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}}) +#' @param verbose Logical. Print progress or not? 
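# A minimal usage sketch of the new `verbose` argument added above, assuming the
# splotdata example data shipped with CAST (the same setup used in the package
# tests). With verbose = FALSE the progress bars and messages of trainDI()/aoa()
# are suppressed, which is convenient in scripts and automated tests.
library(CAST)
library(caret)
library(terra)

data(splotdata)
splotdata <- sf::st_drop_geometry(splotdata)
predictors <- rast(system.file("extdata", "predictors_chile.tif", package = "CAST"))

set.seed(100)
model <- train(splotdata[, names(predictors)], splotdata$Species_richness,
               ntree = 20,
               trControl = trainControl(method = "cv", savePredictions = TRUE))

AOA_quiet <- aoa(predictors, model, LPD = TRUE, verbose = FALSE)  # silent run
AOA_loud  <- aoa(predictors, model, LPD = TRUE)                   # default: verbose = TRUE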
#' @details The Dissimilarity Index (DI), the Local Data Point Density (LPD) and the corresponding Area of Applicability (AOA) are calculated. #' If variables are factors, dummy variables are created prior to weighting and distance calculation. #' diff --git a/R/trainDI.R b/R/trainDI.R index 103e9ebd..7b3aa6ec 100644 --- a/R/trainDI.R +++ b/R/trainDI.R @@ -24,6 +24,7 @@ #' @param method Character. Method used for distance calculation. Currently euclidean distance (L2) and Mahalanobis distance (MD) are implemented but only L2 is tested. Note that MD takes considerably longer. #' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model? #' @param LPD Logical. Indicates whether the local point density should be calculated or not. +#' @param verbose Logical. Print progress or not? #' #' @seealso \code{\link{aoa}} #' @importFrom graphics boxplot diff --git a/man/aoa.Rd b/man/aoa.Rd index 49c2b5b1..e326d103 100644 --- a/man/aoa.Rd +++ b/man/aoa.Rd @@ -16,7 +16,8 @@ aoa( method = "L2", useWeight = TRUE, LPD = FALSE, - maxLPD = 1 + maxLPD = 1, + verbose = TRUE ) } \arguments{ diff --git a/man/trainDI.Rd b/man/trainDI.Rd index aaac35e0..c4fccaab 100644 --- a/man/trainDI.Rd +++ b/man/trainDI.Rd @@ -13,7 +13,8 @@ trainDI( CVtrain = NULL, method = "L2", useWeight = TRUE, - LPD = FALSE + LPD = FALSE, + verbose = TRUE ) } \arguments{ From 9b25ee194f801914f4987e0d760effa36e2aa7bc Mon Sep 17 00:00:00 2001 From: HannaMeyer Date: Mon, 11 Mar 2024 14:39:13 +0100 Subject: [PATCH 08/14] documentation of normalize_DI --- man/normalize_DI.Rd | 58 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 man/normalize_DI.Rd diff --git a/man/normalize_DI.Rd b/man/normalize_DI.Rd new file mode 100644 index 00000000..53e035e5 --- /dev/null +++ b/man/normalize_DI.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/normalize_DI.R +\name{normalize_DI} +\alias{normalize_DI} +\title{Normalize DI values} +\usage{ +normalize_DI(AOA) +} +\arguments{ +\item{AOA}{An AOA object} +} +\value{ +An object of class \code{aoa} +} +\description{ +The DI is normalized by the DI threshold to allow for a more straightforwrd interpretation. +A value in the resulting DI larger 1 means that the data are more dissimilar than what has been observed during cross-validation. +The returned threshold is adjusted accordingly and is, as a consequence, 1. +} +\examples{ +\dontrun{ +library(sf) +library(terra) +library(caret) + +# prepare sample data: +dat <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST")) +dat <- aggregate(dat[,c("VW","Easting","Northing")],by=list(as.character(dat$SOURCEID)),mean) +pts <- st_as_sf(dat,coords=c("Easting","Northing")) +pts$ID <- 1:nrow(pts) +set.seed(100) +pts <- pts[1:30,] +studyArea <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST"))[[1:8]] +trainDat <- extract(studyArea,pts,na.rm=FALSE) +trainDat <- merge(trainDat,pts,by.x="ID",by.y="ID") + +# train a model: +set.seed(100) +variables <- c("DEM","NDRE.Sd","TWI") +model <- train(trainDat[,which(names(trainDat)\%in\%variables)], +trainDat$VW, method="rf", importance=TRUE, tuneLength=1, +trControl=trainControl(method="cv",number=5,savePredictions=T)) + +#...then calculate the AOA of the trained model for the study area: +AOA <- aoa(studyArea, model) +plot(AOA) +plot(AOA$DI) + +#... 
then normalize the DI +DI_norm <- normalize_DI(AOA) +plot(DI_norm) +plot(DI_norm$DI) + +} +} +\seealso{ +\code{\link{aoa}} +} From 265dfc3f8e0a18e178d71c07ec045e4d8426ac66 Mon Sep 17 00:00:00 2001 From: HannaMeyer Date: Mon, 11 Mar 2024 14:39:41 +0100 Subject: [PATCH 09/14] documentation of normalize_DI --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index b9037313..f5a43e10 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(geodist) export(global_validation) export(knndm) export(nndm) +export(normalize_DI) export(plot_ffs) export(plot_geodist) export(show.aoa) From cad15459b9e2316057fef896fc8846d1d5da017d Mon Sep 17 00:00:00 2001 From: JanLinnenbrink Date: Mon, 11 Mar 2024 15:07:19 +0100 Subject: [PATCH 10/14] tests for geodist added --- R/geodist.R | 5 +- tests/testthat/test-geodist.R | 217 ++++++++++++++++++++++++++++++++++ 2 files changed, 220 insertions(+), 2 deletions(-) diff --git a/R/geodist.R b/R/geodist.R index 5631c3f7..55978e2f 100644 --- a/R/geodist.R +++ b/R/geodist.R @@ -143,7 +143,7 @@ geodist <- function(x, if(any(!variables%in%names(testdata))){# extract variable values of raster: testdata <- sf::st_transform(testdata,sf::st_crs(modeldomain)) #testdata <- sf::st_as_sf(raster::extract(modeldomain, testdata, df = TRUE, sp = TRUE)) - testdata <- sf::st_as_sf(terra::extract(modeldomain, testdata, na.rm=FALSE,bind=TRUE)) + testdata <- sf::st_as_sf(terra::extract(modeldomain, terra::vect(testdata), na.rm=FALSE,bind=TRUE)) if(any(is.na(testdata))){ testdata <- na.omit(testdata) @@ -157,7 +157,7 @@ geodist <- function(x, if(any(!variables%in%names(preddata))){# extract variable values of raster: preddata <- sf::st_transform(preddata,sf::st_crs(modeldomain)) #preddata <- sf::st_as_sf(raster::extract(modeldomain, preddata, df = TRUE, sp = TRUE)) - preddata <- sf::st_as_sf(terra::extract(modeldomain, preddata, na.rm=FALSE,bind=TRUE)) + preddata <- sf::st_as_sf(terra::extract(modeldomain, terra::vect(preddata), na.rm=FALSE,bind=TRUE)) if(any(is.na(preddata))){ preddata <- na.omit(preddata) @@ -194,6 +194,7 @@ geodist <- function(x, ##### Distance to CV data: if(!is.null(cvfolds)){ + cvd <- cvdistance(x, cvfolds, cvtrain, type, variables) dists <- rbind(dists, cvd) } diff --git a/tests/testthat/test-geodist.R b/tests/testthat/test-geodist.R index e69de29b..2f505179 100644 --- a/tests/testthat/test-geodist.R +++ b/tests/testthat/test-geodist.R @@ -0,0 +1,217 @@ +test_that("geodist works with points and polygon in geographic space", { + + data(splotdata) + studyArea <- rnaturalearth::ne_countries(continent = "South America", returnclass = "sf") + set.seed(1) + folds <- data.frame("folds"=sample(1:3, nrow(splotdata), replace=TRUE)) + folds <- CreateSpacetimeFolds(folds, spacevar="folds", k=3) + + dist_geo <- geodist(x=splotdata, + modeldomain=studyArea, + cvfolds=folds$indexOut, + type = "geo") + + mean_sample2sample <- round(mean(dist_geo[dist_geo$what=="sample-to-sample","dist"])) + mean_CV_distances <- round(mean(dist_geo[dist_geo$what=="CV-distances","dist"])) + # can't be tested for prediction-to-sample, which are sampled slightly different in each run + nrow_dist <- nrow(dist_geo) + + expect_equal(mean_sample2sample, 20321) + expect_equal(mean_CV_distances, 25616) + expect_equal(nrow_dist, 3410) + + +}) + +test_that("geodist works with points and polygon in feature space", { + + data(splotdata) + studyArea <- rnaturalearth::ne_countries(continent = "South America", returnclass = "sf") + set.seed(1) + folds <- 
data.frame("folds"=sample(1:3, nrow(splotdata), replace=TRUE)) + folds <- CreateSpacetimeFolds(folds, spacevar="folds", k=3) + predictors <- terra::rast(system.file("extdata","predictors_chile.tif", package="CAST")) + + dist_fspace <- geodist(x = splotdata, + modeldomain = predictors, + cvfolds=folds$indexOut, + type = "feature", + variables = c("bio_1","bio_12", "elev")) + + mean_sample2sample <- round(mean(dist_fspace[dist_fspace$what=="sample-to-sample","dist"]), 4) + mean_CV_distances <- round(mean(dist_fspace[dist_fspace$what=="CV-distances","dist"]), 4) + # can't be tested for prediction-to-sample, which are sampled slightly different in each run + + expect_equal(mean_sample2sample, 0.0843) + expect_equal(mean_CV_distances, 0.1036) + +}) + + +test_that("geodist works space with points and preddata in geographic space", { + + aoi <- sf::st_as_sfc("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0))", crs="epsg:25832") + tpoints <- sf::st_as_sfc("MULTIPOINT ((1 1), (1 2), (2 2), (2 3), (1 4), (5 4))", crs="epsg:25832") |> + sf::st_cast("POINT") + set.seed(1) + ppoints <- suppressWarnings(sf::st_sample(aoi, 20, type="regular")) |> + sf::st_set_crs("epsg:25832") + + set.seed(1) + folds <- data.frame("folds"=sample(1:3, length(tpoints), replace=TRUE)) + folds <- CreateSpacetimeFolds(folds, spacevar="folds", k=3) + + dist_geo <- geodist(x=tpoints, + modeldomain=aoi, + preddata=ppoints, + type = "geo") + + mean_sample2sample <- round(mean(dist_geo[dist_geo$what=="sample-to-sample","dist"]), 4) + mean_prediction_to_sample <- round(mean(dist_geo[dist_geo$what=="prediction-to-sample","dist"]), 4) + + expect_equal(mean_sample2sample, 1.4274) + expect_equal(mean_prediction_to_sample, 2.9402) + + +}) + + +test_that("geodist works with points and preddata in feature space", { + + aoi <- sf::st_as_sfc("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0))", crs="epsg:25832") + tpoints <- sf::st_as_sfc("MULTIPOINT ((1 1), (1 2), (2 2), (2 3), (1 4), (5 4))", crs="epsg:25832") |> + sf::st_cast("POINT") + set.seed(1) + ppoints <- suppressWarnings(sf::st_sample(aoi, 20, type="regular")) |> + sf::st_set_crs("epsg:25832") + + raster <- terra::rast(nrows=10, ncols=10, nlyrs=1, xmin=0, xmax=10, + ymin=0, ymax=10, crs="epsg:25832", vals=1:100) + + dist <- geodist(x=tpoints, + modeldomain=raster, + preddata=ppoints, + type = "feature") + + mean_sample2sample <- round(mean(dist[dist$what=="sample-to-sample","dist"]), 4) + mean_prediction_to_sample <- round(mean(dist[dist$what=="prediction-to-sample","dist"]), 4) + + expect_equal(mean_sample2sample, 0.4316) + expect_equal(mean_prediction_to_sample, 0.8328) + + +}) + + +test_that("geodist works with points and raster in geographic space", { + + tpoints <- sf::st_as_sfc("MULTIPOINT ((1 1), (1 2), (2 2), (2 3), (1 4), (5 4))", crs="epsg:25832") |> + sf::st_cast("POINT") + + raster <- terra::rast(nrows=10, ncols=10, nlyrs=1, xmin=0, xmax=10, + ymin=0, ymax=10, crs="epsg:25832", vals=1:100) + + dist <- geodist(x=tpoints, + modeldomain=raster, + type = "geo") + + mean_sample2sample <- round(mean(dist[dist$what=="sample-to-sample","dist"]), 4) + expect_equal(mean_sample2sample, 1.4274) + + +}) + + +test_that("geodist works with points and raster in feature space", { + + tpoints <- sf::st_as_sfc("MULTIPOINT ((1 1), (1 2), (2 2), (2 3), (1 4), (5 4))", crs="epsg:25832") |> + sf::st_cast("POINT") + + raster <- terra::rast(nrows=10, ncols=10, nlyrs=1, xmin=0, xmax=10, + ymin=0, ymax=10, crs="epsg:25832", vals=1:100) + + dist <- geodist(x=tpoints, + modeldomain=raster, + type = "feature") + + 
mean_sample2sample <- round(mean(dist[dist$what=="sample-to-sample","dist"]), 4) + expect_equal(mean_sample2sample, 0.4316) + + +}) + + +test_that("geodist works with points and stars raster in geographic space", { + + tpoints <- sf::st_as_sfc("MULTIPOINT ((1 1), (1 2), (2 2), (2 3), (1 4), (5 4))", crs="epsg:25832") |> + sf::st_cast("POINT") + + raster <- terra::rast(nrows=10, ncols=10, nlyrs=1, xmin=0, xmax=10, + ymin=0, ymax=10, crs="epsg:25832", vals=1:100) |> + stars::st_as_stars() + + dist <- geodist(x=tpoints, + modeldomain=raster, + type = "feature") + + mean_sample2sample <- round(mean(dist[dist$what=="sample-to-sample","dist"]), 4) + expect_equal(mean_sample2sample, 0.4316) + + +}) + + + +test_that("geodist works with points and test data in geographic space", { + + aoi <- sf::st_as_sfc("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0))", crs="epsg:25832") + tpoints <- sf::st_as_sfc("MULTIPOINT ((1 1), (1 2), (2 2), (2 3), (1 4), (5 4))", crs="epsg:25832") |> + sf::st_cast("POINT") + + set.seed(1) + test_point <- suppressWarnings(sf::st_sample(aoi, 20, type="regular")) |> + sf::st_set_crs("epsg:25832") + + dist <- geodist(x=tpoints, + modeldomain=aoi, + testdata=test_point, + type = "geo") + + mean_sample2sample <- round(mean(dist[dist$what=="sample-to-sample","dist"]), 4) + mean_test_to_sample <- round(mean(dist[dist$what=="test-to-sample","dist"]), 4) + + expect_equal(mean_sample2sample, 1.4274) + expect_equal(mean_test_to_sample, 2.9402) + + + +}) + + +test_that("geodist works with points and test data in feature space", { + + aoi <- sf::st_as_sfc("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0))", crs="epsg:25832") + + raster <- terra::rast(nrows=10, ncols=10, nlyrs=1, xmin=0, xmax=10, + ymin=0, ymax=10, crs="epsg:25832", vals=1:100) + + tpoints <- sf::st_as_sfc("MULTIPOINT ((1 1), (1 2), (2 2), (2 3), (1 4), (5 4))", crs="epsg:25832") |> + sf::st_cast("POINT") + + set.seed(1) + test_points <- suppressWarnings(sf::st_sample(aoi, 20, type="random")) |> + sf::st_set_crs("epsg:25832") + + dist <- geodist(x=tpoints, + modeldomain=raster, + testdata = test_points, + type = "feature") + + mean_sample2sample <- round(mean(dist[dist$what=="sample-to-sample","dist"]), 4) + mean_test_to_sample <- round(mean(dist[dist$what=="test-to-sample","dist"]), 4) + + expect_equal(mean_sample2sample, 0.4316) + expect_equal(mean_test_to_sample, 0.8783) + + +}) From 8ae55af1f1e5464bd869055e50bbfbcc836992b5 Mon Sep 17 00:00:00 2001 From: HannaMeyer Date: Mon, 11 Mar 2024 15:10:07 +0100 Subject: [PATCH 11/14] update version --- DESCRIPTION | 2 +- NAMESPACE | 2 +- NEWS.md | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c3a4e72f..ccf050e1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: CAST Type: Package Title: 'caret' Applications for Spatial-Temporal Models -Version: 0.9.1 +Version: 0.9.2 Authors@R: c(person("Hanna", "Meyer", email = "hanna.meyer@uni-muenster.de", role = c("cre", "aut")), person("Carles", "Milà", role = c("aut")), person("Marvin", "Ludwig", role = c("aut")), diff --git a/NAMESPACE b/NAMESPACE index f5a43e10..fcbfabf3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,11 +13,11 @@ S3method(print,knndm) S3method(print,nndm) S3method(print,trainDI) export(CreateSpacetimeFolds) -export(DItoErrormetric) export(aoa) export(bss) export(calibrate_aoa) export(clustered_sample) +export(errorProfiles) export(ffs) export(geodist) export(global_validation) diff --git a/NEWS.md b/NEWS.md index 49693488..8cde79a2 100644 --- a/NEWS.md +++ b/NEWS.md 
@@ -1,3 +1,9 @@ +# `CAST` 0.9.2 +* new features: + * normalize_DI for a more intuitive interpretation +* modifications: + * function DItoErrormetric renamed to errorProfiles and allows for other dissimilarity measures + # `CAST` 0.9.1 * new features: * calculate local point density within AOA From e892bb2d2fc1e3f83cd3b3cf8081963f494b2a67 Mon Sep 17 00:00:00 2001 From: HannaMeyer Date: Mon, 11 Mar 2024 15:11:50 +0100 Subject: [PATCH 12/14] re-name DItoErrormetric --- R/{DItoErrormetric.R => errorProfiles.R} | 33 ++- ...x_DItoErrormetric.R => ex_errorProfiles.R} | 6 +- man/{DItoErrormetric.Rd => errorProfiles.Rd} | 27 +- ...DItoErrormetric.R => test-errorProfiles.R} | 12 +- vignettes/cast01-CAST-intro-cookfarm.R | 107 ++++++++ vignettes/cast02-AOA-tutorial.R | 233 ++++++++++++++++++ vignettes/cast02-AOA-tutorial.Rmd | 4 +- 7 files changed, 386 insertions(+), 36 deletions(-) rename R/{DItoErrormetric.R => errorProfiles.R} (91%) rename inst/examples/{ex_DItoErrormetric.R => ex_errorProfiles.R} (82%) rename man/{DItoErrormetric.Rd => errorProfiles.Rd} (78%) rename tests/testthat/{test-DItoErrormetric.R => test-errorProfiles.R} (87%) create mode 100644 vignettes/cast01-CAST-intro-cookfarm.R create mode 100644 vignettes/cast02-AOA-tutorial.R diff --git a/R/DItoErrormetric.R b/R/errorProfiles.R similarity index 91% rename from R/DItoErrormetric.R rename to R/errorProfiles.R index 8053a89b..34364a93 100644 --- a/R/DItoErrormetric.R +++ b/R/errorProfiles.R @@ -1,7 +1,8 @@ -#' Model the relationship between the DI and the prediction error -#' @description Performance metrics are calculated for moving windows of DI/LPD values of cross-validated training data +#' Model and inspect the relationship between the prediction error and measures of dissimilarities and distances +#' @description Performance metrics are calculated for moving windows of dissimilarity values based on cross-validated training data #' @param model the model used to get the AOA #' @param trainDI the result of \code{\link{trainDI}} or aoa object \code{\link{aoa}} +#' @param variable Character. Which dissimilarity or distance measure to use for the error metric. Current options are "DI" or "LPD" #' @param multiCV Logical. Re-run model fitting and validation with different CV strategies. See details. #' @param window.size Numeric. Size of the moving window. See \code{\link{rollapply}}. #' @param calib Character. Function to model the DI/LPD~performance relationship. Currently lm and scam are supported @@ -10,9 +11,8 @@ #' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model? #' @param k Numeric. See mgcv::s #' @param m Numeric. See mgcv::s -#' @param variable Character. Which measure to use for the error metric. "DI" or "LPD" #' @details If multiCV=TRUE the model is re-fitted and validated by length.out new cross-validations where the cross-validation folds are defined by clusters in the predictor space, -#' ranging from three clusters to LOOCV. Hence, a large range of DI/LPD values is created during cross-validation. +#' ranging from three clusters to LOOCV. Hence, a large range of dissimilarity values is created during cross-validation. #' If the AOA threshold based on the calibration data from multiple CV is larger than the original AOA threshold (which is likely if extrapolation situations are created during CV), #' the AOA threshold changes accordingly. See Meyer and Pebesma (2021) for the full documentation of the methodology. 
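# Sketch of the multiCV behaviour described in the @details above: the refitted
# DI~performance relationship carries an "AOA_threshold" attribute that may
# enlarge the original AOA threshold. Assumes `model` and `AOA` have been built
# as in the package example (splotdata + predictors_chile.tif,
# see inst/examples/ex_errorProfiles.R in this patch).
errormodel <- errorProfiles(model, AOA, variable = "DI",
                            multiCV = TRUE, length.out = 5, window.size = 5)
plot(errormodel)

expected_error <- terra::predict(AOA$DI, errormodel)   # expected performance per pixel
new_threshold  <- attr(errormodel, "AOA_threshold")    # threshold updated by multiCV
AOA_updated    <- AOA$DI <= new_threshold              # TRUE inside the updated AOA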
#' @return A scam, linear model or exponential model @@ -22,16 +22,25 @@ #' Estimating the area of applicability of spatial prediction models. #' \doi{10.1111/2041-210X.13650} #' @seealso \code{\link{aoa}} -#' @example inst/examples/ex_DItoErrormetric.R +#' @example inst/examples/ex_errorProfiles.R #' #' -#' @export - - -DItoErrormetric <- function(model, trainDI, multiCV=FALSE, - length.out = 10, window.size = 5, calib = "scam", - method= "L2", useWeight=TRUE, - k = 6, m = 2, variable = "DI"){ +#' @export errorProfiles +#' @aliases errorProfiles DItoErrormetric + + + +errorProfiles <- function(model, + trainDI, + variable = "DI", + multiCV=FALSE, + length.out = 10, + window.size = 5, + calib = "scam", + method= "L2", + useWeight=TRUE, + k = 6, + m = 2){ if(inherits(trainDI,"aoa")){ diff --git a/inst/examples/ex_DItoErrormetric.R b/inst/examples/ex_errorProfiles.R similarity index 82% rename from inst/examples/ex_DItoErrormetric.R rename to inst/examples/ex_errorProfiles.R index 3f62551b..fef05917 100644 --- a/inst/examples/ex_DItoErrormetric.R +++ b/inst/examples/ex_errorProfiles.R @@ -16,14 +16,14 @@ AOA <- aoa(predictors, model, LPD = TRUE, maxLPD = 1) # DI ~ error - errormodel_DI <- DItoErrormetric(model, AOA, variable = "DI") + errormodel_DI <- errorProfiles(model, AOA, variable = "DI") plot(errormodel_DI) expected_error_DI = terra::predict(AOA$DI, errormodel_DI) plot(expected_error_DI) # LPD ~ error - errormodel_LPD <- DItoErrormetric(model, AOA, variable = "LPD") + errormodel_LPD <- errorProfiles(model, AOA, variable = "LPD") plot(errormodel_LPD) expected_error_LPD = terra::predict(AOA$LPD, errormodel_LPD) @@ -31,7 +31,7 @@ # with multiCV = TRUE (for DI ~ error) - errormodel_DI = DItoErrormetric(model, AOA, multiCV = TRUE, length.out = 3, variable = "DI") + errormodel_DI = errorProfiles(model, AOA, multiCV = TRUE, length.out = 3, variable = "DI") plot(errormodel_DI) expected_error_DI = terra::predict(AOA$DI, errormodel_DI) diff --git a/man/DItoErrormetric.Rd b/man/errorProfiles.Rd similarity index 78% rename from man/DItoErrormetric.Rd rename to man/errorProfiles.Rd index aa4136dd..2697e133 100644 --- a/man/DItoErrormetric.Rd +++ b/man/errorProfiles.Rd @@ -1,12 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DItoErrormetric.R -\name{DItoErrormetric} +% Please edit documentation in R/errorProfiles.R +\name{errorProfiles} +\alias{errorProfiles} \alias{DItoErrormetric} -\title{Model the relationship between the DI and the prediction error} +\title{Model and inspect the relationship between the prediction error and measures of dissimilarities and distances} \usage{ -DItoErrormetric( +errorProfiles( model, trainDI, + variable = "DI", multiCV = FALSE, length.out = 10, window.size = 5, @@ -14,8 +16,7 @@ DItoErrormetric( method = "L2", useWeight = TRUE, k = 6, - m = 2, - variable = "DI" + m = 2 ) } \arguments{ @@ -23,6 +24,8 @@ DItoErrormetric( \item{trainDI}{the result of \code{\link{trainDI}} or aoa object \code{\link{aoa}}} +\item{variable}{Character. Which dissimilarity or distance measure to use for the error metric. Current options are "DI" or "LPD"} + \item{multiCV}{Logical. Re-run model fitting and validation with different CV strategies. See details.} \item{length.out}{Numeric. Only used if multiCV=TRUE. Number of cross-validation folds. See details.} @@ -38,18 +41,16 @@ DItoErrormetric( \item{k}{Numeric. See mgcv::s} \item{m}{Numeric. See mgcv::s} - -\item{variable}{Character. Which measure to use for the error metric. 
"DI" or "LPD"} } \value{ A scam, linear model or exponential model } \description{ -Performance metrics are calculated for moving windows of DI/LPD values of cross-validated training data +Performance metrics are calculated for moving windows of dissimilarity values based on cross-validated training data } \details{ If multiCV=TRUE the model is re-fitted and validated by length.out new cross-validations where the cross-validation folds are defined by clusters in the predictor space, -ranging from three clusters to LOOCV. Hence, a large range of DI/LPD values is created during cross-validation. +ranging from three clusters to LOOCV. Hence, a large range of dissimilarity values is created during cross-validation. If the AOA threshold based on the calibration data from multiple CV is larger than the original AOA threshold (which is likely if extrapolation situations are created during CV), the AOA threshold changes accordingly. See Meyer and Pebesma (2021) for the full documentation of the methodology. } @@ -72,14 +73,14 @@ the AOA threshold changes accordingly. See Meyer and Pebesma (2021) for the full AOA <- aoa(predictors, model, LPD = TRUE, maxLPD = 1) # DI ~ error - errormodel_DI <- DItoErrormetric(model, AOA, variable = "DI") + errormodel_DI <- errorProfiles(model, AOA, variable = "DI") plot(errormodel_DI) expected_error_DI = terra::predict(AOA$DI, errormodel_DI) plot(expected_error_DI) # LPD ~ error - errormodel_LPD <- DItoErrormetric(model, AOA, variable = "LPD") + errormodel_LPD <- errorProfiles(model, AOA, variable = "LPD") plot(errormodel_LPD) expected_error_LPD = terra::predict(AOA$LPD, errormodel_LPD) @@ -87,7 +88,7 @@ the AOA threshold changes accordingly. See Meyer and Pebesma (2021) for the full # with multiCV = TRUE (for DI ~ error) - errormodel_DI = DItoErrormetric(model, AOA, multiCV = TRUE, length.out = 3, variable = "DI") + errormodel_DI = errorProfiles(model, AOA, multiCV = TRUE, length.out = 3, variable = "DI") plot(errormodel_DI) expected_error_DI = terra::predict(AOA$DI, errormodel_DI) diff --git a/tests/testthat/test-DItoErrormetric.R b/tests/testthat/test-errorProfiles.R similarity index 87% rename from tests/testthat/test-DItoErrormetric.R rename to tests/testthat/test-errorProfiles.R index b7a6c15f..8dcebdf4 100644 --- a/tests/testthat/test-DItoErrormetric.R +++ b/tests/testthat/test-errorProfiles.R @@ -1,5 +1,5 @@ -test_that("DItoErrormetric works in default settings", { +test_that("errorProfiles works in default settings", { data(splotdata) splotdata <- sf::st_drop_geometry(splotdata) predictors <- terra::rast(system.file("extdata","predictors_chile.tif", package="CAST")) @@ -11,7 +11,7 @@ test_that("DItoErrormetric works in default settings", { AOA <- CAST::aoa(predictors, model) # DI ~ error - errormodel_DI <- CAST::DItoErrormetric(model, AOA, variable = "DI") + errormodel_DI <- CAST::errorProfiles(model, AOA, variable = "DI") expected_error_DI = terra::predict(AOA$DI, errormodel_DI) @@ -28,7 +28,7 @@ test_that("DItoErrormetric works in default settings", { }) -test_that("DItoErrormetric works in with LPD", { +test_that("errorProfiles works in with LPD", { data(splotdata) splotdata <- sf::st_drop_geometry(splotdata) predictors <- terra::rast(system.file("extdata","predictors_chile.tif", package="CAST")) @@ -38,7 +38,7 @@ test_that("DItoErrormetric works in with LPD", { trControl = caret::trainControl(method = "cv", savePredictions = TRUE)) AOA <- CAST::aoa(predictors, model, LPD = TRUE, maxLPD = 1) - errormodel_LPD <- CAST::DItoErrormetric(model, AOA, variable 
= "LPD") + errormodel_LPD <- CAST::errorProfiles(model, AOA, variable = "LPD") expected_error_LPD = terra::predict(AOA$LPD, errormodel_LPD) @@ -55,7 +55,7 @@ test_that("DItoErrormetric works in with LPD", { -test_that("DItoErrormetric works for multiCV", { +test_that("errorProfiles works for multiCV", { data(splotdata) splotdata <- sf::st_drop_geometry(splotdata) predictors <- terra::rast(system.file("extdata","predictors_chile.tif", package="CAST")) @@ -67,7 +67,7 @@ test_that("DItoErrormetric works for multiCV", { AOA <- CAST::aoa(predictors, model) # with multiCV = TRUE (for DI ~ error) set.seed(100) - errormodel_DI = suppressWarnings(DItoErrormetric(model, AOA, multiCV = TRUE, length.out = 3)) + errormodel_DI = suppressWarnings(errorProfiles(model, AOA, multiCV = TRUE, length.out = 3)) expected_error_DI = terra::predict(AOA$DI, errormodel_DI) #test model fit: diff --git a/vignettes/cast01-CAST-intro-cookfarm.R b/vignettes/cast01-CAST-intro-cookfarm.R new file mode 100644 index 00000000..7ad54682 --- /dev/null +++ b/vignettes/cast01-CAST-intro-cookfarm.R @@ -0,0 +1,107 @@ +## ----setup, echo=FALSE-------------------------------------------------------- +knitr::opts_chunk$set(fig.width = 8.83,cache = FALSE) +user_hanna <- Sys.getenv("USER") %in% c("hanna") + +## ----c1, message = FALSE, warning=FALSE--------------------------------------- +#install.packages("CAST") +library(CAST) + +## ----c2, message = FALSE, warning=FALSE--------------------------------------- +help(CAST) + +## ----c3, message = FALSE, warning=FALSE--------------------------------------- +data <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST")) +head(data) + +## ----c4, message = FALSE, warning=FALSE--------------------------------------- + +library(sf) +data_sp <- unique(data[,c("SOURCEID","Easting","Northing")]) +data_sp <- st_as_sf(data_sp,coords=c("Easting","Northing"),crs=26911) +plot(data_sp,axes=T,col="black") + +## ----c5, message = FALSE, warning=FALSE, eval=user_hanna---------------------- +#...or plot the data with mapview: +library(mapview) +mapviewOptions(basemaps = c("Esri.WorldImagery")) +mapview(data_sp) + +## ----c6, message = FALSE, warning=FALSE--------------------------------------- +library(lubridate) +library(ggplot2) +trainDat <- data[data$altitude==-0.3& + year(data$Date)==2012& + week(data$Date)%in%c(10:12),] +ggplot(data = trainDat, aes(x=Date, y=VW)) + + geom_line(aes(colour=SOURCEID)) + +## ----c7, message = FALSE, warning=FALSE--------------------------------------- +library(caret) +predictors <- c("DEM","TWI","Precip_cum","cday", + "MaxT_wrcc","Precip_wrcc","BLD", + "Northing","Easting","NDRE.M") +set.seed(10) +model <- train(trainDat[,predictors],trainDat$VW, + method="rf",tuneGrid=data.frame("mtry"=2), + importance=TRUE,ntree=50, + trControl=trainControl(method="cv",number=3)) + +## ----c8, message = FALSE, warning=FALSE--------------------------------------- +library(terra) +predictors_sp <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST")) +prediction <- predict(predictors_sp,model,na.rm=TRUE) +plot(prediction) + +## ----c9, message = FALSE, warning=FALSE--------------------------------------- +model + +## ----c10, message = FALSE, warning=FALSE-------------------------------------- +set.seed(10) +indices <- CreateSpacetimeFolds(trainDat,spacevar = "SOURCEID", + k=3) +set.seed(10) +model_LLO <- train(trainDat[,predictors],trainDat$VW, + method="rf",tuneGrid=data.frame("mtry"=2), importance=TRUE, + trControl=trainControl(method="cv", + index = 
indices$index)) +model_LLO + +## ----c11, message = FALSE, warning=FALSE-------------------------------------- +plot(varImp(model_LLO)) + +## ----c12, message = FALSE, warning=FALSE-------------------------------------- +set.seed(10) +ffsmodel_LLO <- ffs(trainDat[,predictors],trainDat$VW,metric="Rsquared", + method="rf", tuneGrid=data.frame("mtry"=2), + verbose=FALSE,ntree=50, + trControl=trainControl(method="cv", + index = indices$index)) +ffsmodel_LLO +ffsmodel_LLO$selectedvars + +## ----c13, message = FALSE, warning=FALSE-------------------------------------- +plot(ffsmodel_LLO) + +## ----c14, message = FALSE, warning=FALSE-------------------------------------- +prediction_ffs <- predict(predictors_sp,ffsmodel_LLO,na.rm=TRUE) +plot(prediction_ffs) + +## ----c15, message = FALSE, warning=FALSE-------------------------------------- +### AOA for which the spatial CV error applies: +AOA <- aoa(predictors_sp,ffsmodel_LLO) + +plot(prediction_ffs,main="prediction for the AOA \n(spatial CV error applied)") +plot(AOA$AOA,col=c("grey","transparent"),add=T) + +#spplot(prediction_ffs,main="prediction for the AOA \n(spatial CV error applied)")+ +#spplot(AOA$AOA,col.regions=c("grey","transparent")) + +### AOA for which the random CV error applies: +AOA_random <- aoa(predictors_sp,model) +plot(prediction,main="prediction for the AOA \n(random CV error applied)") +plot(AOA_random$AOA,col=c("grey","transparent"),add=T) + +#spplot(prediction,main="prediction for the AOA \n(random CV error applied)")+ +#spplot(AOA_random$AOA,col.regions=c("grey","transparent")) + + diff --git a/vignettes/cast02-AOA-tutorial.R b/vignettes/cast02-AOA-tutorial.R new file mode 100644 index 00000000..056ffeb8 --- /dev/null +++ b/vignettes/cast02-AOA-tutorial.R @@ -0,0 +1,233 @@ +## ----setup, echo=FALSE-------------------------------------------------------- +knitr::opts_chunk$set(fig.width = 8.83) + +## ----message = FALSE, warning=FALSE------------------------------------------- +library(CAST) +library(caret) +library(terra) +library(sf) +library(viridis) +library(gridExtra) + +## ----message = FALSE,include=FALSE, warning=FALSE----------------------------- +RMSE = function(a, b){ + sqrt(mean((a - b)^2,na.rm=T)) +} + +## ----message = FALSE, warning=FALSE------------------------------------------- +predictors <- rast(system.file("extdata","bioclim.tif",package="CAST")) +plot(predictors,col=viridis(100)) + +## ----message = FALSE, warning=FALSE------------------------------------------- + +generate_random_response <- function(raster, predictornames = +names(raster), seed = sample(seq(1000), 1)){ + operands_1 = c("+", "-", "*", "/") + operands_2 = c("^1","^2") + + expression <- paste(as.character(predictornames, sep="")) + # assign random power to predictors + set.seed(seed) + expression <- paste(expression, + sample(operands_2, length(predictornames), +replace = TRUE), + sep = "") + + # assign random math function between predictors (expect after the last one) + set.seed(seed) + expression[-length(expression)] <- paste(expression[- +length(expression)], + sample(operands_1, +length(predictornames)-1, replace = TRUE), + sep = " ") + print(paste0(expression, collapse = " ")) + # collapse + e = paste0("raster$", expression, collapse = " ") + + response = eval(parse(text = e)) + names(response) <- "response" + return(response) + +} + +## ----message = FALSE, warning=FALSE------------------------------------------- +response <- generate_random_response (predictors, seed = 10) +plot(response,col=viridis(100),main="virtual 
response") + +## ----message = FALSE, warning=FALSE------------------------------------------- +mask <- predictors[[1]] +values(mask)[!is.na(values(mask))] <- 1 +mask <- st_as_sf(as.polygons(mask)) +mask <- st_make_valid(mask) + +## ----message = FALSE, warning=FALSE------------------------------------------- +set.seed(15) +samplepoints <- st_as_sf(st_sample(mask,20,"random")) + +plot(response,col=viridis(100)) +plot(samplepoints,col="red",add=T,pch=3) + +## ----message = FALSE, warning=FALSE------------------------------------------- +trainDat <- extract(predictors,samplepoints,na.rm=FALSE) +trainDat$response <- extract(response,samplepoints,na.rm=FALSE, ID=FALSE)$response +trainDat <- na.omit(trainDat) + +## ----message = FALSE, warning=FALSE------------------------------------------- +set.seed(10) +model <- train(trainDat[,names(predictors)], + trainDat$response, + method="rf", + importance=TRUE, + trControl = trainControl(method="cv")) +print(model) + + +## ----message = FALSE, warning=FALSE------------------------------------------- +plot(varImp(model,scale = F),col="black") + +## ----message = FALSE, warning=FALSE------------------------------------------- +prediction <- predict(predictors,model,na.rm=T) +truediff <- abs(prediction-response) +plot(rast(list(prediction,response)),main=c("prediction","reference")) + +## ----message = FALSE, warning=FALSE------------------------------------------- +AOA <- aoa(predictors, model) +class(AOA) +names(AOA) +print(AOA) + +## ----message = FALSE, warning=FALSE------------------------------------------- +plot(AOA) + +## ----message = FALSE, warning=FALSE, fig.show="hold", out.width="30%"-------- +plot(truediff,col=viridis(100),main="true prediction error") +plot(AOA$DI,col=viridis(100),main="DI") +plot(prediction, col=viridis(100),main="prediction for AOA") +plot(AOA$AOA,col=c("grey","transparent"),add=T,plg=list(x="topleft",box.col="black",bty="o",title="AOA")) + +## ----message = FALSE, warning=FALSE------------------------------------------- +set.seed(25) +samplepoints <- clustered_sample(mask,75,15,radius=25000) + +plot(response,col=viridis(100)) +plot(samplepoints,col="red",add=T,pch=3) + + +## ----message = FALSE, warning=FALSE------------------------------------------- + +trainDat <- extract(predictors,samplepoints,na.rm=FALSE) +trainDat$response <- extract(response,samplepoints,na.rm=FALSE)$response +trainDat <- data.frame(trainDat,samplepoints) +trainDat <- na.omit(trainDat) + +## ----message = FALSE, warning=FALSE------------------------------------------- +set.seed(10) +model_random <- train(trainDat[,names(predictors)], + trainDat$response, + method="rf", + importance=TRUE, + trControl = trainControl(method="cv")) +prediction_random <- predict(predictors,model_random,na.rm=TRUE) +print(model_random) + +## ----message = FALSE, warning=FALSE------------------------------------------- +folds <- CreateSpacetimeFolds(trainDat, spacevar="parent",k=10) +set.seed(15) +model <- train(trainDat[,names(predictors)], + trainDat$response, + method="rf", + importance=TRUE, + tuneGrid = expand.grid(mtry = c(2:length(names(predictors)))), + trControl = trainControl(method="cv",index=folds$index)) + print(model) + +prediction <- predict(predictors,model,na.rm=TRUE) + +## ----message = FALSE, warning=FALSE------------------------------------------- +AOA_spatial <- aoa(predictors, model) + +AOA_random <- aoa(predictors, model_random) + +## ----message = FALSE, warning=FALSE, fig.show="hold", out.width="50%"-------- 
+plot(AOA_spatial$DI,col=viridis(100),main="DI") +plot(prediction, col=viridis(100),main="prediction for AOA \n(spatial CV error applies)") +plot(AOA_spatial$AOA,col=c("grey","transparent"),add=TRUE,plg=list(x="topleft",box.col="black",bty="o",title="AOA")) +plot(prediction_random, col=viridis(100),main="prediction for AOA \n(random CV error applies)") +plot(AOA_random$AOA,col=c("grey","transparent"),add=TRUE,plg=list(x="topleft",box.col="black",bty="o",title="AOA")) + +## ----message = FALSE, warning=FALSE------------------------------------------- +grid.arrange(plot(AOA_spatial) + ggplot2::ggtitle("Spatial CV"), + plot(AOA_random) + ggplot2::ggtitle("Random CV"), ncol = 2) + +## ----message = FALSE, warning=FALSE------------------------------------------- +###for the spatial CV: +RMSE(values(prediction)[values(AOA_spatial$AOA)==1], + values(response)[values(AOA_spatial$AOA)==1]) +RMSE(values(prediction)[values(AOA_spatial$AOA)==0], + values(response)[values(AOA_spatial$AOA)==0]) +model$results + +###and for the random CV: +RMSE(values(prediction_random)[values(AOA_random$AOA)==1], + values(response)[values(AOA_random$AOA)==1]) +RMSE(values(prediction_random)[values(AOA_random$AOA)==0], + values(response)[values(AOA_random$AOA)==0]) +model_random$results + +## ----message = FALSE, warning=FALSE------------------------------------------- +DI_RMSE_relation <- DItoErrormetric(model, AOA_spatial$parameters, multiCV=TRUE, + window.size = 5, length.out = 5) +plot(DI_RMSE_relation) + +expected_RMSE = terra::predict(AOA_spatial$DI, DI_RMSE_relation) + +# account for multiCV changing the DI threshold +updated_AOA = AOA_spatial$DI > attr(DI_RMSE_relation, "AOA_threshold") + + +plot(expected_RMSE,col=viridis(100),main="expected RMSE") +plot(updated_AOA, col=c("grey","transparent"),add=TRUE,plg=list(x="topleft",box.col="black",bty="o",title="AOA")) + +## ----message = FALSE, warning=FALSE------------------------------------------- +dat <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST")) +# calculate average of VW for each sampling site: +dat <- aggregate(dat[,c("VW","Easting","Northing")],by=list(as.character(dat$SOURCEID)),mean) +# create sf object from the data: +pts <- st_as_sf(dat,coords=c("Easting","Northing")) + +##### Extract Predictors for the locations of the sampling points +studyArea <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST")) +st_crs(pts) <- crs(studyArea) +trainDat <- extract(studyArea,pts,na.rm=FALSE) +pts$ID <- 1:nrow(pts) +trainDat <- merge(trainDat,pts,by.x="ID",by.y="ID") +# The final training dataset with potential predictors and VW: +head(trainDat) + +## ----message = FALSE, warning=FALSE------------------------------------------- +predictors <- c("DEM","NDRE.Sd","TWI","Bt") +response <- "VW" + +model <- train(trainDat[,predictors],trainDat[,response], + method="rf",tuneLength=3,importance=TRUE, + trControl=trainControl(method="LOOCV")) +model + +## ----message = FALSE, warning=FALSE------------------------------------------- +#Predictors: +plot(stretch(studyArea[[predictors]])) + +#prediction: +prediction <- predict(studyArea,model,na.rm=TRUE) + +## ----message = FALSE, warning=FALSE, fig.show="hold", out.width="50%"-------- +AOA <- aoa(studyArea,model) + +#### Plot results: +plot(AOA$DI,col=viridis(100),main="DI with sampling locations (red)") +plot(pts,zcol="ID",col="red",add=TRUE) + +plot(prediction, col=viridis(100),main="prediction for AOA \n(LOOCV error applies)") 
+plot(AOA$AOA,col=c("grey","transparent"),add=TRUE,plg=list(x="topleft",box.col="black",bty="o",title="AOA"))
+
+
diff --git a/vignettes/cast02-AOA-tutorial.Rmd b/vignettes/cast02-AOA-tutorial.Rmd
index f0e08c1f..4c10f349 100644
--- a/vignettes/cast02-AOA-tutorial.Rmd
+++ b/vignettes/cast02-AOA-tutorial.Rmd
@@ -305,10 +305,10 @@ The results indicate that there is a high agreement between the model CV error (
 ## Relationship between the DI and the performance measure
 
 The relationship between error and DI can be used to limit predictions to an area (within the AOA) where a required performance (e.g. RMSE, R2, Kappa, Accuracy) applies.
-This can be done using the result of DItoErrormetric which used the relationship analyzed in a window of DI values. The corresponding model (here: shape constrained additive models which is the default: Monotone increasing P-splines with the dimension of the basis used to represent the smooth term is 6 and a 2nd order penalty.) can be used to estimate the performance on a pixel level, which then allows limiting predictions using a threshold. Note that we used a multi-purpose CV to estimate the relationship between the DI and the RMSE here (see details in the paper).
+This can be done using the result of errorProfiles, which analyzes the relationship within a moving window of DI values. The resulting model (here the default: a shape constrained additive model, i.e. monotone increasing P-splines with a basis dimension of 6 and a second order penalty) can be used to estimate the performance at the pixel level, which then allows limiting predictions using a threshold. Note that we used a multi-purpose CV to estimate the relationship between the DI and the RMSE here (see details in the paper).
 
 ```{r,message = FALSE, warning=FALSE}
-DI_RMSE_relation <- DItoErrormetric(model, AOA_spatial$parameters, multiCV=TRUE,
+DI_RMSE_relation <- errorProfiles(model, AOA_spatial$parameters, multiCV=TRUE,
                                      window.size = 5, length.out = 5)
 plot(DI_RMSE_relation)
 

From 3e756f795504ae3ba30ceb6e9e76282320f5cea9 Mon Sep 17 00:00:00 2001
From: HannaMeyer
Date: Mon, 11 Mar 2024 15:12:08 +0100
Subject: [PATCH 13/14] documentation of parameter verbose

---
 man/aoa.Rd     | 2 ++
 man/trainDI.Rd | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/man/aoa.Rd b/man/aoa.Rd
index e326d103..cf0f4d43 100644
--- a/man/aoa.Rd
+++ b/man/aoa.Rd
@@ -51,6 +51,8 @@ Relevant if some data points are excluded, e.g. when using \code{\link{nndm}}.}
 \item{LPD}{Logical. Indicates whether the local point density should be calculated or not.}
 
 \item{maxLPD}{numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}})}
+
+\item{verbose}{Logical. Print progress or not?}
 }
 \value{
 An object of class \code{aoa} containing:
diff --git a/man/trainDI.Rd b/man/trainDI.Rd
index c4fccaab..76d28f6c 100644
--- a/man/trainDI.Rd
+++ b/man/trainDI.Rd
@@ -40,6 +40,8 @@ Relevant if some data points are excluded, e.g. when using \code{\link{nndm}}.}
 \item{useWeight}{Logical. Only if a model is given. Weight variables according to importance in the model?}
 
 \item{LPD}{Logical. 
Indicates whether the local point density should be calculated or not.} + +\item{verbose}{Logical. Print progress or not?} } \value{ A list of class \code{trainDI} containing: From 0ac8b1a1105aa05cba8a9efde1d0d278f96990a7 Mon Sep 17 00:00:00 2001 From: carlesmila Date: Mon, 11 Mar 2024 15:24:48 +0100 Subject: [PATCH 14/14] NNDM modeldomain as SpatRaster (+ updates in docs) --- R/nndm.R | 43 +++++++++++++++++++++++--------------- man/nndm.Rd | 23 +++++++++----------- tests/testthat/test-nndm.R | 23 ++++++++++++++++++-- 3 files changed, 57 insertions(+), 32 deletions(-) diff --git a/R/nndm.R b/R/nndm.R index 91867198..3cb45d42 100644 --- a/R/nndm.R +++ b/R/nndm.R @@ -4,7 +4,7 @@ #' indices to perform a NNDM LOO CV for map validation. #' @author Carles Milà #' @param tpoints sf or sfc point object. Contains the training points samples. -#' @param modeldomain sf polygon object defining the prediction area (see Details). +#' @param modeldomain sf polygon object or SpatRaster defining the prediction area (see Details). #' @param ppoints sf or sfc point object. Contains the target prediction points. #' Optional. Alternative to modeldomain (see Details). #' @param samplesize numeric. How many points in the modeldomain should be sampled as prediction points? @@ -34,12 +34,13 @@ #' When \emph{phi} is set to "max", nearest neighbor distance matching is performed for the entire prediction area. Euclidean distances are used for projected #' and non-defined CRS, great circle distances are used for geographic CRS (units in meters). #' -#' The \emph{modeldomain} is a sf polygon that defines the prediction area. The function takes a regular point sample (amount defined by \emph{samplesize)} from the spatial extent. +#' The \emph{modeldomain} is either a sf polygon that defines the prediction area, or alternatively a SpatRaster out of which a polygon, +#' transformed into the CRS of the training points, is defined as the outline of all non-NA cells. +#' Then, the function takes a regular point sample (amount defined by \emph{samplesize)} from the spatial extent. #' As an alternative use \emph{ppoints} instead of \emph{modeldomain}, if you have already defined the prediction locations (e.g. raster pixel centroids). #' When using either \emph{modeldomain} or \emph{ppoints}, we advise to plot the study area polygon and the training/prediction points as a previous step to ensure they are aligned. #' -#' @note NNDM is a variation of LOOCV and therefore may take a long time for large training data sets. -#' A k-fold variant will be implemented shortly. +#' @note NNDM is a variation of LOOCV and therefore may take a long time for large training data sets. See \code{\link{knndm}} for a more efficient k-fold variant of the method. 
#' @seealso \code{\link{geodist}}, \code{\link{knndm}} #' @references #' \itemize{ @@ -97,8 +98,8 @@ #' plot(nndm_pred) #' #' ######################################################################## -#' # Example 3: Real- world example; using a modeldomain instead of previously -#' # sampled prediction locations +#' # Example 3: Real- world example; using a SpatRast modeldomain instead +#' # of previously sampled prediction locations #' ######################################################################## #' \dontrun{ #' library(sf) @@ -112,15 +113,11 @@ #' pts <- st_as_sf(pts,coords=c("Easting","Northing")) #' st_crs(pts) <- 26911 #' studyArea <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST")) -#' studyArea[!is.na(studyArea)] <- 1 -#' studyArea <- as.polygons(studyArea, values = FALSE, na.all = TRUE) |> -#' st_as_sf() |> -#' st_union() #' pts <- st_transform(pts, crs = st_crs(studyArea)) -#' plot(studyArea) -#' plot(st_geometry(pts), add = TRUE, col = "red") +#' terra::plot(studyArea[["DEM"]]) +#' terra::plot(vect(pts), add = T) #' -#' nndm_folds <- nndm(pts, modeldomain= studyArea) +#' nndm_folds <- nndm(pts, modeldomain = studyArea) #' plot(nndm_folds) #' #' #use for cross-validation: @@ -144,10 +141,22 @@ nndm <- function(tpoints, modeldomain = NULL, ppoints = NULL, samplesize = 1000, # create sample points from modeldomain if(is.null(ppoints)&!is.null(modeldomain)){ - # Check modeldomain is indeed a polygon - if(!any(c("sfc", "sf") %in% class(modeldomain))){ - stop("modeldomain must be a sf/sfc object.") - }else if(!any(class(sf::st_geometry(modeldomain)) %in% c("sfc_POLYGON", "sfc_MULTIPOLYGON"))){ + # Check modeldomain is indeed a sf/SpatRaster + if(!any(c("sfc", "sf", "SpatRaster") %in% class(modeldomain))){ + stop("modeldomain must be a sf/sfc object or a 'SpatRaster' object.") + } + + # If modeldomain is a SpatRaster, transform into polygon + if(any(class(modeldomain) == "SpatRaster")){ + modeldomain[!is.na(modeldomain)] <- 1 + modeldomain <- terra::as.polygons(modeldomain, values = FALSE, na.all = TRUE) |> + sf::st_as_sf() |> + sf::st_union() + modeldomain <- sf::st_transform(modeldomain, crs = sf::st_crs(tpoints)) + } + + # Check modeldomain is indeed a polygon sf + if(!any(class(sf::st_geometry(modeldomain)) %in% c("sfc_POLYGON", "sfc_MULTIPOLYGON"))){ stop("modeldomain must be a sf/sfc polygon object.") } diff --git a/man/nndm.Rd b/man/nndm.Rd index b0a2d21b..e4aacc78 100644 --- a/man/nndm.Rd +++ b/man/nndm.Rd @@ -17,7 +17,7 @@ nndm( \arguments{ \item{tpoints}{sf or sfc point object. Contains the training points samples.} -\item{modeldomain}{sf polygon object defining the prediction area (see Details).} +\item{modeldomain}{sf polygon object or SpatRaster defining the prediction area (see Details).} \item{ppoints}{sf or sfc point object. Contains the target prediction points. Optional. Alternative to modeldomain (see Details).} @@ -58,13 +58,14 @@ Distances are only matched up to \emph{phi}. Beyond that range, all data points When \emph{phi} is set to "max", nearest neighbor distance matching is performed for the entire prediction area. Euclidean distances are used for projected and non-defined CRS, great circle distances are used for geographic CRS (units in meters). -The \emph{modeldomain} is a sf polygon that defines the prediction area. The function takes a regular point sample (amount defined by \emph{samplesize)} from the spatial extent. 
+The \emph{modeldomain} is either a sf polygon that defines the prediction area, or alternatively a SpatRaster out of which a polygon, +transformed into the CRS of the training points, is defined as the outline of all non-NA cells. +Then, the function takes a regular point sample (amount defined by \emph{samplesize)} from the spatial extent. As an alternative use \emph{ppoints} instead of \emph{modeldomain}, if you have already defined the prediction locations (e.g. raster pixel centroids). When using either \emph{modeldomain} or \emph{ppoints}, we advise to plot the study area polygon and the training/prediction points as a previous step to ensure they are aligned. } \note{ -NNDM is a variation of LOOCV and therefore may take a long time for large training data sets. -A k-fold variant will be implemented shortly. +NNDM is a variation of LOOCV and therefore may take a long time for large training data sets. See \code{\link{knndm}} for a more efficient k-fold variant of the method. } \examples{ ######################################################################## @@ -116,8 +117,8 @@ nndm_pred plot(nndm_pred) ######################################################################## -# Example 3: Real- world example; using a modeldomain instead of previously -# sampled prediction locations +# Example 3: Real- world example; using a SpatRast modeldomain instead +# of previously sampled prediction locations ######################################################################## \dontrun{ library(sf) @@ -131,15 +132,11 @@ pts <- dat[,-1] pts <- st_as_sf(pts,coords=c("Easting","Northing")) st_crs(pts) <- 26911 studyArea <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST")) -studyArea[!is.na(studyArea)] <- 1 -studyArea <- as.polygons(studyArea, values = FALSE, na.all = TRUE) |> - st_as_sf() |> - st_union() pts <- st_transform(pts, crs = st_crs(studyArea)) -plot(studyArea) -plot(st_geometry(pts), add = TRUE, col = "red") +terra::plot(studyArea[["DEM"]]) +terra::plot(vect(pts), add = T) -nndm_folds <- nndm(pts, modeldomain= studyArea) +nndm_folds <- nndm(pts, modeldomain = studyArea) plot(nndm_folds) #use for cross-validation: diff --git a/tests/testthat/test-nndm.R b/tests/testthat/test-nndm.R index 3b69d4de..9d6aad08 100644 --- a/tests/testthat/test-nndm.R +++ b/tests/testthat/test-nndm.R @@ -37,7 +37,7 @@ test_that("NNDM detects wrong data and geometry types", { # model domain expect_error(suppressWarnings(nndm(tpoints_sfc, modeldomain = 1)), - "modeldomain must be a sf/sfc object.") + "modeldomain must be a sf/sfc object or a 'SpatRaster' object.") expect_error(nndm(tpoints_sfc, modeldomain = ppoints_sfc), "modeldomain must be a sf/sfc polygon object.") }) @@ -106,5 +106,24 @@ test_that("NNDM yields the expected results for all CRS", { # Geographic tpoints_sf_4326 <- sf::st_set_crs(tpoints_sfc, 4326) ppoints_sf_4326 <- sf::st_set_crs(ppoints_sfc, 4326) - expect_equal(as.numeric(nndm(tpoints_sf_4326, ppoints = ppoints_sf_4326, phi = 1000000)$Gjstar[20], 4), 355614.94) + expect_equal(as.numeric(nndm(tpoints_sf_4326, ppoints = ppoints_sf_4326, phi = 1000000)$Gjstar[20]), 355614.94) +}) + +test_that("NNDM yields the expected results with SpatRast modeldomain", { + + set.seed(1234) + + # prepare sample data + dat <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST")) + dat <- terra::aggregate(dat[,c("DEM","TWI", "NDRE.M", "Easting", "Northing","VW")], + by=list(as.character(dat$SOURCEID)),mean) + pts <- dat[,-1] + pts <- 
sf::st_as_sf(pts,coords=c("Easting","Northing")) + sf::st_crs(pts) <- 26911 + studyArea <- terra::rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST")) + pts <- sf::st_transform(pts, crs = sf::st_crs(studyArea)) + + nndm_folds <- nndm(pts, modeldomain = studyArea, phi = 150) + expect_equal(as.numeric(nndm(pts, modeldomain = studyArea, phi = 150)$Gjstar[5]), 63.828663) + })
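
A note on downstream use: the SpatRaster-aware nndm() exercised in the new test is typically paired with caret, where the returned fold indices drive a leave-one-out style CV. The sketch below is illustrative only and not part of the patches above; it reuses pts, studyArea and phi = 150 from the test, relies on the indx_train/indx_test elements of the nndm return object, and picks arbitrary random forest settings (mtry = 2, ntree = 50) purely to keep it light.

library(CAST)
library(caret)
library(sf)
library(terra)

# Folds derived from the SpatRaster-defined model domain (as in the test above)
nndm_folds <- nndm(pts, modeldomain = studyArea, phi = 150)

# Model the point attributes directly so rows stay aligned with the fold indices
trainDat <- sf::st_drop_geometry(pts)   # DEM, TWI, NDRE.M and the response VW

ctrl <- caret::trainControl(method = "cv",
                            index = nndm_folds$indx_train,
                            indexOut = nndm_folds$indx_test,
                            savePredictions = "final")

model_nndm <- caret::train(trainDat[, c("DEM", "TWI", "NDRE.M")],
                           trainDat$VW,
                           method = "rf",
                           tuneGrid = data.frame("mtry" = 2),
                           ntree = 50,
                           trControl = ctrl)

# Pooled CV performance across all held-out points
global_validation(model_nndm)

Because every NNDM fold holds out a single point, pooling the held-out predictions (e.g. with global_validation) is usually more informative than averaging per-fold statistics.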