ffs in parallel and documentation

HannaMeyer · Mar 13, 2024 · 8db65b0 · 8db65b0
1 parent e7774e0
commit 8db65b0
Show file tree

Hide file tree

Showing 4 changed files with 460 additions and 196 deletions.
diff --git a/R/ffs.R b/R/ffs.R
@@ -30,7 +30,10 @@
 #' of the currently best model. The process stops if none of the remaining
 #' variables increases the model performance when added to the current best model.
 #'
-#' The internal cross validation can be run in parallel. See information
+#' The forward feature selection can be run in parallel with forking on Linux systems (mclapply).
+#' Each fork computes a model, which drastically speeds up the runtime -
+#' especially of the initial predictor search.
+#' The internal cross validation can be run in parallel on all systems. See information
 #' on parallel processing of carets train functions for details.
 #'
 #' Using withinSE will favour models with less variables and
@@ -61,8 +64,8 @@
 #' }
 #' @examples
 #' \dontrun{
-#' data(iris)
-#' ffsmodel <- ffs(iris[,1:4],iris$Species)
+#' data(splotdata)
+#' ffsmodel <- ffs(splotdata[,6:12], splotdata$Species_richness, ntree = 20)
 #' ffsmodel$selectedvars
 #' ffsmodel$selectedvars_perf
 #'}
@@ -73,15 +76,15 @@
 #' #is shown here.
 #'
 #' \dontrun{
-#' #run the model on three cores:
+#' # run the model on three cores (see vignette for details):
 #' library(doParallel)
 #' library(lubridate)
 #' cl <- makeCluster(3)
 #' registerDoParallel(cl)
 #'
 #' #load and prepare dataset:
-#' dat <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST"))
-#' trainDat <- dat[dat$altitude==-0.3&year(dat$Date)==2012&week(dat$Date)%in%c(13:14),]
+#' data(cookfarm)
+#' trainDat <- cookfarm[cookfarm$altitude==-0.3&year(cookfarm$Date)==2012&week(cookfarm$Date)%in%c(13:14),]
 #'
 #' #visualize dataset:
 #' ggplot(data = trainDat, aes(x=Date, y=VW)) + geom_line(aes(colour=SOURCEID))
@@ -110,6 +113,24 @@
 #' model
 #' stopCluster(cl)
 #'}
+#'
+#'\dontrun{
+#'## on linux machines, you can also run the ffs in parallel with forks:
+#' data("splotdata")
+#' spatial_cv = CreateSpacetimeFolds(splotdata, spacevar = "Biome", k = 5)
+#' ctrl <- trainControl(method="cv",index = spatial_cv$index)
+#'
+#'ffsmodel <- ffs(predictors = splotdata[,6:16],
+#'                response = splotdata$Species_richness,
+#'                tuneLength = 1,
+#'                method = "rf",
+#'                trControl = ctrl,
+#'                ntree = 20,
+#'                seed = 1,
+#'                cores = 4)
+#'}
+#'
+#'
 #' @export ffs
 #' @aliases ffs
 
@@ -128,36 +149,49 @@ ffs <- function (predictors,
                  verbose=TRUE,
                  cores = 1,
                  ...){
-  trControl$returnResamp <- "final"
-  trControl$savePredictions <- "final"
+
+
+
+  # Init ----------------
+
+  ## Input Checks ------------------------------
+
+  if(inherits(predictors, "sf")){
+    predictors = sf::st_drop_geometry(predictors)
+  }
+
+  if(cores > 1 & .Platform$OS.type != "unix"){
+    warning("Parallel computations of ffs only implemented on unix systems. cores is set to 1")
+    cores <- 1
+  }
+
+
   if(inherits(response,"character")){
     response <- factor(response)
     if(metric=="RMSE"){
       metric <- "Accuracy"
       maximize <- TRUE
     }
   }
-  if (trControl$method=="LOOCV"){
-    if (withinSE==TRUE){
-      print("warning: withinSE is set to FALSE as no SE can be calculated using method LOOCV")
-      withinSE <- FALSE
-    }}
 
-  if(globalval){
-    if (withinSE==TRUE){
-      print("warning: withinSE is set to FALSE as no SE can be calculated using global validation")
-      withinSE <- FALSE
-    }}
+  if (trControl$method=="LOOCV" & withinSE){
+    warning("withinSE is set to FALSE as no SE can be calculated using method LOOCV")
+    withinSE <- FALSE
+  }
+
+  if(globalval & withinSE){
+    warning("withinSE is set to FALSE as no SE can be calculated using global validation")
+    withinSE <- FALSE
+  }
+
+  ## Define helper functions ---------------
 
   se <- function(x){sd(x, na.rm = TRUE)/sqrt(length(na.exclude(x)))}
-  n <- length(names(predictors))
 
-  acc <- 0
-  perf_all <- data.frame(matrix(ncol=length(predictors)+3,
-                                nrow=choose(n, minVar)+(n-minVar)*(n-minVar+1)/2))
-  names(perf_all) <- c(paste0("var",1:length(predictors)),metric,"SE","nvar")
-  if(maximize) evalfunc <- function(x){max(x,na.rm=TRUE)}
-  if(!maximize) evalfunc <- function(x){min(x,na.rm=TRUE)}
+  evalfunc = ifelse(maximize,
+                    function(x){max(x,na.rm=TRUE)},
+                    function(x){min(x,na.rm=TRUE)})
+
   isBetter <- function (actmodelperf,bestmodelperf,
                         bestmodelperfSE=NULL,
                         maximization=FALSE,
@@ -171,10 +205,23 @@ ffs <- function (predictors,
     }
     return(result)
   }
-  #### chose initial best model from all combinations of two variables
 
+
+  ## Initialize Variables --------------------------
+
+  trControl$returnResamp <- "final"
+  trControl$savePredictions <- "final"
+  n <- length(names(predictors))
+  acc <- 0
+  perf_all <- data.frame(matrix(ncol=length(predictors)+3,
+                                nrow=choose(n, minVar)+(n-minVar)*(n-minVar+1)/2))
+  names(perf_all) <- c(paste0("var",1:length(predictors)),metric,"SE","nvar")
   minGrid <- t(data.frame(combn(names(predictors),minVar)))
 
+
+  # Computation -----------------------------------------------
+  ## Step 1: Search best initial variables -----
+  ## parallel ----------
   if(cores > 1){
 
     initial_models = parallel::mclapply(X = 1:nrow(minGrid), mc.cores = cores, FUN = function(i){
@@ -208,7 +255,8 @@ ffs <- function (predictors,
                             metric=metric,
                             trControl=trControl,
                             tuneLength = tuneLength,
-                            tuneGrid = tuneGrid,...)
+                            tuneGrid = tuneGrid)
+                            #...)
 
 
       tuneGrid <- tuneGrid_orig
@@ -242,14 +290,16 @@ ffs <- function (predictors,
     best_predictors <- as.character(initial_models[best_rowindex, 1:minVar])
 
     # best minVar model has to be retrained
+    #
+    #
     bestmodel <- caret::train(predictors[,best_predictors],
                               response,
                               method=method,
                               metric=metric,
                               trControl=trControl,
                               tuneLength = tuneLength,
-                              tuneGrid = tuneGrid,
-                              ...)
+                              tuneGrid = tuneGrid)
+                              #...)
 
 
     acc = nrow(minGrid)
@@ -262,7 +312,7 @@ ffs <- function (predictors,
 
 
   }else{
-    ### unparallel version -------------
+    ## unparallel  -------------
 
     for (i in 1:nrow(minGrid)){
       if (verbose){
@@ -340,10 +390,8 @@ ffs <- function (predictors,
   }
 
 
+  ## both --------
 
-
-  #### increase the number of predictors by one (try all combinations)
-  #and test if model performance increases
   selectedvars <- names(bestmodel$trainingData)[-which(
     names(bestmodel$trainingData)==".outcome")]
 
@@ -362,6 +410,152 @@ ffs <- function (predictors,
     print(paste0(paste0("vars selected: ",paste(selectedvars, collapse = ',')),
                  " with ",metric," ",round(selectedvars_perf,3)))
   }
+
+  ## Step 2: Append more variables ------
+  # increase the number of predictors by one (try all combinations)
+  # and test if model performance increases
+  # k: amount of "additional variables" left after initial search
+  # for each k: search best additional predictor
+
+  ## parallel -----
+
+  if(cores > 1){
+  for(k in 1:(length(names(predictors))-minVar)){
+    startvars <- names(bestmodel$trainingData)[-which(
+      names(bestmodel$trainingData)==".outcome")]
+    nextvars <- names(predictors)[-which(
+      names(predictors)%in%startvars)]
+
+    if(verbose){
+    print(paste0("Searching for additional variable ", minVar + k, " now. ",
+                 length(nextvars), " potential predictors are available:"))
+    print(nextvars)
+    }
+
+
+    # search best additional variable in parallel
+    next_models <- parallel::mclapply(1:length(nextvars), mc.cores = cores, FUN = function(i){
+
+
+      set.seed(seed)
+
+      #adaptation for pls:
+      tuneGrid_orig <- tuneGrid
+      if(method=="pls"&!is.null(tuneGrid)&any(tuneGrid$ncomp>ncol(predictors[,c(startvars,nextvars[i])]))){
+        tuneGrid<- data.frame(ncomp=tuneGrid[tuneGrid$ncomp<=ncol(predictors[,c(startvars,nextvars[i])]),])
+        if(verbose){
+          print(paste0("note: maximum ncomp is ", ncol(predictors[,c(startvars,nextvars[i])])))
+        }}
+      #adaptation for ranger:
+      if(method=="ranger"&!is.null(tuneGrid)&any(tuneGrid$mtry>ncol(predictors[,c(startvars,nextvars[i])]))){
+        tuneGrid$mtry[tuneGrid$mtry>ncol(predictors[,c(startvars,nextvars[i])])] <- ncol(predictors[,c(startvars,nextvars[i])])
+        if(verbose){
+          print("invalid value for mtry. Reset to valid range.")
+        }
+      }
+
+      model <- caret::train(predictors[,c(startvars,nextvars[i])],
+                            response,
+                            method = method,
+                            metric=metric,
+                            trControl = trControl,
+                            tuneLength = tuneLength,
+                            tuneGrid = tuneGrid,
+                            ...)
+      tuneGrid <- tuneGrid_orig
+
+
+
+      if (globalval){
+        perf_stats <- global_validation(model)[names(global_validation(model))==metric]
+      }else{
+        perf_stats <- model$results[,names(model$results)==metric]
+      }
+
+
+      startvars
+      result = as.data.frame(t(startvars))
+      result$nextvar = nextvars[i]
+      result$actmodelperf <- evalfunc(perf_stats)
+      result$actmodelperfSE <- se(
+        sapply(unique(model$resample$Resample),
+               FUN=function(x){mean(model$resample[model$resample$Resample==x,
+                                                   metric],na.rm=TRUE)}))
+
+      return(result)
+
+    })
+
+    next_models = do.call(rbind, next_models)
+
+
+    ## best next_model
+    best_next_rowindex = ifelse(maximize,
+                           which.max(next_models[,(ncol(next_models)-1)]),
+                           which.min(next_models[,(ncol(next_models)-1)]))
+
+    better = isBetter(actmodelperf = next_models$actmodelperf[best_next_rowindex],
+                      bestmodelperf = bestmodelperf,
+                      bestmodelperfSE = bestmodelperfSE,
+                      maximization = maximize, withinSE = withinSE)
+
+
+
+    # patching perf_all
+    perf_all[(acc+1):(acc+length(nextvars)), 1:(minVar+k)] <- next_models[,1:(minVar+k)]
+    perf_all[(acc+1):(acc+length(nextvars)), (ncol(perf_all)-2):(ncol(perf_all)-1)] <- next_models[,(ncol(next_models)-1):ncol(next_models)]
+    perf_all$nvar[(acc+1):(acc+length(nextvars))] <- minVar+k
+
+    if(better){
+      # update best model stats
+      bestmodelperf = next_models$actmodelperf[best_next_rowindex]
+      bestmodelperfSE = next_models$actmodelperfSE[best_next_rowindex]
+      best_predictors = as.character(next_models[best_next_rowindex, 1:(minVar+k)])
+
+      selectedvars_perf = c(selectedvars_perf, bestmodelperf)
+      selectedvars_SE = c(selectedvars_SE, bestmodelperfSE)
+
+
+      bestmodel <- caret::train(predictors[,best_predictors],
+                                response,
+                                method=method,
+                                metric=metric,
+                                trControl=trControl,
+                                tuneLength = tuneLength,
+                                tuneGrid = tuneGrid,
+                                ...)
+
+
+      acc = acc+nrow(next_models)
+
+
+    }else{
+      # not better: return model and stats
+      message(paste0("Note: No increase in performance found using more than ",
+                     length(startvars), " variables"))
+      bestmodel$selectedvars <- best_predictors
+      bestmodel$selectedvars_perf <- selectedvars_perf
+      bestmodel$selectedvars_perf_SE <- selectedvars_SE
+      bestmodel$perf_all <- perf_all
+      bestmodel$perf_all <- bestmodel$perf_all[!apply(is.na(bestmodel$perf_all), 1, all),]
+      bestmodel$perf_all <- bestmodel$perf_all[colSums(!is.na(bestmodel$perf_all)) > 0]
+      bestmodel$minVar <- minVar
+      bestmodel$type <- "ffs"
+      class(bestmodel) <- c("ffs", "train")
+      return(bestmodel)
+
+    }
+
+
+
+
+  }# end of k loop
+
+
+
+
+  }else{
+    ## unparallel -----
   for (k in 1:(length(names(predictors))-minVar)){
     startvars <- names(bestmodel$trainingData)[-which(
       names(bestmodel$trainingData)==".outcome")]
@@ -469,21 +663,10 @@ ffs <- function (predictors,
     }
   }
 
-  # Old version that is not using global_validation:
-  #    if (maximize){
-  #      selectedvars_perf <- c(selectedvars_perf,max(bestmodel$results[,metric]))
-  #      if(verbose){
-  #        print(paste0(paste0("vars selected: ",paste(selectedvars, collapse = ',')),
-  #                     " with ", metric," ",round(max(bestmodel$results[,metric]),3)))
-  #      }
-  #    }
-  #    if (!maximize){
-  #      selectedvars_perf <- c(selectedvars_perf,min(bestmodel$results[,metric]))
-  #      if(verbose){
-  #        print(paste0(paste0("vars selected: ",paste(selectedvars, collapse = ',')),
-  #                     " with ",metric," ",round(min(bestmodel$results[,metric]),3)))
-  #      }
-  #    }
+  }
+## return best model --------
+
+
 
   bestmodel$selectedvars <- selectedvars
   bestmodel$selectedvars_perf <- selectedvars_perf
@@ -501,7 +684,3 @@ ffs <- function (predictors,
 }
 
 
-
-
-
-