Skip to content

Commit

Permalink
ffs in parallel and documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
Ludwigm6 committed Mar 13, 2024
1 parent e7774e0 commit 8db65b0
Show file tree
Hide file tree
Showing 4 changed files with 460 additions and 196 deletions.
283 changes: 231 additions & 52 deletions R/ffs.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@
#' of the currently best model. The process stops if none of the remaining
#' variables increases the model performance when added to the current best model.
#'
#' The internal cross validation can be run in parallel. See information
#' The forward feature selection can be run in parallel with forking on Linux systems (mclapply).
#' Each fork computes a model, which drastically speeds up the runtime -
#' especially of the initial predictor search.
#' The internal cross validation can be run in parallel on all systems. See information
#' on parallel processing of carets train functions for details.
#'
#' Using withinSE will favour models with less variables and
Expand Down Expand Up @@ -61,8 +64,8 @@
#' }
#' @examples
#' \dontrun{
#' data(iris)
#' ffsmodel <- ffs(iris[,1:4],iris$Species)
#' data(splotdata)
#' ffsmodel <- ffs(splotdata[,6:12], splotdata$Species_richness, ntree = 20)
#' ffsmodel$selectedvars
#' ffsmodel$selectedvars_perf
#'}
Expand All @@ -73,15 +76,15 @@
#' #is shown here.
#'
#' \dontrun{
#' #run the model on three cores:
#' # run the model on three cores (see vignette for details):
#' library(doParallel)
#' library(lubridate)
#' cl <- makeCluster(3)
#' registerDoParallel(cl)
#'
#' #load and prepare dataset:
#' dat <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST"))
#' trainDat <- dat[dat$altitude==-0.3&year(dat$Date)==2012&week(dat$Date)%in%c(13:14),]
#' data(cookfarm)
#' trainDat <- cookfarm[cookfarm$altitude==-0.3&year(cookfarm$Date)==2012&week(cookfarm$Date)%in%c(13:14),]
#'
#' #visualize dataset:
#' ggplot(data = trainDat, aes(x=Date, y=VW)) + geom_line(aes(colour=SOURCEID))
Expand Down Expand Up @@ -110,6 +113,24 @@
#' model
#' stopCluster(cl)
#'}
#'
#'\dontrun{
#'## on linux machines, you can also run the ffs in parallel with forks:
#' data("splotdata")
#' spatial_cv = CreateSpacetimeFolds(splotdata, spacevar = "Biome", k = 5)
#' ctrl <- trainControl(method="cv",index = spatial_cv$index)
#'
#'ffsmodel <- ffs(predictors = splotdata[,6:16],
#' response = splotdata$Species_richness,
#' tuneLength = 1,
#' method = "rf",
#' trControl = ctrl,
#' ntree = 20,
#' seed = 1,
#' cores = 4)
#'}
#'
#'
#' @export ffs
#' @aliases ffs

Expand All @@ -128,36 +149,49 @@ ffs <- function (predictors,
verbose=TRUE,
cores = 1,
...){
trControl$returnResamp <- "final"
trControl$savePredictions <- "final"



# Init ----------------

## Input Checks ------------------------------

if(inherits(predictors, "sf")){
predictors = sf::st_drop_geometry(predictors)
}

if(cores > 1 & .Platform$OS.type != "unix"){
warning("Parallel computations of ffs only implemented on unix systems. cores is set to 1")
cores <- 1
}


if(inherits(response,"character")){
response <- factor(response)
if(metric=="RMSE"){
metric <- "Accuracy"
maximize <- TRUE
}
}
if (trControl$method=="LOOCV"){
if (withinSE==TRUE){
print("warning: withinSE is set to FALSE as no SE can be calculated using method LOOCV")
withinSE <- FALSE
}}

if(globalval){
if (withinSE==TRUE){
print("warning: withinSE is set to FALSE as no SE can be calculated using global validation")
withinSE <- FALSE
}}
if (trControl$method=="LOOCV" & withinSE){
warning("withinSE is set to FALSE as no SE can be calculated using method LOOCV")
withinSE <- FALSE
}

if(globalval & withinSE){
warning("withinSE is set to FALSE as no SE can be calculated using global validation")
withinSE <- FALSE
}

## Define helper functions ---------------

se <- function(x){sd(x, na.rm = TRUE)/sqrt(length(na.exclude(x)))}
n <- length(names(predictors))

acc <- 0
perf_all <- data.frame(matrix(ncol=length(predictors)+3,
nrow=choose(n, minVar)+(n-minVar)*(n-minVar+1)/2))
names(perf_all) <- c(paste0("var",1:length(predictors)),metric,"SE","nvar")
if(maximize) evalfunc <- function(x){max(x,na.rm=TRUE)}
if(!maximize) evalfunc <- function(x){min(x,na.rm=TRUE)}
evalfunc = ifelse(maximize,
function(x){max(x,na.rm=TRUE)},
function(x){min(x,na.rm=TRUE)})

isBetter <- function (actmodelperf,bestmodelperf,
bestmodelperfSE=NULL,
maximization=FALSE,
Expand All @@ -171,10 +205,23 @@ ffs <- function (predictors,
}
return(result)
}
#### chose initial best model from all combinations of two variables


## Initialize Variables --------------------------

trControl$returnResamp <- "final"
trControl$savePredictions <- "final"
n <- length(names(predictors))
acc <- 0
perf_all <- data.frame(matrix(ncol=length(predictors)+3,
nrow=choose(n, minVar)+(n-minVar)*(n-minVar+1)/2))
names(perf_all) <- c(paste0("var",1:length(predictors)),metric,"SE","nvar")
minGrid <- t(data.frame(combn(names(predictors),minVar)))


# Computation -----------------------------------------------
## Step 1: Search best initial variables -----
## parallel ----------
if(cores > 1){

initial_models = parallel::mclapply(X = 1:nrow(minGrid), mc.cores = cores, FUN = function(i){
Expand Down Expand Up @@ -208,7 +255,8 @@ ffs <- function (predictors,
metric=metric,
trControl=trControl,
tuneLength = tuneLength,
tuneGrid = tuneGrid,...)
tuneGrid = tuneGrid)
#...)


tuneGrid <- tuneGrid_orig
Expand Down Expand Up @@ -242,14 +290,16 @@ ffs <- function (predictors,
best_predictors <- as.character(initial_models[best_rowindex, 1:minVar])

# best minVar model has to be retrained
#
#
bestmodel <- caret::train(predictors[,best_predictors],
response,
method=method,
metric=metric,
trControl=trControl,
tuneLength = tuneLength,
tuneGrid = tuneGrid,
...)
tuneGrid = tuneGrid)
#...)


acc = nrow(minGrid)
Expand All @@ -262,7 +312,7 @@ ffs <- function (predictors,


}else{
### unparallel version -------------
## unparallel -------------

for (i in 1:nrow(minGrid)){
if (verbose){
Expand Down Expand Up @@ -340,10 +390,8 @@ ffs <- function (predictors,
}


## both --------


#### increase the number of predictors by one (try all combinations)
#and test if model performance increases
selectedvars <- names(bestmodel$trainingData)[-which(
names(bestmodel$trainingData)==".outcome")]

Expand All @@ -362,6 +410,152 @@ ffs <- function (predictors,
print(paste0(paste0("vars selected: ",paste(selectedvars, collapse = ',')),
" with ",metric," ",round(selectedvars_perf,3)))
}

## Step 2: Append more variables ------
# increase the number of predictors by one (try all combinations)
# and test if model performance increases
# k: amount of "additional variables" left after initial search
# for each k: search best additional predictor

## parallel -----

if(cores > 1){
for(k in 1:(length(names(predictors))-minVar)){
startvars <- names(bestmodel$trainingData)[-which(
names(bestmodel$trainingData)==".outcome")]
nextvars <- names(predictors)[-which(
names(predictors)%in%startvars)]

if(verbose){
print(paste0("Searching for additional variable ", minVar + k, " now. ",
length(nextvars), " potential predictors are available:"))
print(nextvars)
}


# search best additional variable in parallel
next_models <- parallel::mclapply(1:length(nextvars), mc.cores = cores, FUN = function(i){


set.seed(seed)

#adaptation for pls:
tuneGrid_orig <- tuneGrid
if(method=="pls"&!is.null(tuneGrid)&any(tuneGrid$ncomp>ncol(predictors[,c(startvars,nextvars[i])]))){
tuneGrid<- data.frame(ncomp=tuneGrid[tuneGrid$ncomp<=ncol(predictors[,c(startvars,nextvars[i])]),])
if(verbose){
print(paste0("note: maximum ncomp is ", ncol(predictors[,c(startvars,nextvars[i])])))
}}
#adaptation for ranger:
if(method=="ranger"&!is.null(tuneGrid)&any(tuneGrid$mtry>ncol(predictors[,c(startvars,nextvars[i])]))){
tuneGrid$mtry[tuneGrid$mtry>ncol(predictors[,c(startvars,nextvars[i])])] <- ncol(predictors[,c(startvars,nextvars[i])])
if(verbose){
print("invalid value for mtry. Reset to valid range.")
}
}

model <- caret::train(predictors[,c(startvars,nextvars[i])],
response,
method = method,
metric=metric,
trControl = trControl,
tuneLength = tuneLength,
tuneGrid = tuneGrid,
...)
tuneGrid <- tuneGrid_orig



if (globalval){
perf_stats <- global_validation(model)[names(global_validation(model))==metric]
}else{
perf_stats <- model$results[,names(model$results)==metric]
}


startvars
result = as.data.frame(t(startvars))
result$nextvar = nextvars[i]
result$actmodelperf <- evalfunc(perf_stats)
result$actmodelperfSE <- se(
sapply(unique(model$resample$Resample),
FUN=function(x){mean(model$resample[model$resample$Resample==x,
metric],na.rm=TRUE)}))

return(result)

})

next_models = do.call(rbind, next_models)


## best next_model
best_next_rowindex = ifelse(maximize,
which.max(next_models[,(ncol(next_models)-1)]),
which.min(next_models[,(ncol(next_models)-1)]))

better = isBetter(actmodelperf = next_models$actmodelperf[best_next_rowindex],
bestmodelperf = bestmodelperf,
bestmodelperfSE = bestmodelperfSE,
maximization = maximize, withinSE = withinSE)



# patching perf_all
perf_all[(acc+1):(acc+length(nextvars)), 1:(minVar+k)] <- next_models[,1:(minVar+k)]
perf_all[(acc+1):(acc+length(nextvars)), (ncol(perf_all)-2):(ncol(perf_all)-1)] <- next_models[,(ncol(next_models)-1):ncol(next_models)]
perf_all$nvar[(acc+1):(acc+length(nextvars))] <- minVar+k

if(better){
# update best model stats
bestmodelperf = next_models$actmodelperf[best_next_rowindex]
bestmodelperfSE = next_models$actmodelperfSE[best_next_rowindex]
best_predictors = as.character(next_models[best_next_rowindex, 1:(minVar+k)])

selectedvars_perf = c(selectedvars_perf, bestmodelperf)
selectedvars_SE = c(selectedvars_SE, bestmodelperfSE)


bestmodel <- caret::train(predictors[,best_predictors],
response,
method=method,
metric=metric,
trControl=trControl,
tuneLength = tuneLength,
tuneGrid = tuneGrid,
...)


acc = acc+nrow(next_models)


}else{
# not better: return model and stats
message(paste0("Note: No increase in performance found using more than ",
length(startvars), " variables"))
bestmodel$selectedvars <- best_predictors
bestmodel$selectedvars_perf <- selectedvars_perf
bestmodel$selectedvars_perf_SE <- selectedvars_SE
bestmodel$perf_all <- perf_all
bestmodel$perf_all <- bestmodel$perf_all[!apply(is.na(bestmodel$perf_all), 1, all),]
bestmodel$perf_all <- bestmodel$perf_all[colSums(!is.na(bestmodel$perf_all)) > 0]
bestmodel$minVar <- minVar
bestmodel$type <- "ffs"
class(bestmodel) <- c("ffs", "train")
return(bestmodel)

}




}# end of k loop




}else{
## unparallel -----
for (k in 1:(length(names(predictors))-minVar)){
startvars <- names(bestmodel$trainingData)[-which(
names(bestmodel$trainingData)==".outcome")]
Expand Down Expand Up @@ -469,21 +663,10 @@ ffs <- function (predictors,
}
}

# Old version that is not using global_validation:
# if (maximize){
# selectedvars_perf <- c(selectedvars_perf,max(bestmodel$results[,metric]))
# if(verbose){
# print(paste0(paste0("vars selected: ",paste(selectedvars, collapse = ',')),
# " with ", metric," ",round(max(bestmodel$results[,metric]),3)))
# }
# }
# if (!maximize){
# selectedvars_perf <- c(selectedvars_perf,min(bestmodel$results[,metric]))
# if(verbose){
# print(paste0(paste0("vars selected: ",paste(selectedvars, collapse = ',')),
# " with ",metric," ",round(min(bestmodel$results[,metric]),3)))
# }
# }
}
## return best model --------



bestmodel$selectedvars <- selectedvars
bestmodel$selectedvars_perf <- selectedvars_perf
Expand All @@ -501,7 +684,3 @@ ffs <- function (predictors,
}






Loading

0 comments on commit 8db65b0

Please sign in to comment.