.Rhistory

#' @param seen.proportion Proportion of features which must have been seen before eGA can terminate (default:0.5)
#' @param max.iterations Maximum number of iterations which can be performed. If the eGA does not terminate before this point, the feature set at the max.iterations iteration will be returned (default:500)
#' @param kernel Type of kernel to be used in SVM, from linear, polynomial and radial (default: linear)
#' @param degree Degree of the polynomial kernel; the linear and radial kernels do not use it (default: 3)
#' @param include.ffs.plot Whether accuracy/sensitivity/specificity across features should be plotted for each forward.feature.selection call (default: FALSE)
#' @return List containing (ordered list of) selected features, performance percentages, accessed using training_accuracy, test_accuracy, training_sensitivity, test_sensitivity, training_specificity, test_specificity.
#' @keywords eGA
#' @keywords embryonic
#' @keywords feature selection
#' @keywords genetic algorithm
#' @export
#' @examples
#' data.url <- RCurl::getURL("https://raw.githubusercontent.com/Core-Bioinformatics/feamiR/main/inst/samples/subsamples/sample0.csv")
#' data = read.csv(text = data.url,row.names=1)
#' data = rbind(head(data,50),tail(data,50))
#' data$classification = as.factor(data$classification)
#' eGA(num.features = 30, data,num.folds = 2, max.iterations = 20,include.ffs.plot = TRUE,measure = 'test')
eGA <- function(num.features = 30,
                data,
                num.folds = 10,
                seen.proportion = 0.5,
                measure = 'training',
                max.iterations = 500,
                kernel='linear',
                degree=3,
                include.ffs.plot = FALSE){
  # Entry point: builds the state shared by all iterations (candidate feature
  # names, per-feature strength counters, cross-validation fold assignment)
  # and returns whatever the first call to eGA.iteration returns.

  # Every column other than the class label is a candidate feature
  column.names <- colnames(data)
  candidate.features <- column.names[column.names != 'classification']

  # One counter per candidate feature, all starting at zero
  strength.counter <- stats::setNames(rep(0, length(candidate.features)),
                                      candidate.features)

  # Randomly assign each row of data to one of num.folds folds; the same
  # assignment is reused by every model fitted during the run
  fold.assignment <- sample(1:num.folds, nrow(data), replace=TRUE)

  # Kick off the first iteration with an empty feature set
  eGA.iteration(current.features = c(),
                total.features = candidate.features,
                num.features = num.features,
                data = data,
                partition.vector = fold.assignment,
                current.iteration = 1,
                features.seen = c(),
                seen.proportion = seen.proportion,
                measure = measure,
                feature.strength = strength.counter,
                max.iterations = max.iterations,
                kernel = kernel,
                degree = degree,
                include.ffs.plot = include.ffs.plot)
}
eGA.iteration <- function(current.features,
                          total.features,
                          num.features = 30,
                          data,
                          partition.vector,
                          current.iteration,
                          features.seen = c(),
                          seen.proportion = 0.5,
                          measure = 'training',
                          feature.strength,
                          max.iterations = 500,
                          kernel='linear',
                          degree=3,
                          include.ffs.plot = FALSE){
  # Runs one iteration of the eGA and recurses until the stopping rule is met.
  #
  # current.features   features carried over from the previous iteration
  # total.features     all candidate feature names
  # num.features       size the working feature set is topped up to
  # data               data frame with a 'classification' column plus features
  # partition.vector   fold id for every row of data
  # current.iteration  iteration counter (the caller starts it at 1)
  # features.seen      every feature that has been in a working set so far
  # seen.proportion    fraction of total.features that must have been seen
  # measure            prefix of the '<measure>_accuracy' column that is optimised
  # feature.strength   named counter: how often each feature made the FFS cut
  # max.iterations     upper bound on the number of iterations
  # kernel, degree     forwarded to the SVM helpers
  # include.ffs.plot   print the FFS performance plot in every iteration
  #
  # Returns a list with 'features' and the six training/test
  # accuracy/sensitivity/specificity values of the final feature set.
  message(paste('Iteration',current.iteration))

  # Top the working set up to num.features with randomly drawn unused features
  features.to.select <- num.features - length(current.features)
  if (features.to.select > 0){
    current.features <- c(current.features,
                          sample(setdiff(total.features,current.features),features.to.select))
  }
  # Record the newly drawn features as seen
  features.seen <- union(features.seen, current.features)

  # Rank the working set by forward feature selection
  ffs <- forward.feature.selection(model = svm.partition,
                                   data = data,
                                   feature.list = current.features,
                                   partition.vector = partition.vector,
                                   measure = measure,
                                   num.features = length(current.features),
                                   kernel = kernel,
                                   degree = degree)
  if (include.ffs.plot){
    print(forward.feature.selection.plot(ffs))
  }

  # Keep the features up to the point where the FFS performance curve plateaus
  ffs.plateau <- find.plateau(measure.vector = ffs[,paste0(measure,'_accuracy')])
  ffs.selected.features <- ffs$feature[seq_len(ffs.plateau)]

  # Each feature that made the cut gains one unit of strength
  feature.strength[ffs.selected.features] <- feature.strength[ffs.selected.features] + 1

  # Run crossover
  crossover.result <- crossover(ffs.selected.features,
                                total.features,
                                num.features,
                                data,
                                partition.vector,
                                measure,
                                kernel,
                                degree)
  # Run mutation on the crossover winner (this step previously called
  # crossover a second time, so mutation() was never exercised)
  mutation.result <- mutation(crossover.result[['features']],
                              total.features,
                              num.features,
                              data,
                              partition.vector,
                              measure,
                              kernel,
                              degree)
  selected.features <- mutation.result[['features']]
  selected.performance <- mutation.result[['performance']]

  # Go to the next iteration while
  #   (too few features seen OR the set changed OR we are within the first 5 iterations)
  #   AND the iteration budget is not exhausted
  #   AND the candidate set does not already hold exactly num.features features.
  # All operands are scalars, so the short-circuit operators are used.
  keep.searching <- length(features.seen) < seen.proportion * length(total.features) ||
    !identical(current.features, selected.features) ||
    current.iteration <= 5
  if (keep.searching &&
      current.iteration < max.iterations &&
      length(selected.features) != num.features){
    # The next working set is the FFS selection plus every feature that was
    # selected in more than half of the iterations so far
    over.half.features <- names(feature.strength)[feature.strength > (current.iteration/2)]
    selected.features <- union(ffs.selected.features, over.half.features)
    return(
      eGA.iteration(current.features = selected.features,
                    total.features = total.features,
                    num.features = num.features,
                    data = data,
                    partition.vector = partition.vector,
                    current.iteration = current.iteration + 1,
                    features.seen = features.seen,
                    seen.proportion = seen.proportion,
                    measure = measure,
                    feature.strength = feature.strength,
                    max.iterations = max.iterations,
                    kernel = kernel,
                    degree = degree,
                    # forwarded so the plot is produced in every iteration,
                    # not only the first one
                    include.ffs.plot = include.ffs.plot)
    )
  }

  # Stopping rule met: return the crossover/mutation winner and its performance
  list('features' = selected.features,
       'training_accuracy' = selected.performance$training_accuracy,
       'test_accuracy' = selected.performance$test_accuracy,
       'training_sensitivity' = selected.performance$training_sensitivity,
       'test_sensitivity' = selected.performance$test_sensitivity,
       'training_specificity' = selected.performance$training_specificity,
       'test_specificity' = selected.performance$test_specificity)
}
# Forward Feature Selection and Genetic Algorithm components --------------------
# Performs forward feature selection (FFS) for up to num.features steps.
#
# model             function(data, partition.vector, kernel, degree) returning
#                   a list of training/test accuracy, sensitivity, specificity
# data              data frame with a 'classification' column plus features
# feature.list      names of the features FFS may choose from
# partition.vector  fold id for every row of data
# kernel, degree    forwarded to model
# num.features      maximum number of features to add
# measure           forwarded to next.best.feature to pick the ranking metric
#
# Returns a data frame with one row per step: the feature added at that step,
# the number of features in the model, and the six performance values.
forward.feature.selection <-function(model=svm.partition,
                                     data,
                                     feature.list,
                                     partition.vector,
                                     kernel='linear',
                                     degree=3,
                                     num.features=50,
                                     measure='train'){
  # Never ask for more steps than there are distinct candidate features;
  # otherwise the selection runs dry and next.best.feature fails.
  num.candidates <- length(setdiff(feature.list, 'classification'))
  num.steps <- max(0, min(num.features, num.candidates))

  # Initialise list of features and performance dataframe
  ongoing<-c()
  featureaccuracy <- data.frame(matrix(nrow = num.steps, ncol = 0))
  # seq_len() yields an empty loop for zero steps (1:0 would run twice)
  for (i in seq_len(num.steps)){
    # Choose next best feature among those not yet selected
    b <- next.best.feature(model,data,
                           partition.vector,
                           feature.list[! feature.list %in% c('classification', ongoing ) ],
                           ongoing,
                           kernel = kernel,
                           degree = degree,
                           measure = measure)
    ongoing <- c(ongoing, b$feature)
    featureaccuracy[i,'feature'] <- b$feature
    featureaccuracy[i,'numfeatures'] <- i
    featureaccuracy[i,'training_accuracy'] <- b$training_accuracy
    featureaccuracy[i,'test_accuracy'] <- b$test_accuracy
    featureaccuracy[i,'training_specificity'] <- b$training_specificity
    featureaccuracy[i,'test_specificity'] <- b$test_specificity
    featureaccuracy[i,'test_sensitivity'] <- b$test_sensitivity
    featureaccuracy[i,'training_sensitivity'] <- b$training_sensitivity
  }
  featureaccuracy
}
# Selects the feature which, added to the ongoing features, maximises accuracy.
#
# model             function(data, partition.vector, kernel, degree) returning
#                   a list with training/test accuracy, sensitivity, specificity
# data              data frame with a 'classification' column plus features
# partition.vector  fold id for every row of data
# featuresleft      candidate feature names still available
# ongoingfeatures   feature names already selected
# kernel, degree    forwarded to model
# measure           'train' or 'training' ranks by training accuracy; any other
#                   value ranks by test accuracy
#
# Returns a list with the chosen 'feature' and its six performance values.
# On ties the candidate evaluated last wins.
next.best.feature<-function(model,
                            data,
                            partition.vector,
                            featuresleft,
                            ongoingfeatures,
                            kernel='linear',
                            degree=3,
                            measure='train'){
  if (length(featuresleft) == 0){
    stop('next.best.feature: no candidate features left to evaluate', call. = FALSE)
  }
  # The rest of the file spells the measure 'training' (it is pasted into
  # column names such as 'training_accuracy'), so both spellings must select
  # the training metric; previously 'training' silently ranked by test accuracy.
  use.training <- measure %in% c('train', 'training')

  maxaccuracy <- 0
  maxfeat <- featuresleft[1]
  best.results <- NULL
  # Loop over all remaining features and train models on the existing features and the chosen one
  for (feat in featuresleft){
    feat_data <- data[,c('classification',ongoingfeatures,feat)]
    featresults <- model(feat_data,
                         partition.vector,
                         kernel=kernel,
                         degree=degree)
    if (use.training){
      feataccuracy <- featresults$training_accuracy
    }
    else{
      feataccuracy <- featresults$test_accuracy
    }
    # If this is the best feature seen so far then choose it
    if (maxaccuracy <= feataccuracy){
      maxaccuracy <- feataccuracy
      maxfeat <- feat
      best.results <- featresults
    }
  }
  # Return feature and performance for the best found feature
  list("feature" = maxfeat,
       "training_accuracy" = best.results$training_accuracy,
       "test_accuracy" = best.results$test_accuracy,
       "training_sensitivity" = best.results$training_sensitivity,
       "test_sensitivity" = best.results$test_sensitivity,
       "training_specificity" = best.results$training_specificity,
       "test_specificity" = best.results$test_specificity)
}
crossover <- function(selected.features,
                      total.features,
                      num.features,
                      data,
                      partition.vector,
                      measure,
                      kernel,
                      degree){
  # One-point crossover between the selected feature set ("a") and a random
  # set of equally many unused features ("b").
  #
  # Four sets are scored with svm.partition: aa (selected), bb (alternative),
  # ab (head of a + tail of b) and ba (head of b + tail of a). The set with the
  # highest '<measure>_accuracy' wins; ties are resolved in the order
  # bb, ab, ba, aa. num.features is accepted for signature symmetry with
  # mutation() and is not used here.
  #
  # Returns list(features = winning set, performance = its svm.partition result).
  num.selected <- length(selected.features)
  if (num.selected == 0){
    stop('crossover: selected.features is empty', call. = FALSE)
  }
  # Choose split point
  pos <- sample(seq_len(num.selected), 1)
  # Choose second set of features to compare against
  alternative.features <- sample( setdiff( total.features, selected.features) , num.selected )
  # Create crossover sets: the first pos entries of one parent followed by the
  # remaining num.selected - pos entries of the other, so each child has
  # exactly num.selected features. (The previous head(pos - 1) /
  # tail(n - pos - 1) split lost two features per child, and for pos == n the
  # negative tail() count meant "all but the first", nearly doubling the set.)
  tail.length <- num.selected - pos
  ab <- union( head(selected.features, pos), tail(alternative.features, tail.length) )
  ba <- union( head(alternative.features, pos), tail(selected.features, tail.length) )
  # Train models on all 4 versions
  model.aa <- svm.partition( data[, c('classification', selected.features) ], partition.vector, kernel, degree)
  model.bb <- svm.partition( data[, c('classification', alternative.features) ], partition.vector, kernel, degree)
  model.ab <- svm.partition( data[, c('classification', ab) ], partition.vector, kernel, degree)
  model.ba <- svm.partition( data[, c('classification', ba) ], partition.vector, kernel, degree)
  # Record performance of each set of models
  accuracy.key <- paste0(measure, '_accuracy')
  acc.aa <- model.aa[[accuracy.key]]
  acc.bb <- model.bb[[accuracy.key]]
  acc.ab <- model.ab[[accuracy.key]]
  acc.ba <- model.ba[[accuracy.key]]
  best.accuracy <- max(c(acc.aa, acc.bb, acc.ab, acc.ba))
  # Choose the best version
  if (acc.bb == best.accuracy){
    message('Using crossover feature set')
    best.model <- model.bb
    best.features <- alternative.features
  }
  else if (acc.ab == best.accuracy){
    message('Using crossover feature set')
    best.model <- model.ab
    best.features <- ab
  }
  else if (acc.ba == best.accuracy){
    message('Using crossover feature set')
    best.model <- model.ba
    best.features <- ba
  }
  else {
    best.model <- model.aa
    best.features <- selected.features
  }
  list('features' = best.features,
       'performance' = best.model)
}
mutation <- function(selected.features,
                     total.features,
                     num.features,
                     data,
                     partition.vector,
                     measure,
                     kernel,
                     degree){
  # Point mutation: swap one randomly chosen selected feature for one randomly
  # chosen unused feature and keep whichever set scores the higher
  # '<measure>_accuracy' under svm.partition. The original set wins ties.
  # Returns list(features = winning set, performance = its svm.partition result).

  # Feature to drop, then feature to bring in (in this order)
  outgoing <- sample( selected.features, 1)
  incoming <- sample( setdiff(total.features, selected.features), 1)
  mutated.features <- c(selected.features[selected.features!=outgoing],incoming)

  # Score the untouched set and the mutated set
  accuracy.key <- paste0(measure,'_accuracy')
  fit.original <- svm.partition( data[, c('classification',selected.features) ], partition.vector, kernel, degree)
  fit.mutated <- svm.partition( data[, c('classification',mutated.features) ], partition.vector, kernel, degree)
  score.original <- fit.original[[accuracy.key]]
  score.mutated <- fit.mutated[[accuracy.key]]

  # The mutation is only adopted when it is strictly better
  original.wins <- score.original == max(score.original, score.mutated)
  if (!original.wins){
    message('Using mutation feature set')
    return(list('features' = mutated.features,
                'performance' = fit.mutated))
  }
  list('features' = selected.features,
       'performance' = fit.original)
}
# Helper functions --------------------------------------------------------------
# Calculates accuracy, sensitivity and specificity for a fitted classifier and
# can also plot predicted against true labels.
#
# classifier   fitted model understood by stats::predict(type = 'class')
# data         data frame with a 'classification' column plus the features
# label        plot title (only used when includeplot is TRUE)
# includeplot  draw the ggplot2 scatter of predicted vs. true labels
#
# Returns list(accuracy, sensitivity, specificity). The first table row/column
# is treated as class 0 and the second as class 1. Sensitivity or specificity
# is NaN when the corresponding true class has no rows.
classifier.performance <- function(classifier,
                                   data,
                                   label,
                                   includeplot=FALSE){
  feature.columns <- colnames(data)[colnames(data) != 'classification']
  predicted.label <- stats::predict(classifier,
                                    data[, feature.columns ],
                                    type = 'class')
  # Look the truth up by name, matching how the feature columns are chosen
  # above, rather than assuming it is the first column
  true.label <- data[['classification']]

  pred.table <- table(true.label, predicted.label)
  if (nrow(pred.table) != 2 || ncol(pred.table) != 2){
    # A class is missing from the truth or the predictions: rebuild the table
    # on the fixed 0/1 coding so absent classes appear as zero counts. (The
    # earlier padding code turned the table into a data.frame and then tested
    # a length-2 condition in if(), which fails.)
    pred.table <- table(factor(as.character(true.label), levels = c('0','1')),
                        factor(as.character(predicted.label), levels = c('0','1')))
  }
  dimnames(pred.table) <- list(c('true_0','true_1'), c('pred_0','pred_1'))

  accuracy <- sum(diag(pred.table)) / sum(pred.table)
  sensitivity <- pred.table['true_1','pred_1']/
    (pred.table['true_1','pred_0'] + pred.table['true_1','pred_1'])
  specificity <- pred.table['true_0','pred_0']/
    (pred.table['true_0','pred_0'] + pred.table['true_0','pred_1'])
  return_list <- list('accuracy'=accuracy,
                      'sensitivity'=sensitivity,
                      'specificity'=specificity)

  if (includeplot==TRUE){
    # The plotting data is only built when a plot is requested, so the
    # metrics-only path does not need tibble or ggplot2
    results <- tibble::tibble(y_real  = factor(true.label, levels=c(0,1)),
                              y_pred  = factor(predicted.label, levels=c(0,1)),
                              Correct = factor(ifelse(y_real == y_pred,"yes","no"),
                                               levels=c('yes','no')))
    title <- paste0(label)
    xlab  <- 'True label'
    ylab  <- 'Predicted label'
    plot(ggplot2::ggplot(results, ggplot2::aes(x = y_pred, y = y_real, colour = Correct)) +
           ggplot2::geom_point() +
           ggplot2::ggtitle(label = title, subtitle = paste0("Accuracy = ", 100 * round(accuracy,3),"%")) +
           ggplot2::xlab(xlab) +
           ggplot2::ylab(ylab) +
           ggplot2::scale_color_manual(labels = c('Yes', 'No'),values = c('cornflowerblue','tomato')) +
           ggplot2::geom_jitter() +
           ggplot2::theme_bw()+
           ggplot2::labs(colour = 'Correct'))
  }
  return_list
}
# Fits an e1071 C-classification SVM on data_train and scores it on both
# data_train and data_test with classifier.performance.
#
# data_train, data_test  data frames with a 'classification' column plus features
# kernel       'linear', 'radial', or 'polynomial' (the short form 'poly' is
#              also accepted); other values are passed to e1071::svm unchanged
# degree       degree of the polynomial kernel; ignored by the other kernels
# includeplot  forwarded to classifier.performance
#
# Returns a list with test/training accuracy, sensitivity and specificity.
svm<-function(data_train,
              data_test,
              kernel='linear',
              degree=3,
              includeplot=FALSE){
  feature.columns <- colnames(data_train)[colnames(data_train) != 'classification']
  # 'polynomial' is the documented spelling; it used to fall through to the
  # generic branch, where the requested degree was silently dropped.
  if (kernel %in% c('poly', 'polynomial')){
    classifier <- suppressWarnings(e1071::svm(x = data_train[, feature.columns ],
                                              y = data_train$classification,
                                              type = 'C-classification',
                                              kernel = 'polynomial',
                                              degree = degree))
  }
  else {
    # Linear, radial and any other kernel take no degree. The former radial
    # branch passed coef0 = 0 and degree = 3, which are e1071::svm's defaults,
    # so it is covered by this call.
    classifier <- suppressWarnings(e1071::svm(x = data_train[, feature.columns ],
                                              y = data_train$classification,
                                              type = 'C-classification',
                                              kernel = kernel))
  }
  training <- suppressWarnings(classifier.performance(classifier,
                                                      data_train,
                                                      label = 'SVM - training performance',
                                                      includeplot = includeplot))
  test <- suppressWarnings(classifier.performance(classifier,
                                                  data_test,
                                                  label = 'SVM - test performance',
                                                  includeplot = includeplot))
  list("test_accuracy" = test$accuracy,
       "test_sensitivity" = test$sensitivity,
       "test_specificity" = test$specificity,
       "training_accuracy" = training$accuracy,
       "training_sensitivity" = training$sensitivity,
       "training_specificity" = training$specificity)
}
# Cross-validated SVM driven by a caller-supplied fold assignment,
# e.g. partition.vector = sample(1:10, nrow(mydata), replace=T).
# Each distinct fold id is held out once as the test set while the remaining
# rows train the model; the six performance values are averaged over folds.
svm.partition <- function(data,
                          partition.vector,
                          kernel='linear',
                          degree = 3){
  # Fit and score one SVM per held-out fold, keeping the per-fold results
  per.fold <- list()
  for (fold in unique(partition.vector)){
    held.out <- partition.vector == fold
    per.fold[[length(per.fold) + 1]] <- svm(data[!held.out,],
                                            data[held.out,],
                                            kernel = kernel,
                                            degree = degree)
  }
  # Mean of one named metric across all folds
  fold.mean <- function(metric){
    mean(unlist(lapply(per.fold, function(one.fold) one.fold[[metric]])))
  }
  list("test_accuracy" = fold.mean('test_accuracy'),
       "test_sensitivity" = fold.mean('test_sensitivity'),
       "test_specificity" = fold.mean('test_specificity'),
       "training_accuracy" = fold.mean('training_accuracy'),
       "training_sensitivity" = fold.mean('training_sensitivity'),
       "training_specificity" = fold.mean('training_specificity'))
}
# Find the plateau in an FFS performance curve.
#
# measure.vector     performance value per number of features
# plateau.threshold  maximum spread of the smoothed curve inside a window for
#                    that window to count as flat
# num.each.way       window half-width (positions on each side of the centre)
# span               NOTE(review): accepted but never passed to loess, so the
#                    loess default span applies - confirm whether it should be
#
# The curve is smoothed with a degree-1 loess fit. Returns the first centre
# position whose window is flat, or length(measure.vector) if none is.
find.plateau <- function(measure.vector,
                         plateau.threshold = 0.01,
                         num.each.way = 2,
                         span = 0.2){
  num.points <- length(measure.vector)
  # A full window needs 2 * num.each.way + 1 points. With fewer, the a:b range
  # below would run backwards and index past the ends of the vector, so report
  # "no plateau" in the same way as a curve that never flattens.
  if (num.points < 2 * num.each.way + 1){
    return(num.points)
  }
  x <- seq_len(num.points)
  y <- measure.vector
  loess.train <- stats::loess( y ~ x, degree = 1)
  loess.predict <- stats::predict(loess.train, newdata = x, se = FALSE)
  for (pos in (num.each.way + 1) : (num.points - num.each.way)){
    window.values <- loess.predict[ (pos - num.each.way) : (pos + num.each.way) ]
    if (max(window.values) - min(window.values) < plateau.threshold){
      return(pos)
    }
  }
  num.points
}
# Create the FFS performance plot from forward.feature.selection output.
#
# df  data frame with columns 'feature', 'numfeatures' and performance columns
#     named '<type>_<measure>' (e.g. 'training_accuracy')
#
# Returns a ggplot object with one smoothed line per type/measure pair; the
# x axis is labelled with the feature added at each step.
forward.feature.selection.plot <- function(df){
  listoffeatures <- df$feature
  # Long format: one row per (step, performance column)
  df.melt <- reshape2::melt(df,
                            id.vars = c('feature','numfeatures'))
  # Split e.g. 'training_accuracy' into type = 'training', measure = 'accuracy'
  df.melt <- tidyr::separate(data = df.melt, col = variable,
                             into = c("type", "measure"),
                             sep = "\\_")
  # The points sit at x = numfeatures, so the labelled breaks must be placed
  # there too; breaks at 0..n-1 shifted every feature label one tick left.
  step.positions <- df$numfeatures
  p <- ggplot2::ggplot(df.melt, ggplot2::aes(x = numfeatures,
                                             y = value,
                                             colour = measure,
                                             linetype = type,
                                             shape = type))+
    ggplot2::geom_smooth(formula = y ~ x, method = 'loess', se = FALSE)+
    ggplot2::scale_x_continuous(minor_breaks = step.positions,
                                breaks = step.positions,
                                labels = listoffeatures)+
    ggplot2::ylab('Performance')+
    ggplot2::xlab('Feature')+
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
  p
}
# Interactive run of the eGA example: download sample0.csv as text, keep its
# first 50 and last 50 rows, and turn the class label into a factor.
data.url <- RCurl::getURL("https://raw.githubusercontent.com/Core-Bioinformatics/feamiR/main/inst/samples/subsamples/sample0.csv")
data = read.csv(text = data.url,row.names=1)
data = rbind(head(data,50),tail(data,50))
data$classification = as.factor(data$classification)
# Short run: 2 folds, at most 20 iterations, optimising test accuracy
eGA(num.features = 30, data,num.folds = 2, max.iterations = 20,include.ffs.plot = TRUE,measure = 'test')
# NOTE(review): RCurl is attached only here, after RCurl::getURL was already
# called above, and lib.loc is a machine-specific Windows path.
library(RCurl, lib.loc = "C:/Program Files/R/R-4.0.3/library")
# Fetches the same URL into data.url again; nothing in the visible lines uses it
data.url <- RCurl::getURL("https://raw.githubusercontent.com/Core-Bioinformatics/feamiR/main/inst/samples/subsamples/sample0.csv")