Merge pull request #6 from ParkerICI/negative-values

Negative values
ParkerICI · Jun 21, 2019 · 74260c4 · 74260c4
2 parents 17d3058 + 834fd2a
commit 74260c4
Show file tree

Hide file tree

Showing 13 changed files with 210 additions and 47 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: grappolo
 Type: Package
 Title: Feature generation from single-cell data
-Version: 0.4.4
+Version: 0.5.1
 Authors@R: "Pier Federico Gherardini <[email protected]> [aut, cre]"
 Description: This is a package to cluster single-cell flow data and generate
     features that can be used for model building

diff --git a/R/cluster.R b/R/cluster.R
@@ -65,7 +65,8 @@ cluster_data <- function(tab, col.names, k, ...) {
 #' @inheritParams cluster_fcs_files
 #'
 #'
-process_files_groups <- function(files, col.names, num.clusters, num.samples, asinh.cofactor, downsample.to, output.dir) {
+process_files_groups <- function(files, col.names, num.clusters, num.samples, asinh.cofactor,
+                                 downsample.to, output.dir, negative.values, quantile.prob) {
     tab <- NULL
     orig.data <- NULL
 
@@ -81,7 +82,7 @@ process_files_groups <- function(files, col.names, num.clusters, num.samples, as
 
         fcs.file <- flowCore::read.FCS(f)
         temp.orig.data <- flowCore::exprs(fcs.file)
-        temp.tab <- convert_fcs(fcs.file, asinh.cofactor)
+        temp.tab <- convert_fcs(fcs.file, asinh.cofactor, negative.values = negative.values, quantile.prob = quantile.prob)
 
         if(downsample.to > 0) {
             x <- NULL
@@ -123,10 +124,10 @@ process_files_groups <- function(files, col.names, num.clusters, num.samples, as
 #' @param f The file path
 #' @inheritParams cluster_fcs_files
 #'
-process_file <- function(f, col.names, num.clusters, num.samples, asinh.cofactor, output.dir) {
+process_file <- function(f, col.names, num.clusters, num.samples, asinh.cofactor, output.dir, negative.values, quantile.prob) {
     fcs.file <- flowCore::read.FCS(f)
     orig.data <- flowCore::exprs(fcs.file)
-    tab <- convert_fcs(fcs.file, asinh.cofactor)
+    tab <- convert_fcs(fcs.file, asinh.cofactor, negative.values = negative.values, quantile.prob = quantile.prob)
 
     m <- grappolo:::cluster_data(tab, col.names, k = num.clusters, sampsize = min(nrow(tab), 1000), samples = num.samples)
     colnames(m) <- gsub("groups", "cellType", colnames(m))
@@ -199,18 +200,20 @@ cluster_fcs_files_in_dir <- function(wd, ...) {
 #' @param num.cores Number of CPU cores to use
 #' @param col.names A vector of column names indicating which columns should be used for clustering
 #' @param num.clusters The desired number of clusters
-#' @param asinh.cofactor Cofactor for asinh transformation. If this is \code{NULL} no transformation is performed (see \code{convert_fcs})
 #' @param num.samples Number of samples to be used for the CLARA algorithm (see \code{cluster::clara})
 #' @param output.dir The name of the output directory, it will be created if it does not exist
+#' @inheritParams convert_fcs
 #' @return Returns either \code{NULL} or a \code{try-error} object if some error occurred during the computation
 #' @export
-cluster_fcs_files <- function(files.list, num.cores, col.names, num.clusters, asinh.cofactor, num.samples = 50, output.dir = ".") {
+cluster_fcs_files <- function(files.list, num.cores, col.names, num.clusters, asinh.cofactor,
+                              num.samples = 50, output.dir = ".", negative.values = "truncate", quantile.prob = 0.05) {
     if(!dir.exists(output.dir))
         dir.create(output.dir, recursive = TRUE, showWarnings = TRUE)
 
     parallel::mclapply(files.list, mc.cores = num.cores, mc.preschedule = FALSE,
                        process_file, col.names = col.names, num.clusters = num.clusters,
-                       num.samples = num.samples, asinh.cofactor = asinh.cofactor, output.dir = output.dir)
+                       num.samples = num.samples, asinh.cofactor = asinh.cofactor,
+                       output.dir = output.dir, negative.values = negative.values, quantile.prob = quantile.prob)
 }
 
 
@@ -227,7 +230,8 @@ cluster_fcs_files <- function(files.list, num.cores, col.names, num.clusters, as
 #'
 #' @export
 cluster_fcs_files_groups <- function(files.list, num.cores, col.names, num.clusters, asinh.cofactor,
-                                        num.samples = 50, downsample.to = 0, output.dir = ".") {
+                                        num.samples = 50, downsample.to = 0, output.dir = ".", negative.values = "truncate",
+                                        quantile.prob = 0.05) {
 
     files.list <- lapply(names(files.list), function(x) {
         c(x, files.list[[x]])
@@ -238,7 +242,8 @@ cluster_fcs_files_groups <- function(files.list, num.cores, col.names, num.clust
 
     parallel::mclapply(files.list, mc.cores = num.cores, mc.preschedule = FALSE,
                        process_files_groups, col.names = col.names, num.clusters = num.clusters, num.samples = num.samples,
-                       asinh.cofactor = asinh.cofactor, downsample.to = downsample.to, output.dir = output.dir)
+                       asinh.cofactor = asinh.cofactor, downsample.to = downsample.to,
+                       output.dir = output.dir, negative.values = negative.values, quantile.prob = quantile.prob)
 
 }
 

diff --git a/R/features.R b/R/features.R
@@ -86,10 +86,10 @@ multistep_normalize <- function(tab, norm.template, subject.var) {
 #' stimulation conditions.
 #' In this case the \code{metadata.tab} would look like this
 #' \itemize{
-#'   \item{\code{file}}{The names of the data files that contain data for each sample. These must match the names in the clustering results (see above)}
-#'   \item{\code{timepoint}}{The timepoint information}
-#'   \item{\code{condition}}{The stimulation condition}
-#'   \item{\code{subject}}{The subjet each file was derived from}
+#'   \item{\code{file}}{ The names of the data files that contain data for each sample. These must match the names in the clustering results (see above)}
+#'   \item{\code{timepoint}}{ The timepoint information}
+#'   \item{\code{condition}}{ The stimulation condition}
+#'   \item{\code{subject}}{ The subjet each file was derived from}
 #' }
 #' Let's assume a few different scenarios.
 #' \enumerate{
@@ -111,20 +111,24 @@ multistep_normalize <- function(tab, norm.template, subject.var) {
 #'   (see Details). The combination of \code{predictors} and \code{endpoint.grouping} must uniquely identify every row in \code{metadata.tab}.
 #'   The function will throw an error if this is not the case.
 #' @param filename.col The name of the column in \code{metadata.tab} that is used to identify the file names in tab
-#' @return Returns a data frame whose format depends on the value of the \code{format} parameter
+#' @return Returns a data frame whose format depends on the value of the \code{out.format} parameter
 #'   \itemize{
 #'     \item{table}: each row corresponds to a combination of the levels of the variables specified in \code{endpoint.grouping}, and the columns are
 #'     cluster features, which are combinations of the levels of the \code{predictors} for each feature specified in \code{features.names}
 #'     \item{tidy}: there is a single numeric column, and all the other columns represent variables whose combinations uniquely identify each observation (i.e. each row)
 #'   }
 #' @export
 
-get_cluster_features <- function(tab, metadata.tab, features.names, out.format = "table", predictors = NULL, endpoint.grouping = NULL, filename.col = "file") {
+get_cluster_features <- function(tab, metadata.tab, features.names, out.format = "table", predictors = NULL,
+                                 endpoint.grouping = NULL, filename.col = "file", transform.popsize = TRUE) {
     out.format <- match.arg(out.format, c("table", "tidy"))
-    m <- reshape_cluster_features(tab, features.names)
+    m <- reshape_cluster_features(tab, features.names, transform.popsize)
 
     df <- reshape::melt(m, varnames = c(filename.col, "variable"))
 
+    # Restore the original value of filename.col
+    names(df) <- gsub(make.names(filename.col), filename.col, names(df))
+
     df <- merge(df, metadata.tab, by = filename.col)
     ret <- NULL
 
@@ -143,7 +147,7 @@ get_cluster_features <- function(tab, metadata.tab, features.names, out.format =
         ret <- reshape::cast(df, formula.exp)
     }
 
-    return(ret)
+    return(data.frame(ret, check.names = FALSE, stringsAsFactors = FALSE))
 }
 
 
@@ -171,7 +175,7 @@ transpose_feature_matrix <- function(m) {
 }
 
 
-reshape_cluster_features <- function(input.tab, features) {
+reshape_cluster_features <- function(input.tab, features, transform.popsize = TRUE) {
     col.names <- sapply(features, paste, "@", sep = "")
     col.names <- paste(col.names, collapse = "|")
     col.names <- grep(col.names, names(input.tab), value = T)
@@ -183,7 +187,7 @@ reshape_cluster_features <- function(input.tab, features) {
         temp <- m[, grep(sprintf("%s@", s), colnames(m))]
         temp[is.na(temp)] <- 0
 
-        if(s == "popsize") {
+        if(s == "popsize" && transform.popsize) {
             temp <- t(temp)
             temp <- temp / rowSums(temp)
             temp <- t(temp)

diff --git a/R/io.R b/R/io.R
@@ -8,14 +8,22 @@
 #'
 #' @param f The \code{flowFrame} to convert
 #' @param asinh.cofactor Cofactor for \code{asinh} transformation. If this is \code{NULL} no transformation is performed
-#' @param clip.at.zero Wether to clip negative values (after transformation) at zero
 #' @param compensate Wether to compensate the data using the compensation matrix embedded in the \code{flowFrame} (if any)
-#'
+#' @param negative.values How to deal with negative values in the data. If this is \code{NULL} negative values
+#'   are left as is. Otherwise two options are possible:
+#'   \itemize{
+#'     \item{\code{truncate}}: Negative values will be truncated (i.e. replaced with 0)
+#'     \item{\code{shift}}: The data will be shifted so that only \code{quantile.prob} of the values for each channel will
+#'       be truncated to 0. This option is useful in cases where the range of data significantly extends
+#'       in the negatives, for instance due to compensation.
+#'   }
+#' @param quantile.prob Only used if \code{negative.value} is set to \code{shift}. The quantile of measurements
+#'   that are going to be truncated to 0. For instance if this is 0.05, the data will be shifted so that
+#'   only 5 percent of the values are negative and will be truncated to 0
 #' @return Returns a \code{data.frame} corresponding to the data in \code{flowCore::exprs(f)} after compensation
 #'   and transformation
-#'
 #' @export
-convert_fcs <- function(f, asinh.cofactor = NULL, clip.at.zero = T, compensate = T) {
+convert_fcs <- function(f, asinh.cofactor = NULL, compensate = T, negative.values = "truncate", quantile.prob = 0.05) {
     comp <- grep("SPILL", names(flowCore::description(f)), value = T)
 
     if(compensate && (length(comp) > 0)) {
@@ -35,8 +43,13 @@ convert_fcs <- function(f, asinh.cofactor = NULL, clip.at.zero = T, compensate =
     if(!is.null(asinh.cofactor))
         m <- asinh(m / asinh.cofactor)
 
-    if(clip.at.zero)
-        m[m < 0] <- 0
+    if(!is.null(negative.values)) {
+        negative.values <- match.arg(negative.values, choices = c("truncate", "shift"))
+        if(negative.values == "truncate")
+            m[m < 0] <- 0
+        else if(negative.values == "shift")
+            m <- shift_negative_values(m, quantile.prob)
+    }
 
     tab <- data.frame(m, check.names = F, stringsAsFactors = F)
 
@@ -51,6 +64,35 @@ convert_fcs <- function(f, asinh.cofactor = NULL, clip.at.zero = T, compensate =
 }
 
 
+#' Shift negative values in a matrix
+#'
+#' This function shifts negative values in a data matrix. For each column vector
+#' the procedure is as follows:
+#' \enumerate{
+#'   \item A specific quantile is calculated from the vector
+#'   \item If the quantile is negative then its absolute value is added to the vector
+#'   \item Values that are still negative are truncated at 0
+#' }
+#'
+#' @param m The data matrix
+#' @param quantile.prob The quantile probability to use
+#' @return Return the transformed data matrix
+#'
+shift_negative_values <- function(m, quantile.prob) {
+    apply(m, 2, function(x) {
+        qq <- quantile(x, quantile.prob)
+        ret <- NULL
+        if(qq < 0)
+            ret <- x + abs(qq)
+        else
+            ret <- x
+        ret[ret < 0] <- 0
+        return(ret)
+    })
+}
+
+
+
 #' Get the columns that are common to a set of input tabular files
 #'
 #' @param files.list A vector of input file names. If these are text files, each file should be a tab-separated table,

diff --git a/inst/shinyGUI/server.R b/inst/shinyGUI/server.R
@@ -45,11 +45,15 @@ render_clustering_ui <- function(working.directory, ...) {renderUI({
         ),
         fluidRow(
             column(12,
-
                 numericInput("clusteringui_num_clusters", "Number of clusters", value = 200, min = 1, max = 2000),
                 numericInput("clusteringui_num_samples", "Number of samples (lower numbers lead to faster but less accurate results)", value = 50, min = 2),
                 numericInput("clusteringui_asinh_cofactor", "Cofactor for asinh transformation", value = 5),
                 numericInput("clusteringui_num_cores", "Number of CPU cores to use", value = 1),
+                selectInput("clusteringui_negative_values", "Negative vaues", choices = c("truncate", "shift")),
+                conditionalPanel(
+                    condition = "input.clusteringui_negative_values == 'shift'",
+                    numericInput("clusteringui_quantile_prob", "Quantile probability", value = 0.05, min = 0, max = 1, step = 0.01)
+                ),
                 actionButton("clusteringui_start", "Start clustering")
             )
         )
@@ -106,6 +110,8 @@ shinyServer(function(input, output, session) {
             num.samples <- force(input$clusteringui_num_samples)
             downsample.to <- force(input$clusteringui_downsample_to)
             output.dir <- force(working.directory)
+            negative.values <- force(input$clusteringui_negative_values)
+            quantile.prob <- force(input$clusteringui_quantile_prob)
 
             if(input$clusteringui_clustering_mode == "Pooled") {
                 input.files <- lapply(clusteringui_reactive_values$clustering_groups, function(s) {file.path(working.directory, s)})
@@ -116,6 +122,8 @@ shinyServer(function(input, output, session) {
                     asinh.cofactor = asinh.cofactor,
                     num.samples = num.samples,
                     downsample.to = downsample.to,
+                    negative.values = negative.values,
+                    quantile.prob = quantile.prob,
                     output.dir = output.dir
                 )
             } else {
@@ -125,7 +133,9 @@ shinyServer(function(input, output, session) {
                     num.clusters = num.clusters,
                     asinh.cofactor = asinh.cofactor,
                     num.samples = num.samples,
-                    output.dir = output.dir
+                    output.dir = output.dir,
+                    negative.values = negative.values,
+                    quantile.prob = quantile.prob
                 )
             }
 

diff --git a/man/cluster_fcs_files.Rd b/man/cluster_fcs_files.Rd
diff --git a/man/cluster_fcs_files_groups.Rd b/man/cluster_fcs_files_groups.Rd