diff --git a/.github/workflows/rworkflows.yml b/.github/workflows/rworkflows.yml new file mode 100644 index 0000000..ebc5a11 --- /dev/null +++ b/.github/workflows/rworkflows.yml @@ -0,0 +1,57 @@ +name: rworkflows +'on': + push: + branches: + - master + - main + - devel + - RELEASE_** + pull_request: + branches: + - master + - main + - devel + - RELEASE_** +jobs: + rworkflows: + permissions: write-all + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + container: ${{ matrix.config.cont }} + strategy: + fail-fast: ${{ false }} + matrix: + config: + - os: ubuntu-latest + bioc: devel + r: auto + cont: ghcr.io/bioconductor/bioconductor_docker:devel + rspm: ~ + - os: macOS-latest + bioc: devel + r: auto + cont: ~ + rspm: ~ + - os: windows-latest + bioc: devel + r: auto + cont: ~ + rspm: ~ + steps: + - uses: neurogenomics/rworkflows@master + with: + run_bioccheck: ${{ false }} + run_rcmdcheck: ${{ true }} + as_cran: ${{ true }} + run_vignettes: ${{ true }} + has_testthat: ${{ true }} + run_covr: ${{ true }} + run_pkgdown: ${{ true }} + has_runit: ${{ false }} + has_latex: ${{ false }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run_docker: ${{ false }} + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + runner_os: ${{ runner.os }} + cache_version: cache-v1 + docker_registry: ghcr.io diff --git a/DESCRIPTION b/DESCRIPTION index e27d379..978ad5e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,10 +14,18 @@ Authors@R: person(given = "Eleanor", family = "Coffey", role = c("aut", "ths"), email = "elecof@utu.fi", comment = c(ORCID = "0000-0002-9717-5610"))) -Description: Differential expression analysis is a prevalent method utilised in the examination of diverse biological data. - The reproducibility-optimized test statistic (ROTS) modifies a t-statistic based on the data's intrinsic characteristics and ranks features according to their statistical significance for differential expression between two or more groups (f-statistic). - Focussing on proteomics and metabolomics, the current ROTS implementation cannot account for technical or biological covariates such as MS batches or gender differences among the samples. - Consequently, we developed LimROTS, which employs a reproducibility-optimized test statistic utilising the limma methodology to simulate complex experimental designs. +Description: Differential expression analysis is a prevalent method utilised in + the examination of diverse biological data.The + reproducibility-optimized test statistic (ROTS) modifies a + t-statistic based on the data's intrinsic characteristics and ranks + features according to their statistical significance for + differential expression between two or more groups (f-statistic). + Focussing on proteomics and metabolomics, the current ROTS + implementation cannot account for technical or biological + covariates such as MS batches or gender differences among + the samples.Consequently, we developed LimROTS, which employs a + reproducibility-optimized test statistic utilising the limma + methodology to simulate complex experimental designs. License: Artistic-2.0 Encoding: UTF-8 Roxygen: list(markdown = TRUE) @@ -40,8 +48,10 @@ Imports: utils, stats, doRNG, + magick, dplyr Suggests: + BiocStyle, ggplot2, testthat (>= 3.0.0), knitr, diff --git a/R/Bootstrap_functions.r b/R/Bootstrap_functions.r index eccf067..3af1761 100644 --- a/R/Bootstrap_functions.r +++ b/R/Bootstrap_functions.r @@ -1,43 +1,62 @@ #' Generate Bootstrap Samples with Optional Pairing #' -#' This function generates bootstrap samples from the input metadata. It samples with replacement -#' within each group defined in the metadata, and optionally adjusts for paired groups. +#' This function generates bootstrap samples from the input metadata. It samples +#' with replacement within each group defined in the metadata, and optionally +#' adjusts for paired groups. #' #' @param B Integer. The number of bootstrap samples to generate. -#' @param meta.info Data frame. Metadata containing sample information, where each row corresponds to a sample. -#' @param group.name Character. The name of the column in `meta.info` that defines the grouping variable for the samples. -#' @param paired Logical. If `TRUE`, the function ensures the bootstrap samples are paired between two groups. +#' @param meta.info Data frame. Metadata containing sample information, where +#' each row corresponds to a sample. +#' @param group.name Character. The name of the column in `meta.info` that +#' defines the grouping variable for the samples. +#' @param paired Logical. If `TRUE`, the function ensures the bootstrap samples +#' are paired between two groups. #' #' @details -#' The function works by resampling the row names of the metadata for each group separately. If `paired` is `TRUE`, -#' it assumes there are exactly two groups and samples the second group based on the positions of the first group to maintain pairing. +#' The function works by resampling the row names of the metadata for each group +#' separately. If `paired` is `TRUE`, it assumes there are exactly two groups +#' and samples the second group based on the positions of the first group to +#' maintain pairing. #' -#' @return A matrix of dimension \code{B} x \code{n}, where \code{n} is the number of samples. Each row corresponds -#' to a bootstrap sample, and each entry is a resampled row name from the metadata. +#' @return A matrix of dimension \code{B} x \code{n}, where \code{n} is the +#' number of samples. Each row corresponds to a bootstrap sample, and each +#' entry is a resampled row name from the metadata. #' #' @export #' @examples #' # Example usage: #' set.seed(123) -#' meta.info <- data.frame(group = rep(c("A", "B"), each = 5), row.names = paste0("Sample", 1:10)) -#' bootstrapS(B = 10, meta.info = meta.info, group.name = "group", paired = FALSE) +#' meta.info <- data.frame( +#' group = rep(c("A", "B"), each = 5), +#' row.names = paste0("Sample", 1:10) +#' ) +#' bootstrapS( +#' B = 10, meta.info = meta.info, group.name = "group", +#' paired = FALSE +#' ) #' #' # Paired bootstrap sampling -#' bootstrapS(B = 10, meta.info = meta.info, group.name = "group", paired = TRUE) +#' bootstrapS( +#' B = 10, meta.info = meta.info, group.name = "group", +#' paired = TRUE +#' ) +#' bootstrapS <- function(B, meta.info, group.name, paired) { groups <- meta.info[, group.name] bootsamples <- matrix(nrow = B, ncol = length(groups)) for (i in seq_len(B)) { for (g in unique(groups)) { g.names <- row.names(meta.info)[which(groups == g)] - bootsamples[i, which(groups == g)] <- sample(g.names, length(g.names), replace = TRUE) + bootsamples[i, which(groups == g)] <- + sample(g.names, length(g.names), replace = TRUE) } } if (paired) { for (i in seq_len(B)) { g.names1 <- bootsamples[i, which(groups == unique(groups)[1])] g.names2 <- match(g.names1, row.names(meta.info)) + length(g.names1) - bootsamples[i, which(groups == unique(groups)[2])] <- row.names(meta.info)[g.names2] + bootsamples[i, which(groups == unique(groups)[2])] <- + row.names(meta.info)[g.names2] } } return(bootsamples) @@ -46,26 +65,32 @@ bootstrapS <- function(B, meta.info, group.name, paired) { #' Generate Permutated Samples #' -#' This function generates permuted samples by shuffling the row names of the metadata. +#' This function generates permuted samples by shuffling the row names of the +#' metadata. #' -#' @param meta.info Data frame. Metadata containing sample information, where each row corresponds to a sample. +#' @param meta.info Data frame. Metadata containing sample information, where +#' each row corresponds to a sample. #' @param B Integer. The number of permutations to generate. #' #' @details -#' The function creates a matrix where each row is a permuted version of the row names from `meta.info`. -#' This can be used to generate null distributions or perform randomization-based tests. +#' The function creates a matrix where each row is a permuted version of the +#' row names from `meta.info`.This can be used to generate null distributions +#' or perform randomization-based tests. #' -#' @return A matrix of dimension \code{B} x \code{n}, where \code{n} is the number of samples (i.e., rows in `meta.info`). -#' Each row is a permutation of the row names of the metadata. +#' @return A matrix of dimension \code{B} x \code{n}, where \code{n} is the +#' number of samples (i.e., rows in `meta.info`).Each row is a permutation of +#' the row names of the metadata. #' #' @export #' @examples #' # Example usage: #' set.seed(123) -#' meta.info <- data.frame(group = rep(c("A", "B"), each = 5), row.names = paste0("Sample", 1:10)) +#' meta.info <- data.frame( +#' group = rep(c("A", "B"), each = 5), +#' row.names = paste0("Sample", 1:10) +#' ) #' permutatedS(meta.info = meta.info, B = 10) -permutatedS <- function(meta.info, B) -{ +permutatedS <- function(meta.info, B) { persamples <- matrix(nrow = B, ncol = nrow(meta.info)) for (i in seq_len(B)) { persamples[i, ] <- sample(row.names(meta.info)) @@ -77,32 +102,42 @@ permutatedS <- function(meta.info, B) #' Generate Stratified Bootstrap Samples for limRots #' -#' This function generates stratified bootstrap samples based on the groupings and additional factors in the metadata. -#' The function ensures that samples are drawn proportionally based on strata defined by the interaction of factor columns in the metadata. +#' This function generates stratified bootstrap samples based on the groupings +#' and additional factors in the metadata. The function ensures that samples +#' are drawn proportionally based on strata defined by the interaction of +#' factor columns in the metadata. #' #' @param B Integer. The number of bootstrap samples to generate. -#' @param meta.info Data frame. Metadata containing sample information, where each row corresponds to a sample. Factor columns in `meta.info` are used to define strata for sampling. -#' @param group.name Character. The name of the column in `meta.info` that defines the grouping variable for the samples. +#' @param meta.info Data frame. Metadata containing sample information, +#' where each row corresponds to a sample. Factor columns in `meta.info` +#' are used to define strata for sampling. +#' @param group.name Character. The name of the column in `meta.info` that +#' defines the grouping variable for the samples. #' #' @details -#' The function works by first identifying the factors in the `meta.info` data frame that are used to create strata for sampling. -#' Within each group defined by `group.name`, the function samples according to the strata proportions, ensuring that samples are drawn -#' from the correct groups and strata in a proportional manner. +#' The function works by first identifying the factors in the `meta.info` data +#' frame that are used to create strata for sampling. Within each group defined +#' by `group.name`, the function samples according to the strata proportions, +#' ensuring that samples are drawn from the correct groups and strata in a +#' proportional manner. #' -#' @return A matrix of dimension \code{B} x \code{n}, where \code{n} is the number of samples. Each row corresponds -#' to a bootstrap sample, and each entry is a resampled row name from the metadata, stratified by group and additional factors. +#' @return A matrix of dimension \code{B} x \code{n}, where \code{n} is the +#' number of samples. Each row corresponds to a bootstrap sample, and each +#' entry is a resampled row name from the metadata, stratified by group and +#' additional factors. #' #' @export #' @examples #' # Example usage: #' set.seed(123) -#' meta.info <- data.frame(group = rep(c(1, 2), each = 5), +#' meta.info <- data.frame( +#' group = rep(c(1, 2), each = 5), #' batch = rep(c("A", "B"), 5), -#' row.names = paste0("Sample", 1:10)) +#' row.names = paste0("Sample", 1:10) +#' ) #' meta.info$batch <- as.factor(meta.info$batch) #' bootstrapSamples.limRots(B = 10, meta.info = meta.info, group.name = "group") -bootstrapSamples.limRots <- function(B, meta.info, group.name) -{ +bootstrapSamples.limRots <- function(B, meta.info, group.name) { labels <- as.numeric(meta.info[, group.name]) samples <- matrix(nrow = B, ncol = length(labels)) for (i in seq_len(B)) { @@ -112,7 +147,8 @@ bootstrapSamples.limRots <- function(B, meta.info, group.name) meta.info.factors <- c() for (j in seq_len(ncol(meta.info))) { if (is.factor(meta.info.pos[, j])) { - meta.info.factors <- c(meta.info.factors, colnames(meta.info.pos)[j]) + meta.info.factors <- + c(meta.info.factors, colnames(meta.info.pos)[j]) } } if (is.null(meta.info.factors)) { @@ -124,14 +160,21 @@ bootstrapSamples.limRots <- function(B, meta.info, group.name) ) return(samples) } - meta.info.factors <- meta.info.factors[meta.info.factors != group.name] - meta.info.pos$stratum <- interaction(meta.info.pos[, meta.info.factors]) + meta.info.factors <- + meta.info.factors[meta.info.factors != group.name] + meta.info.pos$stratum <- + interaction(meta.info.pos[, meta.info.factors]) stratum_sizes <- table(meta.info.pos$stratum) stratum_samples <- round(length(pos) * prop.table(stratum_sizes)) - sampled_indices <- unlist(lapply(names(stratum_samples), function(stratum) { - stratum_indices <- row.names(meta.info.pos)[which(meta.info.pos$stratum == stratum)] - sample(stratum_indices, stratum_samples[stratum], replace = TRUE) - })) + sampled_indices <- + unlist(lapply(names(stratum_samples), function(stratum) { + stratum_indices <- + row.names(meta.info.pos)[which(meta.info.pos$stratum == + stratum)] + sample(stratum_indices, stratum_samples[stratum], + replace = TRUE + ) + })) samples[i, pos] <- sampled_indices } } diff --git a/R/LM_with_limma.permuting.r b/R/LM_with_limma.permuting.r deleted file mode 100644 index c72f327..0000000 --- a/R/LM_with_limma.permuting.r +++ /dev/null @@ -1,108 +0,0 @@ -#' Perform Permutation-Based Linear Modeling with Covariates using Limma -#' -#' This function performs linear modeling using the Limma package with permutation of the covariates -#' to evaluate the test statistics under random assignments. It handles two-group comparisons and -#' multi-group settings. -#' -#' @param x A list containing two or more data matrices where rows represent features (e.g., genes, proteins) -#' and columns represent samples. The list should contain at least two matrices for pairwise group comparison. -#' @param group.name A character string indicating the name of the group variable in `meta.info` to be used -#' in the analysis. -#' @param meta.info A data frame containing the metadata for the samples. This includes sample grouping and any -#' covariates to be included in the model. -#' @param formula.str A string specifying the formula to be used in model fitting. It should follow the standard -#' R formula syntax (e.g., `~ covariate1 + covariate2`). -#' @param trend A logical value indicating whether to allow for an intensity-dependent trend in the prior variance. -#' @param robust A logical value indicating whether to use a robust fitting procedure to protect against outliers. -#' @param permutating.group Logical, If \code{TRUE}, the permutation for calculating the null distribution is performed by permuting the target group only specified in \code{group.name}. If FALSE, the entire \code{meta.info} will be permuted (recommended to be set to TRUE). -#' -#' @details -#' This function combines the data matrices from different groups and permutes the covariates from `meta.info` -#' before fitting a linear model using Limma. Permutation helps assess how the covariates behave under random -#' conditions, providing a null distribution of the test statistics. For two-group comparisons, the function -#' computes contrasts between the two groups and applies empirical Bayes moderation. For multi-group analysis -#' with a single covariate, pairwise contrasts are computed, and the moderated F-statistic is calculated for -#' each feature. -#' -#' @return A list containing the following elements: -#' \item{d}{A vector of the test statistics (log-fold changes or F-statistics) for each feature.} -#' \item{s}{A vector of the standard deviations for each feature, adjusted by the empirical Bayes procedure.} -#' -#' @seealso \code{\link[limma]{lmFit}}, \code{\link[limma]{eBayes}}, \code{\link[limma]{topTable}}, -#' \code{\link[limma]{makeContrasts}} -#' -#' @importFrom stats model.matrix formula -#' @importFrom dplyr bind_cols -#' @importFrom limma makeContrasts lmFit contrasts.fit eBayes topTable -#' @import utils -#' @importFrom stringr str_split_fixed fixed -#' -#' - - -testStatistic_with_covariates_permutating <- function(x, - group.name, - meta.info, - formula.str, - trend, - robust, - permutating.group) { - data <- x - combined_data <- data.frame( - check.rows = FALSE, - check.names = FALSE, - none = rep("none", nrow(data[[1]])) - ) - for (k.list in names(data)) { - combined_data <- cbind(combined_data, - data.frame(data[[k.list]], check.rows = FALSE, check.names = FALSE)) - } - combined_data <- combined_data[, -1] - colnames(combined_data) <- paste0(colnames(combined_data), ".", seq(1, ncol(combined_data))) - covariates.p <- data.frame() - meta.info.temp <- meta.info - meta.info.temp$sample.id <- row.names(meta.info.temp) - for (i in colnames(combined_data)) { - real_SampleNames <- str_split_fixed(i, fixed("."), 2)[, 1] - df.temp <- meta.info.temp[row.names(meta.info.temp) %in% real_SampleNames, ] - df.temp$sample.id <- i - covariates.p <- rbind(covariates.p, df.temp) - } - if (permutating.group == TRUE) { - covariates.p[, group.name] <- sample(covariates.p[, group.name]) - } else{ - covariates.p <- covariates.p[sample(nrow(covariates.p)), ] - } - covariates.p$sample.id <- NULL - row.names(covariates.p) <- NULL - design.matrix <- model.matrix(formula(formula.str), data = covariates.p) - colnames(design.matrix) <- make.names(colnames(design.matrix)) - fit <- lmFit(combined_data, design.matrix) - if (length(data) == 2) { - pairwise_contrasts <- paste0(group.name, unique(covariates.p[, group.name])) - pairwise_contrasts <- combn(pairwise_contrasts, 2, function(x) - paste(x[1], "-", x[2])) - cont_matrix <- makeContrasts(contrasts = pairwise_contrasts, levels = - design.matrix) - fit2 <- contrasts.fit(fit, cont_matrix) - fit.ebayes <- eBayes(fit2, trend = trend, robust = robust) - d_values <- topTable( - fit.ebayes, - coef = pairwise_contrasts, - number = "Inf", - sort.by = 'none' - ) - d_values <- abs(d_values$logFC) - s_values <- as.numeric(sqrt(fit.ebayes$s2.post) * fit.ebayes$stdev.unscaled[, 1]) - return(list(d = d_values, s = s_values)) - } else if (length(data) > 2 & ncol(covariates.p) == 1) { - pairwise_contrasts <- paste0(group.name, unique(covariates.p[, group.name])) - pairwise_contrasts <- combn(pairwise_contrasts, 2, function(x) - paste(x[1], "-", x[2])) - cont_matrix <- makeContrasts(contrasts = pairwise_contrasts, levels = design.matrix) - fit2 <- contrasts.fit(fit, cont_matrix) - fit.ebayes <- eBayes(fit2, trend = trend, robust = robust) - msr <- fit.ebayes$F * fit.ebayes$s2.post - return(list(d = msr, s = fit.ebayes$s2.post)) - } -} diff --git a/R/LM_with_limma.r b/R/LM_with_limma.r index 23c99c3..02809f7 100644 --- a/R/LM_with_limma.r +++ b/R/LM_with_limma.r @@ -1,32 +1,43 @@ #' Perform Linear Modeling with Covariates using Limma #' -#' This function performs linear modeling using the Limma package while accounting for covariates specified -#' in the `meta.info`. It supports two-group comparisons and multi-group analysis, incorporating covariates +#' This function performs linear modeling using the Limma package while +#' accounting for covariates specified +#' in the `meta.info`. It supports two-group comparisons and multi-group +#' analysis, incorporating covariates #' through a design matrix. #' -#' @param x A list containing two or more data matrices where rows represent features (e.g., genes, proteins) -#' and columns represent samples. The list should contain at least two matrices for pairwise group comparison. -#' @param group.name A character string indicating the name of the group variable in `meta.info` to be used -#' in the analysis. -#' @param meta.info A data frame containing the metadata for the samples. This includes sample grouping and any -#' covariates to be included in the model. -#' @param formula.str A string specifying the formula to be used in model fitting. It should follow the standard -#' R formula syntax (e.g., `~ covariate1 + covariate2`). -#' @param trend A logical value indicating whether to allow for an intensity-dependent trend in the prior variance. -#' @param robust A logical value indicating whether to use a robust fitting procedure to protect against outliers. +#' @param x A list containing two or more data matrices where rows represent +#' features (e.g., genes, proteins) and columns represent samples. The list +#' should contain at least two matrices for pairwise group comparison. +#' @param group.name A character string indicating the name of the group +#' variable in `meta.info` to be used in the analysis. +#' @param meta.info A data frame containing the metadata for the samples. +#' This includes sample grouping and any covariates to be included in the model. +#' @param formula.str A string specifying the formula to be used in model +#' fitting. It should follow the standard R formula syntax +#' (e.g., `~ covariate1 + covariate2`). +#' @param trend A logical value indicating whether to allow for an +#' intensity-dependent trend in the prior variance. +#' @param robust A logical value indicating whether to use a robust +#' fitting procedure to protect against outliers. #' #' @details -#' This function first combines the data matrices from different groups and prepares a design matrix based on -#' the covariates specified in `meta.info` using the provided formula. It fits a linear model using Limma, -#' computes contrasts between groups, and applies empirical Bayes moderation. For two-group comparisons, the -#' function returns log-fold changes and associated statistics. In multi-group settings with a single covariate, +#' This function first combines the data matrices from different groups and +#' prepares a design matrix based on the covariates specified in `meta.info` +#' using the provided formula. It fits a linear model using Limma, +#' computes contrasts between groups, and applies empirical Bayes moderation. +#' For two-group comparisons, the function returns log-fold changes and +#' associated statistics. In multi-group settings with a single covariate, #' it calculates pairwise contrasts and moderated F-statistics. #' #' @return A list containing the following elements: -#' \item{d}{A vector of the test statistics (log-fold changes or F-statistics) for each feature.} -#' \item{s}{A vector of the standard deviations for each feature, adjusted by the empirical Bayes procedure.} +#' \item{d}{A vector of the test statistics (log-fold changes or F-statistics) +#' for each feature.} +#' \item{s}{A vector of the standard deviations for each feature, adjusted by +#' the empirical Bayes procedure.} #' -#' @seealso \code{\link[limma]{lmFit}}, \code{\link[limma]{eBayes}}, \code{\link[limma]{topTable}}, +#' @seealso \code{\link[limma]{lmFit}}, \code{\link[limma]{eBayes}}, +#' \code{\link[limma]{topTable}}, #' \code{\link[limma]{makeContrasts}} #' #' @importFrom stats model.matrix formula @@ -35,34 +46,31 @@ #' @importFrom stringr str_split_fixed fixed #' @importFrom limma makeContrasts lmFit contrasts.fit eBayes topTable #' -#' - - -testStatistic_with_covariates <- function(x, - group.name, - meta.info, - formula.str, - trend, - robust) { - data <- x - combined_data <- data.frame( - check.rows = FALSE, - check.names = FALSE, - none = rep("none", nrow(data[[1]])) +testStatistic_with_covariates <- + function(x, group.name, meta.info, formula.str, trend,robust) { + data <- x + combined_data <- data.frame( + check.rows = FALSE, + check.names = FALSE, + none = rep("none", nrow(data[[1]])) ) for (k.list in names(data)) { - combined_data <- cbind(combined_data, - data.frame(data[[k.list]], check.rows = FALSE, check.names = FALSE)) + combined_data <- cbind( + combined_data, + data.frame(data[[k.list]], check.rows = FALSE, check.names = FALSE) + ) } combined_data <- combined_data[, -1] - colnames(combined_data) <- paste0(colnames(combined_data), ".", seq(1, ncol(combined_data))) + colnames(combined_data) <- + paste0(colnames(combined_data), ".", seq(1, ncol(combined_data))) covariates.p <- data.frame() meta.info.temp <- meta.info meta.info.temp$sample.id <- row.names(meta.info.temp) for (i in colnames(combined_data)) { real_SampleNames <- str_split_fixed(i, fixed("."), 2)[, 1] - df.temp <- meta.info.temp[row.names(meta.info.temp) %in% real_SampleNames, ] + df.temp <- + meta.info.temp[row.names(meta.info.temp) %in% real_SampleNames, ] df.temp$sample.id <- i covariates.p <- rbind(covariates.p, df.temp) } @@ -72,11 +80,14 @@ testStatistic_with_covariates <- function(x, colnames(design.matrix) <- make.names(colnames(design.matrix)) fit <- lmFit(combined_data, design.matrix) if (length(data) == 2) { - pairwise_contrasts <- paste0(group.name, unique(covariates.p[, group.name])) + pairwise_contrasts <- + paste0(group.name, unique(covariates.p[, group.name])) pairwise_contrasts <- combn(pairwise_contrasts, 2, function(x) paste(x[1], "-", x[2])) - cont_matrix <- makeContrasts(contrasts = pairwise_contrasts, levels = - design.matrix) + cont_matrix <- makeContrasts( + contrasts = pairwise_contrasts, levels = + design.matrix + ) fit2 <- contrasts.fit(fit, cont_matrix) fit.ebayes <- eBayes(fit2, trend = trend, robust = robust) d_values <- topTable( @@ -86,13 +97,20 @@ testStatistic_with_covariates <- function(x, sort.by = 'none' ) d_values <- abs(d_values$logFC) - s_values <- as.numeric(sqrt(fit.ebayes$s2.post) * fit.ebayes$stdev.unscaled[, 1]) + s_values <- + as.numeric(sqrt(fit.ebayes$s2.post) * + fit.ebayes$stdev.unscaled[, 1]) return(list(d = d_values, s = s_values)) } else if (length(data) > 2 & ncol(covariates.p) == 1) { - pairwise_contrasts <- paste0(group.name, unique(covariates.p[, group.name])) + pairwise_contrasts <- + paste0(group.name, unique(covariates.p[, group.name])) pairwise_contrasts <- combn(pairwise_contrasts, 2, function(x) paste(x[1], "-", x[2])) - cont_matrix <- makeContrasts(contrasts = pairwise_contrasts, levels = design.matrix) + cont_matrix <- + makeContrasts( + contrasts = pairwise_contrasts, + levels = design.matrix + ) fit2 <- contrasts.fit(fit, cont_matrix) fit.ebayes <- eBayes(fit2, trend = trend, robust = robust) msr <- fit.ebayes$F * fit.ebayes$s2.post diff --git a/R/LM_with_limma.fit.r b/R/LM_with_limma_fit.R similarity index 57% rename from R/LM_with_limma.fit.r rename to R/LM_with_limma_fit.R index e0a478f..8ef39b1 100644 --- a/R/LM_with_limma.fit.r +++ b/R/LM_with_limma_fit.R @@ -1,37 +1,43 @@ #' Perform Linear Modeling with Covariates using Limma #' -#' This function performs linear modeling using the Limma package, incorporating covariates -#' in the model fitting process. It is designed to handle both two-group comparisons and -#' multi-group settings with covariates. +#' This function performs linear modeling using the Limma package, incorporating +#' covariates in the model fitting process. It is designed to handle both +#' two-group comparisons and multi-group settings with covariates. #' -#' @param x A list containing two or more data matrices where rows represent features -#' (e.g., genes, proteins) and columns represent samples. The list should contain at least -#' two matrices for pairwise group comparison. -#' @param group.name A character string indicating the name of the group variable in -#' `meta.info` to be used in the analysis. -#' @param meta.info A data frame containing the metadata for the samples. This includes -#' sample grouping and any covariates to be included in the model. -#' @param formula.str A string specifying the formula to be used in model fitting. It should -#' follow the standard R formula syntax (e.g., `~ covariate1 + covariate2`). -#' @param trend A logical value indicating whether to allow for an intensity-dependent trend in -#' the prior variance. -#' @param robust A logical value indicating whether to use a robust fitting procedure to protect -#' against outliers. +#' @param x A list containing two or more data matrices where rows represent +#' features (e.g., genes, proteins) and columns represent samples. The list +#' should contain at least two matrices for pairwise group comparison. +#' @param group.name A character string indicating the name of the group +#' variable in`meta.info` to be used in the analysis. +#' @param meta.info A data frame containing the metadata for the samples. +#' This includes sample grouping and any covariates to be included in the model. +#' @param formula.str A string specifying the formula to be used in model +#' fitting. It should follow the standard R formula syntax +#' (e.g., `~ covariate1 + covariate2`). +#' @param trend A logical value indicating whether to allow for an +#' intensity-dependent trend in the prior variance. +#' @param robust A logical value indicating whether to use a robust fitting +#' procedure to protect against outliers. #' #' @details -#' This function combines the data matrices from different groups and fits a linear model -#' using covariates provided in the `meta.info`. For two-group comparisons, the function -#' computes contrasts between the two groups and applies empirical Bayes moderation. For -#' multi-group analysis with a single covariate, pairwise contrasts are computed, and the -#' moderated F-statistic is calculated for each feature. +#' This function combines the data matrices from different groups and fits a +#' linear model using covariates provided in the `meta.info`. For two-group +#' comparisons, the function computes contrasts between the two groups and +#' applies empirical Bayes moderation. For multi-group analysis with a single +#' covariate, pairwise contrasts are computed, and the moderated F-statistic is +#' calculated for each feature. #' #' @return A list containing the following elements: -#' \item{d}{A vector of the test statistics (log-fold changes or F-statistics) for each feature.} -#' \item{s}{A vector of the standard deviations for each feature, adjusted by the empirical Bayes -#' procedure.} -#' \item{corrected.logfc}{The log-fold changes for each feature after fitting the model.} +#' \item{d}{A vector of the test statistics (log-fold changes or F-statistics) +#' for each feature.} +#' \item{s}{A vector of the standard deviations for each feature, adjusted by t +#' he empirical Bayes procedure.} +#' \item{corrected.logfc}{The log-fold changes for each feature after fitting +#' the model.} #' -#' @seealso \code{\link[limma]{lmFit}}, \code{\link[limma]{eBayes}}, \code{\link[limma]{topTable}}, +#' @seealso \code{\link[limma]{lmFit}}, +#' \code{\link[limma]{eBayes}}, +#' \code{\link[limma]{topTable}}, #' \code{\link[limma]{makeContrasts}} #' #' @@ -41,17 +47,9 @@ #' @importFrom limma makeContrasts lmFit contrasts.fit eBayes topTable #' @import utils #' -#' -#' - - -testStatistic_with_covariates_Fit <- function(x, - group.name, - meta.info, - formula.str, - trend, - robust) { +testStatistic_with_covariates_Fit <- + function(x, group.name, meta.info, formula.str, trend, robust){ data <- x combined_data <- data.frame( check.rows = FALSE, @@ -59,19 +57,26 @@ testStatistic_with_covariates_Fit <- function(x, none = rep("none", nrow(data[[1]])) ) for (k.list in names(data)) { - combined_data <- cbind(combined_data, - data.frame(data[[k.list]], check.rows = FALSE, check.names = FALSE)) + combined_data <- cbind( + combined_data, + data.frame(data[[k.list]], check.rows = FALSE, check.names = FALSE) + ) } combined_data <- combined_data[, -1] design.matrix <- model.matrix(formula(formula.str), data = meta.info) colnames(design.matrix) <- make.names(colnames(design.matrix)) fit <- lmFit(combined_data, design.matrix) if (length(data) == 2) { - pairwise_contrasts <- paste0(group.name, unique(meta.info[, group.name])) + pairwise_contrasts <- paste0(group.name, unique(meta.info[ + , + group.name + ])) pairwise_contrasts <- combn(pairwise_contrasts, 2, function(x) paste(x[1], "-", x[2])) - cont_matrix <- makeContrasts(contrasts = pairwise_contrasts, levels = - design.matrix) + cont_matrix <- makeContrasts( + contrasts = pairwise_contrasts, levels = + design.matrix + ) fit2 <- contrasts.fit(fit, cont_matrix) fit.ebayes <- eBayes(fit2, trend = trend, robust = robust) d_values <- topTable( @@ -82,17 +87,24 @@ testStatistic_with_covariates_Fit <- function(x, ) corrected.logfc <- d_values$logFC d_values <- abs(d_values$logFC) - s_values <- as.numeric(sqrt(fit.ebayes$s2.post) * fit.ebayes$stdev.unscaled[, 1]) + s_values <- + as.numeric(sqrt(fit.ebayes$s2.post) * + fit.ebayes$stdev.unscaled[, 1]) return(list( d = d_values, s = s_values, corrected.logfc = corrected.logfc )) } else if (length(data) > 2 & ncol(meta.info) == 1) { - pairwise_contrasts <- paste0(group.name, unique(meta.info[, group.name])) + pairwise_contrasts <- + paste0(group.name, unique(meta.info[, group.name])) pairwise_contrasts <- combn(pairwise_contrasts, 2, function(x) paste(x[1], "-", x[2])) - cont_matrix <- makeContrasts(contrasts = pairwise_contrasts, levels = design.matrix) + cont_matrix <- + makeContrasts( + contrasts = pairwise_contrasts, + levels = design.matrix + ) fit2 <- contrasts.fit(fit, cont_matrix) fit.ebayes <- eBayes(fit2, trend = trend, robust = robust) msr <- fit.ebayes$F * fit.ebayes$s2.post diff --git a/R/LM_with_limma_permuting.R b/R/LM_with_limma_permuting.R new file mode 100644 index 0000000..ca176ef --- /dev/null +++ b/R/LM_with_limma_permuting.R @@ -0,0 +1,138 @@ +#' Perform Permutation-Based Linear Modeling with Covariates using Limma +#' +#' This function performs linear modeling using the Limma package with +#' permutation of the covariates to evaluate the test statistics under random +#' assignments. It handles two-group comparisons and multi-group settings. +#' +#' @param x A list containing two or more data matrices where rows represent +#' features (e.g., genes, proteins) and columns represent samples. The list +#' should contain at least two matrices for pairwise group comparison. +#' @param group.name A character string indicating the name of the group +#' variable in `meta.info` to be used in the analysis. +#' @param meta.info A data frame containing the metadata for the samples. +#' This includes sample grouping and any covariates to be included in the model. +#' @param formula.str A string specifying the formula to be used in model +#' fitting. It should follow the standard R formula syntax +#' (e.g., `~ covariate1 + covariate2`). +#' @param trend A logical value indicating whether to allow for an +#' intensity-dependent trend in the prior variance. +#' @param robust A logical value indicating whether to use a robust +#' fitting procedure to protect against outliers. +#' @param permutating.group Logical, If \code{TRUE}, the permutation for +#' calculating the null distribution is performed by permuting the target group +#' only specified in \code{group.name}. If FALSE, the entire \code{meta.info} +#' will be permuted (recommended to be set to TRUE). +#' +#' @details +#' This function combines the data matrices from different groups and permutes +#' the covariates from `meta.info`before fitting a linear model using Limma. +#' Permutation helps assess how the covariates behave under random conditions, +#' providing a null distribution of the test statistics. For two-group +#' comparisons, the function computes contrasts between the two groups and +#' applies empirical Bayes moderation. For multi-group analysis with a single +#' covariate, pairwise contrasts are computed, and the moderated F-statistic is +#' calculated for each feature. +#' +#' @return A list containing the following elements: +#' \item{d}{A vector of the test statistics (log-fold changes or F-statistics) +#' for each feature.} +#' \item{s}{A vector of the standard deviations for each feature, adjusted by +#' the empirical Bayes procedure.} +#' +#' @seealso \code{\link[limma]{lmFit}}, \code{\link[limma]{eBayes}}, +#' \code{\link[limma]{topTable}}, +#' \code{\link[limma]{makeContrasts}} +#' +#' @importFrom stats model.matrix formula +#' @importFrom dplyr bind_cols +#' @importFrom limma makeContrasts lmFit contrasts.fit eBayes topTable +#' @import utils +#' @importFrom stringr str_split_fixed fixed +#' +testStatistic_with_covariates_permutating <- + function(x, group.name, meta.info, formula.str, trend, robust, + permutating.group) { + data <- x + combined_data <- data.frame( + check.rows = FALSE, + check.names = FALSE, + none = rep("none", nrow(data[[1]])) + ) + for (k.list in names(data)) { + combined_data <- cbind( + combined_data, + data.frame(data[[k.list]], + check.rows = FALSE, + check.names = FALSE + ) + ) + } + combined_data <- combined_data[, -1] + colnames(combined_data) <- + paste0(colnames(combined_data), ".", seq(1, ncol(combined_data))) + covariates.p <- data.frame() + meta.info.temp <- meta.info + meta.info.temp$sample.id <- row.names(meta.info.temp) + for (i in colnames(combined_data)) { + real_SampleNames <- str_split_fixed(i, fixed("."), 2)[, 1] + df.temp <- + meta.info.temp[row.names(meta.info.temp) %in% + real_SampleNames, ] + df.temp$sample.id <- i + covariates.p <- rbind(covariates.p, df.temp) + } + if (permutating.group == TRUE) { + covariates.p[, group.name] <- sample(covariates.p[, group.name]) + } else { + covariates.p <- covariates.p[sample(nrow(covariates.p)), ] + } + covariates.p$sample.id <- NULL + row.names(covariates.p) <- NULL + design.matrix <- + model.matrix(formula(formula.str), data = covariates.p) + colnames(design.matrix) <- + make.names(colnames(design.matrix)) + fit <- lmFit(combined_data, design.matrix) + if (length(data) == 2) { + pairwise_contrasts <- + paste0(group.name, unique(covariates.p[, group.name])) + pairwise_contrasts <- + combn(pairwise_contrasts, 2, function(x) + paste(x[1], "-", x[2])) + cont_matrix <- + makeContrasts( + contrasts = pairwise_contrasts, + levels = design.matrix + ) + fit2 <- contrasts.fit(fit, cont_matrix) + fit.ebayes <- + eBayes(fit2, trend = trend, robust = robust) + d_values <- topTable( + fit.ebayes, + coef = pairwise_contrasts, + number = "Inf", + sort.by = 'none' + ) + d_values <- abs(d_values$logFC) + s_values <- + as.numeric(sqrt(fit.ebayes$s2.post) * + fit.ebayes$stdev.unscaled[, 1]) + return(list(d = d_values, s = s_values)) + } else if (length(data) > 2 & ncol(covariates.p) == 1) { + pairwise_contrasts <- + paste0(group.name, unique(covariates.p[, group.name])) + pairwise_contrasts <- + combn(pairwise_contrasts, 2, function(x) + paste(x[1], "-", x[2])) + cont_matrix <- + makeContrasts( + contrasts = pairwise_contrasts, + levels = design.matrix + ) + fit2 <- contrasts.fit(fit, cont_matrix) + fit.ebayes <- + eBayes(fit2, trend = trend, robust = robust) + msr <- fit.ebayes$F * fit.ebayes$s2.post + return(list(d = msr, s = fit.ebayes$s2.post)) + } + } diff --git a/R/LimROTS.r b/R/LimROTS.r index e25876b..6d225d2 100644 --- a/R/LimROTS.r +++ b/R/LimROTS.r @@ -1,26 +1,59 @@ -#' LimROTS: A Hybrid Method Integrating Empirical Bayes and Reproducibility-Optimized Statistics for Robust Analysis of Proteomics and Metabolomics Data +#' LimROTS: A Hybrid Method Integrating Empirical Bayes and +#' Reproducibility-Optimized Statistics for Robust Analysis of Proteomics and +#' Metabolomics Data #' -#' The `LimROTS` function employs a reproducibility-optimized test statistic utilising the limma and ROTS methodology to simulate complex experimental designs. +#' The `LimROTS` function employs a reproducibility-optimized test statistic +#' utilising the limma and ROTS methodology to simulate complex experimental +#' designs. #' -#' @param x A \code{SummarizedExperiment} object or a matrix where rows represent features (e.g., genes, proteins) and columns represent samples. The values should be log-transformed. -#' @param B An integer specifying the number of bootstrap iterations. Default is 1000. -#' @param K An optional integer representing the top list size for ranking. If not specified, it is set to one-fourth of the number of features. -#' @param a1 Optional numeric value used in the optimization process. If defined by the user, no optimization occurs. -#' @param a2 Optional numeric value used in the optimization process. If defined by the user, no optimization occurs. -#' @param log Logical, indicating whether the data is already log-transformed. Default is \code{TRUE}. -#' @param progress Logical, indicating whether to display a progress bar during bootstrap sampling. Default is \code{FALSE}. -#' @param verbose Logical, indicating whether to display messages during the function's execution. Default is \code{TRUE}. -#' @param meta.info A data frame containing sample-level metadata, where each row corresponds to a sample. It should include the grouping variable specified in \code{group.name}. If \code{x} is a \code{SummarizedExperiment} object, \code{meta.info} must be a vector of the metadata needed for the model to run and can be retrieved using \code{colData()}. -#' @param group.name A string specifying the column in \code{meta.info} that represents the groups or conditions for comparison. -#' @param seed.cl An integer specifying the seed for randomization; if not provided, the default is 1234. -#' @param cluster A parallel cluster object for distributed computation, e.g., created by \code{makeCluster()}. Default is 2. -#' @param survival Logical, indicating whether to enable survival analysis. If \code{TRUE}, then \code{meta.info} should contain \code{time} and \code{event} columns. -#' @param paired Logical, indicating whether the data represent paired samples. Default is \code{FALSE}. -#' @param n.ROTS Logical. If \code{TRUE}, all parameters related to \code{LimROTS} will be ignored, and the original \code{ROTS} analysis will run. This must be \code{TRUE} when \code{survival} or \code{paired} is set to \code{TRUE}. -#' @param formula.str A formula string used when covariates are present in meta.info for modeling. It should include "~ 0 + ..." to exclude the intercept from the model. -#' @param robust indicating whether robust fitting should be used. Default is TRUE, see \link{eBayes}. -#' @param trend indicating whether to include trend fitting in the differential expression analysis. Default is TRUE. see \link{eBayes}. -#' @param permutating.group Logical, If \code{TRUE}, the permutation for calculating the null distribution is performed by permuting the target group only specified in \code{group.name}. If FALSE, the entire \code{meta.info} will be permuted (recommended to be set to TRUE). +#' @param x A \code{SummarizedExperiment} object or a matrix where rows +#' represent features (e.g., genes, proteins) and columns represent samples. +#' The values should be log-transformed. +#' @param B An integer specifying the number of bootstrap iterations. +#' Default is 1000. +#' @param K An optional integer representing the top list size for ranking. +#' If not specified, it is set to one-fourth of the number of features. +#' @param a1 Optional numeric value used in the optimization process. +#' If defined by the user, no optimization occurs. +#' @param a2 Optional numeric value used in the optimization process. +#' If defined by the user, no optimization occurs. +#' @param log Logical, indicating whether the data is already log-transformed. +#' Default is \code{TRUE}. +#' @param progress Logical, indicating whether to display a progress bar during +#' bootstrap sampling. Default is \code{FALSE}. +#' @param verbose Logical, indicating whether to display messages during the +#' function's execution. Default is \code{TRUE}. +#' @param meta.info A data frame containing sample-level metadata, where each +#' row corresponds to a sample. It should include the grouping variable +#' specified in \code{group.name}. If \code{x} is a \code{SummarizedExperiment} +#' object, \code{meta.info} must be a vector of the metadata needed for the +#' model to run and can be retrieved using \code{colData()}. +#' @param group.name A string specifying the column in \code{meta.info} that +#' represents the groups or conditions for comparison. +#' @param seed.cl An integer specifying the seed for randomization; +#' if not provided, the default is 1234. +#' @param cluster A parallel cluster object for distributed computation, +#' e.g., created by \code{makeCluster()}. Default is 2. +#' @param survival Logical, indicating whether to enable survival analysis. +#' If \code{TRUE}, then \code{meta.info} should contain \code{time} and +#' \code{event} columns. +#' @param paired Logical, indicating whether the data represent paired samples. +#' 'Default is \code{FALSE}. +#' @param n.ROTS Logical. If \code{TRUE}, all parameters related to +#' \code{LimROTS} will be ignored, and the original +#' \code{ROTS} analysis will run. This must be \code{TRUE} when +#' \code{survival} or \code{paired} is set to \code{TRUE}. +#' @param formula.str A formula string used when covariates are present in meta. +#' info for modeling. It should include "~ 0 + ..." to exclude the +#' intercept from the model. +#' @param robust indicating whether robust fitting should be used. +#' Default is TRUE, see \link{eBayes}. +#' @param trend indicating whether to include trend fitting in the +#' differential expression analysis. Default is TRUE. see \link{eBayes}. +#' @param permutating.group Logical, If \code{TRUE}, the permutation for +#' calculating the null distribution is performed by permuting the target +#' group only specified in \code{group.name}. If FALSE, the entire +#' \code{meta.info} will be permuted (recommended to be set to TRUE). #' #' @return A list of class `"list"` with the following elements: #' \item{data}{The original data matrix.} @@ -41,16 +74,23 @@ #' @examples #' # Example usage: #' -#' data <- data.frame(matrix(rnorm(500), nrow = 100, ncol = 10)) # Simulated data -#' meta.info <- data.frame(group = factor(rep(1:2, each = 5)), row.names = colnames(data)) +#' data <- data.frame(matrix(rnorm(500), nrow = 100, ncol = 10)) +#' # Simulated data +#' meta.info <- data.frame( +#' group = factor(rep(1:2, each = 5)), +#' row.names = colnames(data) +#' ) #' formula.str <- "~ 0 + group" -#' result <- LimROTS(data, meta.info = meta.info, group.name = "group", -#' formula.str = formula.str, B = 10) +#' result <- LimROTS(data, +#' meta.info = meta.info, group.name = "group", +#' formula.str = formula.str, B = 10 +#' ) #' #' @importFrom limma voom lmFit eBayes #' @importFrom stats model.matrix formula p.adjust #' @importFrom dplyr bind_cols -#' @importFrom parallel makeCluster clusterSetRNGStream clusterExport stopCluster +#' @importFrom parallel makeCluster clusterSetRNGStream clusterExport +#' stopCluster #' @importFrom doParallel registerDoParallel #' @importFrom foreach foreach #' @import doRNG @@ -58,19 +98,46 @@ #' @import utils #' @import SummarizedExperiment #' -#' @details The **LimROTS** approach initially uses the \link{limma} package to simulate the intensity data of proteins and metabolites. A linear model is subsequently fitted using the design matrix. Empirical Bayes variance shrinking is then implemented. To obtain the moderated t-statistics, the adjusted standard error \eqn{SEpost = √(s2.post) \times unscaled SD} for each feature is computed, along with the regression coefficient for each feature (indicating the impact of variations in the experimental settings). The \link{ROTS} approach establishes optimality based on the largest overlap of top-ranked features within group-preserving bootstrap datasets, Finally based on the optimized parameters \eqn{\alpha1} and \eqn{\alpha2} this equation used to calculates the final statistics: \deqn{t_{\alpha}(g) = \frac{\beta}{\alpha1 + \alpha2 \times SEpost}} -#' where \eqn{t_{\alpha}(g)} is the final statistics for each feature, \eqn{\beta} is the coefficient, and SEpost is the the adjusted standard error. LimROTS generates p-values from permutation samples using the implementation available in \link{qvalue} package, along with internal implementation of FDR adapted from ROTS package. Additionally, the qvalue package is used to calculate q-values, were the proportion of true null p-values is set to the bootstrap method \link{pi0est}. We recommend using permutation-derived p-values and qvalues. +#' @details The **LimROTS** approach initially uses the +#' \link{limma} package to simulate the intensity data of proteins and +#' metabolites. A linear model is subsequently fitted using the design matrix. +#' Empirical Bayes variance shrinking is then implemented. To obtain the +#' moderated t-statistics, the adjusted standard error +#' \eqn{SEpost = √(s2.post) +#' \times unscaled SD} for each feature is computed, along with the regression +#' coefficient for each feature (indicating the impact of variations in the +#' experimental settings). The \link[ROTS]{ROTS} approach establishes optimality +#' based on the largest overlap of top-ranked features within group-preserving +#' bootstrap datasets, Finally based on the optimized parameters +#' \eqn{\alpha1} and +#' \eqn{\alpha2} this equation used to calculates the final statistics: +#' \deqn{t_{\alpha}(g) = \frac{\beta}{\alpha1 + \alpha2 \times SEpost}}where +#' \eqn{t_{\alpha}(g)} is the final statistics for each feature, +#' \eqn{\beta} is the coefficient, and SEpost is the the adjusted +#' standard error. LimROTS generates p-values from permutation samples +#' using the implementation available in +#' \link{qvalue} package, along with internal implementation of FDR +#' adapted from ROTS package. Additionally, the qvalue package is used +#' to calculate q-values, were the proportion of true null p-values is +#' set to the bootstrap method \link{pi0est}. We recommend using +#' permutation-derived p-values and qvalues. #' #' @references -#' Ritchie, M.E., Phipson, B., Wu, D., Hu, Y., Law, C.W., Shi, W., and Smyth, G.K. (2015). limma powers differential -#' expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Research 43(7), e47 +#' Ritchie, M.E., Phipson, B., Wu, D., Hu, Y., Law, C.W., Shi, W., and Smyth, +#' G.K. (2015). limma powers differential expression analyses for +#' RNA-sequencing and microarray studies. Nucleic Acids Research 43(7), e47 #' -#' Suomi T, Seyednasrollah F, Jaakkola M, Faux T, Elo L (2017). “ROTS: An R package for reproducibility-optimized -#' statistical testing.” _PLoS computational biology_, *13*(5), e1005562. \url{doi:10.1371/journal.pcbi.1005562} -#' \url{https://doi.org/10.1371/journal.pcbi.1005562}, \url{http://www.ncbi.nlm.nih.gov/pubmed/28542205} +#' Suomi T, Seyednasrollah F, Jaakkola M, Faux T, Elo L (2017). “ROTS: An +#' R package for reproducibility-optimized statistical testing. +#' ” _PLoS computational biology_, *13*(5), e1005562. +#' \url{doi:10.1371/journal.pcbi.1005562} +#' \url{https://doi.org/10.1371/journal.pcbi.1005562}, +#' \url{http://www.ncbi.nlm.nih.gov/pubmed/28542205} #' -#' Elo LL, Filen S, Lahesmaa R, Aittokallio T. Reproducibility-optimized test statistic for ranking genes in microarray studies. -#' IEEE/ACM Trans Comput Biol Bioinform. 2008;5(3):423-431. \url{doi:10.1109/tcbb.2007.1078} +#' Elo LL, Filen S, Lahesmaa R, Aittokallio T. Reproducibility-optimized test +#' statistic for ranking genes in microarray studies. +#' IEEE/ACM Trans Comput Biol Bioinform. 2008;5(3):423-431. +#' \url{doi:10.1109/tcbb.2007.1078} #' #' #' @@ -95,8 +162,7 @@ LimROTS <- function(x, seed.cl = 1234, robust = TRUE, trend = TRUE, - permutating.group = TRUE) -{ + permutating.group = TRUE) { SanityChecK.list <- SanityChecK( x, B = B, @@ -126,9 +192,14 @@ LimROTS <- function(x, group2_data <- data[, groups == 2] if (log) { - logfc <- rowMeans(group1_data, na.rm = TRUE) - rowMeans(group2_data, na.rm = TRUE) + logfc <- + rowMeans(group1_data, na.rm = TRUE) - rowMeans(group2_data, + na.rm = TRUE + ) } else { - logfc <- rowMeans(log2(group1_data + 1), na.rm = TRUE) - rowMeans(log2(group2_data + 1), na.rm = TRUE) + logfc <- + rowMeans(log2(group1_data + 1), na.rm = TRUE) - + rowMeans(log2(group2_data + 1), na.rm = TRUE) } } else { logfc <- rep(NA, nrow(data)) @@ -162,9 +233,11 @@ LimROTS <- function(x, pD <- matrix(nrow = nrow(as.matrix(data)), ncol = nrow(samples)) pS <- matrix(nrow = nrow(as.matrix(data)), ncol = nrow(samples)) - pb <- txtProgressBar(min = 0, + pb <- txtProgressBar( + min = 0, max = 100, - style = 3) + style = 3 + ) if (is.null(cluster)) { cluster <- makeCluster(2) @@ -210,8 +283,11 @@ LimROTS <- function(x, results_list <- foreach( i = seq_len(nrow(samples)), .combine = "c", - .packages = c("utils", "stringr", "stats" , "limma"), - .export = c("testStatSurvivalOptimized" , "testStatistic_with_covariates" , "testStatOptimized", "testStatistic_with_covariates_permutating") + .packages = c("utils", "stringr", "stats", "limma"), + .export = c( + "testStatSurvivalOptimized", "testStatistic_with_covariates", + "testStatOptimized", "testStatistic_with_covariates_permutating" + ) ) %dorng% { samples.R <- split(samples[i, ], groups) @@ -221,11 +297,12 @@ LimROTS <- function(x, # Compute D and S if conditions are met if (is.null(a1) | is.null(a2)) { if (survival == TRUE) { - fit <- testStatSurvivalOptimized(lapply(samples.R, function(x) - data[, x]), - groups, - event) - + fit <- testStatSurvivalOptimized( + lapply(samples.R, function(x) + data[, x]), + groups, + event + ) } else if (n.ROTS == FALSE) { fit <- testStatistic_with_covariates( x = lapply(samples.R, function(x) @@ -237,7 +314,6 @@ LimROTS <- function(x, trend, robust = robust ) - } else { fit <- testStatOptimized(paired, lapply(samples.R, function(x) data[, x])) @@ -265,7 +341,7 @@ LimROTS <- function(x, formula.str = formula.str, trend = trend, - robust = robust , permutating.group = permutating.group + robust = robust, permutating.group = permutating.group ) } else { pSamples.R <- split(pSamples[i, ], groups) @@ -289,7 +365,10 @@ LimROTS <- function(x, setTxtProgressBar(pb, 80) } - names(results_list) <- paste0(names(results_list), seq(1, length(names(results_list)))) + names(results_list) <- paste0( + names(results_list), + seq(1, length(names(results_list))) + ) j <- 0 q <- 0 @@ -304,7 +383,6 @@ LimROTS <- function(x, pD[, q] <- results_list[[names(results_list)[i]]]$pd_result pS[, q] <- results_list[[names(results_list)[i]]]$ps_result } - } @@ -320,15 +398,17 @@ LimROTS <- function(x, if (is.null(a1) | is.null(a2)) { ssq <- c(seq(0, 20) / 100, seq(11, 50) / 50, seq(6, 25) / 5) - N <- c(seq(1, 20) * 5, + N <- c( + seq(1, 20) * 5, seq(11, 50) * 10, seq(21, 40) * 25, - seq(11, 1000) * - 100) + seq(11, 1000) * 100 + ) K <- min(K, nrow(data)) N <- N[N < K] - optimized.parameters <- Optimizing(B, ssq, N, D, S, pD, pS, verbose, progress) + optimized.parameters <- + Optimizing(B, ssq, N, D, S, pD, pS, verbose, progress) a1 <- optimized.parameters$a1 @@ -367,9 +447,11 @@ LimROTS <- function(x, gc() if (verbose) message("Calculating p-values") - p <- empPvals(stat = d, + p <- empPvals( + stat = d, stat0 = pD, - pool = TRUE) + pool = TRUE + ) if (verbose) message("Calculating FDR") @@ -380,7 +462,8 @@ LimROTS <- function(x, q_values <- qvalue(p, pi0.method = "bootstrap", - lambda = seq(0.01, 0.95, 0.01)) + lambda = seq(0.01, 0.95, 0.01) + ) BH.pvalue <- p.adjust(p, method = "BH") LimROTS.output <- list( @@ -401,8 +484,7 @@ LimROTS <- function(x, q_values = q_values, BH.pvalue = BH.pvalue ) - } - else { + } else { if (survival == TRUE) { fit <- testStatSurvivalOptimized(lapply(split(seq_len( length(groups) @@ -431,16 +513,19 @@ LimROTS <- function(x, pD <- pD / (a1 + a2 * pS) if (verbose) message("Calculating p-values") - p <- empPvals(stat = d, + p <- empPvals( + stat = d, stat0 = pD, - pool = TRUE) + pool = TRUE + ) if (verbose) message("Calculating FDR") FDR <- calculateFalseDiscoveryRate(d, pD, progress) corrected.logfc <- fit$corrected.logfc q_values <- qvalue(p, pi0.method = "bootstrap", - lambda = seq(0.01, 0.95, 0.01)) + lambda = seq(0.01, 0.95, 0.01) + ) BH.pvalue <- p.adjust(p, method = "BH") LimROTS.output <- list( data = data, diff --git a/R/Optimizing.R b/R/Optimizing.R index 4d3b8fd..247bd9e 100644 --- a/R/Optimizing.R +++ b/R/Optimizing.R @@ -1,25 +1,34 @@ #' Optimize Parameters Based on Overlap Calculations #' -#' This function optimizes parameters by calculating overlaps between observed and permuted data for multiple values of a smoothing constant (`ssq`) and a single-label replicate (SLR) comparison. +#' This function optimizes parameters by calculating overlaps between observed +#' and permuted data for multiple values of a smoothing constant (`ssq`) and a +#' single-label replicate (SLR) comparison. #' #' @param B Integer. Number of bootstrap samples or resampling iterations. #' @param ssq Numeric vector. Smoothing constants to be evaluated. -#' @param N Integer vector. Number of top values to consider for overlap calculation. +#' @param N Integer vector. Number of top values to consider for overlap +#' calculation. #' @param D Numeric matrix. Observed data values. #' @param S Numeric matrix. Standard errors or related values for observed data. #' @param pD Numeric matrix. Permuted data values. -#' @param pS Numeric matrix. Standard errors or related values for permuted data. +#' @param pS Numeric matrix. Standard errors or related values for +#' permuted data. #' @param verbose Logical. If `TRUE`, progress messages will be displayed. #' @param progress Logical. If `TRUE`, a progress bar will be shown. #' #' @details -#' The function calculates overlaps for a range of smoothing constants and identifies the optimal set of parameters by maximizing a z-score-based metric, which compares the overlap of observed data to permuted data. -#' It computes overlap matrices for both observed (`D` and `S`) and permuted (`pD` and `pS`) data and returns the optimal parameters based on the highest z-score. +#' The function calculates overlaps for a range of smoothing constants and +#' identifies the optimal set of parameters by maximizing a z-score-based +#' metric, which compares the overlap of observed data to permuted data. +#' It computes overlap matrices for both observed (`D` and `S`) and permuted +#' (`pD` and `pS`) data and returns the optimal parameters based on the +#' highest z-score. #' #' @return A list containing the optimal parameters: #' \itemize{ #' \item \code{a1}: Optimal smoothing constant or 1 for SLR. -#' \item \code{a2}: SLR flag (1 if smoothing constant is optimal, 0 if SLR is optimal). +#' \item \code{a2}: SLR flag (1 if smoothing constant is optimal, +#' 0 if SLR is optimal). #' \item \code{k}: Optimal number of top values to consider for overlap. #' \item \code{R}: Optimal overlap value. #' \item \code{Z}: Optimal z-score. @@ -44,9 +53,11 @@ Optimizing <- function(B, ssq, N, D, S, pD, pS, verbose, progress) { colnames(reprotable.sd) <- N row.names(reprotable.sd) <- c(ssq, "slr") if (progress) - pb <- txtProgressBar(min = 0, + pb <- txtProgressBar( + min = 0, max = length(ssq), - style = 3) + style = 3 + ) for (i in seq_len(length(ssq))) { overlaps <- matrix(0, nrow = B, ncol = length(N)) overlaps.P <- matrix(0, nrow = B, ncol = length(N)) @@ -76,14 +87,16 @@ Optimizing <- function(B, ssq, N, D, S, pD, pS, verbose, progress) { i <- length(ssq) + 1 overlaps <- matrix(0, nrow = B, ncol = length(N)) overlaps.P <- matrix(0, nrow = B, ncol = length(N)) - cResults <- calOverlaps.slr(D, + cResults <- calOverlaps.slr( + D, pD, nrow(D), as.integer(N), length(N), as.integer(B), overlaps, - overlaps.P) + overlaps.P + ) rm(D, S) gc() reprotable[i, ] <- colMeans(cResults[["overlaps"]]) @@ -121,5 +134,4 @@ Optimizing <- function(B, ssq, N, D, S, pD, pS, verbose, progress) { Z = Z, ztable = ztable )) - } diff --git a/R/SanityChecK.R b/R/SanityChecK.R index 4a719ae..04ca660 100644 --- a/R/SanityChecK.R +++ b/R/SanityChecK.R @@ -1,22 +1,40 @@ #' Sanity Check for Input Data and Parameters #' -#' This function performs a series of checks and initial setups for input data, metadata, and parameters, ensuring everything is correctly formatted for downstream analysis. +#' This function performs a series of checks and initial setups for input data, +#' metadata, and parameters, ensuring everything is correctly formatted for +#' downstream analysis. #' -#' @param x A matrix-like object or a `SummarizedExperiment` containing the data to be analyzed. -#' @param B Integer. Number of bootstrap samples or resampling iterations. Default is 1000. -#' @param K Integer. Top list size. If NULL, it will be set to a quarter of the number of rows in the data matrix. Default is NULL. +#' @param x A matrix-like object or a `SummarizedExperiment` containing the +#' data to be analyzed. +#' @param B Integer. Number of bootstrap samples or resampling iterations. +#' Default is 1000. +#' @param K Integer. Top list size. If NULL, it will be set to a quarter of +#' the number of rows in the data matrix. Default is NULL. #' @param a1,a2 Optional numeric parameters related to optimization. -#' @param meta.info Data frame. Metadata associated with the samples (columns of `data.exp`). If `data.exp` is a `SummarizedExperiment`, `meta.info` can be a vector of `colData` column names to use. -#' @param group.name Character. Column name in `meta.info` that defines the groups or conditions for comparison. -#' @param formula.str Optional character string representing the formula for the model. -#' @param survival Logical. If TRUE, survival analysis is used, requiring `time` and `event` columns in `meta.info`. Default is FALSE. -#' @param paired Logical. If TRUE, indicates paired test setup. Default is FALSE. -#' @param n.ROTS Logical. If TRUE, uses the ROTS method instead of LimROTS. Default is FALSE. -#' @param verbose Logical, indicating whether to display messages during the function's execution. Default is TRUE. -#' @param log Logical, indicating whether the data is already log-transformed. Default is TRUE. +#' @param meta.info Data frame. Metadata associated with the samples +#' (columns of `data.exp`). If `data.exp` is a `SummarizedExperiment`, +#' `meta.info` can be a vector of `colData` column names to use. +#' @param group.name Character. Column name in `meta.info` that defines the +#' groups or conditions for comparison. +#' @param formula.str Optional character string representing the formula for +#' the model. +#' @param survival Logical. If TRUE, survival analysis is used, requiring +#' `time` and `event` columns in `meta.info`. Default is FALSE. +#' @param paired Logical. If TRUE, indicates paired test setup. +#' Default is FALSE. +#' @param n.ROTS Logical. If TRUE, uses the ROTS method instead of LimROTS. +#' Default is FALSE. +#' @param verbose Logical, indicating whether to display messages during the +#' function's execution. Default is TRUE. +#' @param log Logical, indicating whether the data is already log-transformed. +#' Default is TRUE. #' #' @details -#' This function checks whether the input data and metadata are in the correct format, processes metadata from a `SummarizedExperiment` object if provided, and ensures that group information is correctly specified. If no top list size (`K`) is provided, it defaults to a quarter of the number of rows in the data. +#' This function checks whether the input data and metadata are in the correct +#' format, processes metadata from a `SummarizedExperiment` object if provided, +#' and ensures that group information is correctly specified. If no top list +#' size (`K`) is provided, it defaults to a quarter of the number of rows in +#' the data. #' #' @return A list containing: #' \itemize{ @@ -48,7 +66,9 @@ SanityChecK <- function(x, if (survival == TRUE) { if (n.ROTS == FALSE) { stop( - "survival is TRUE, Survival analysis is only available through the old ROTS implementation. To enable this, please set n.ROTS to TRUE." + "survival is TRUE, Survival analysis is only available through + the old ROTS implementation. To enable this, please set n.ROTS + to TRUE." ) } } @@ -56,7 +76,9 @@ SanityChecK <- function(x, if (paired == TRUE) { if (n.ROTS == FALSE) { stop( - "paired if TRUE, Paired analysis is only available through the old ROTS implementation. To enable this, please set n.ROTS to TRUE." + "paired if TRUE, Paired analysis is only available through the + old ROTS implementation. To enable this, please set n.ROTS to + TRUE." ) } } @@ -64,9 +86,10 @@ SanityChecK <- function(x, if (log == FALSE) { if (n.ROTS == FALSE) { stop( - "log is FALSE, Unlogged values can only be handled through the old ROTS implementation. To enable this, please set n.ROTS to TRUE." + "log is FALSE, Unlogged values can only be handled through the + old ROTS implementation. To enable this, please set + n.ROTS to TRUE." ) - } } @@ -88,11 +111,12 @@ SanityChecK <- function(x, } else { colnames(meta.info) <- meta.info.colnames } - } if (!group.name %in% colnames(meta.info)) { stop( - "group.name should be a string specifying the column in `meta.info` that represents the groups or conditions for comparison." + "group.name should be a string specifying the column in + `meta.info` that represents the groups or conditions + for comparison." ) } message(sprintf("Assay: %s will be used", assayNames(data.exp)[1])) @@ -103,7 +127,10 @@ SanityChecK <- function(x, } ### meta.info if (any(!row.names(meta.info) %in% colnames(data))) { - stop("rownames for meta.info should match the data colnames (samples names)") + stop( + "rownames + for meta.info should match the data colnames (samples names)" + ) } if (any(grepl(".", colnames(data), fixed = TRUE))) { stop("Sample names should contains no '.', please remove it if any") @@ -111,14 +138,16 @@ SanityChecK <- function(x, if (!is.null(meta.info) & n.ROTS == FALSE) { if (ncol(meta.info) == 1) { message( - "A meta.info table is provided with only group infomration >>> LimROTS with no covariates will be used" + "A meta.info table is provided with only group infomration >>> + LimROTS with no covariates will be used" ) if (is.null(formula.str)) { stop("formula.str should by provided for the model") } } else { message( - "A meta.info table is provided with covariates >>> LimROTS with covariates will be used" + "A meta.info table is provided with covariates >>> LimROTS with + covariates will be used" ) if (is.null(formula.str)) { stop("formula.str should by provided for the model") @@ -132,13 +161,17 @@ SanityChecK <- function(x, if (nrow(meta.info) != ncol(data)) { stop("Number of samples in the data does not match the groups.") } - sort.df <- data.frame(sample.id = colnames(data), groups = meta.info[, group.name]) + sort.df <- data.frame( + sample.id = colnames(data), + groups = meta.info[, group.name] + ) sort.df <- sort.df[order(sort.df$groups), ] data <- data[, sort.df$sample.id] meta.info$temp <- row.names(meta.info) meta.info <- data.frame(meta.info[colnames(data), ], check.rows = FALSE, - check.names = FALSE) + check.names = FALSE + ) meta.info$temp <- NULL ### Groups if (inherits(meta.info[, group.name], "character")) { @@ -154,7 +187,8 @@ SanityChecK <- function(x, if (survival == TRUE) { if (all(c("time", "event") %in% colnames(meta.info))) { stop( - "meta.info must have two columns time and event. Also, group.name must be time" + "meta.info must have two columns time and event. + Also, group.name must be time" ) } event <- meta.info[, "event"] diff --git a/R/UPS1.Case4.r b/R/UPS1.Case4.r index 4c04d24..7d11be3 100644 --- a/R/UPS1.Case4.r +++ b/R/UPS1.Case4.r @@ -1,12 +1,19 @@ #' Spectronaut and ScaffoldDIA UPS1 Spiked Dataset case 4 #' -#' A \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} object containing DIA proteomics data from a UPS1-spiked E. coli proteins, processed using both Spectronaut and ScaffoldDIA separately, then merged. +#' A \code{ +#' \link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +#' object containing DIA proteomics data from a UPS1-spiked E. coli proteins, +#' processed using both Spectronaut and ScaffoldDIA separately, then merged. #' #' @name UPS1.Case4 #' @docType data -#' @format An instance of the \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} class with the following assays: +#' @format An instance of the +#' \code{ +#' \link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +#' class with the following assays: #' \describe{ -#' \item{norm}{This assay includes log2 protein intensities calculated by averaging the peptides derived from the same protein} +#' \item{norm}{This assay includes log2 protein intensities calculated by +#' averaging the peptides derived from the same protein} #' } #' The object also contains colData and rowData: #' \describe{ @@ -17,14 +24,20 @@ #' The `colData` contains the following columns: #' \describe{ #' \item{SampleID}{Unique identifier for each sample.} -#' \item{Conc}{Experimental condition or group for each sample , representing different conc. of UPS1-spiked proteins.} +#' \item{Conc}{Experimental condition or group for each sample , +#' representing different conc. of UPS1-spiked proteins.} #' \item{tool}{software tool used, Spectronaut or ScaffoldDIA.} #' \item{fake.batch}{A fake digestion batch.} #' } #' -#' @source Generated with both spectronaut and ScaffoldDIA separately. using a mixed mode acquisition method and FASTA mode for demonstration purposes. +#' @source Generated with both spectronaut and ScaffoldDIA separately. +#' using a mixed mode acquisition method and FASTA mode for demonstration +#' purposes. #' @references -#' Gotti, C., Roux-Dalvai, F., Joly-Beauparlant, C., Mangnier, L., Leclercq, M., & Droit, A. (2022). DIA proteomics data from a UPS1-spiked E.coli protein mixture processed with six software tools. In Data in Brief (Vol. 41, p. 107829). Elsevier BV. https://doi.org/10.1016/j.dib.2022.107829 +#' Gotti, C., Roux-Dalvai, F., Joly-Beauparlant, C., Mangnier, L., Leclercq, M., +#' & Droit, A. (2022). DIA proteomics data from a UPS1-spiked E.coli protein +#' mixture processed with six software tools. In Data in Brief +#' (Vol. 41, p. 107829). Elsevier BV. https://doi.org/10.1016/j.dib.2022.107829 #' #' diff --git a/R/calOverlaps.R b/R/calOverlaps.R index f215302..08d9c11 100644 --- a/R/calOverlaps.R +++ b/R/calOverlaps.R @@ -1,25 +1,34 @@ #' Calculate Overlaps Between Observed and Permuted Data #' -#' This function calculates the overlap between observed and permuted data for two sets of comparisons. -#' It computes the ratio of overlap between pairs of vectors (res1/res2 and pres1/pres2) after sorting the values. +#' This function calculates the overlap between observed and permuted data for +#' two sets of comparisons. It computes the ratio of overlap between pairs of +#' vectors (res1/res2 and pres1/pres2) after sorting the values. #' #' @param D Numeric vector. Observed data values (e.g., differences). -#' @param S Numeric vector. Standard errors or related values associated with the observed data. +#' @param S Numeric vector. Standard errors or related values associated with +#' the observed data. #' @param pD Numeric vector. Permuted data values (e.g., differences). -#' @param pS Numeric vector. Standard errors or related values associated with the permuted data. +#' @param pS Numeric vector. Standard errors or related values associated with +#' the permuted data. #' @param nrow Integer. Number of rows in each block of data. -#' @param N Integer vector. Number of top values to consider for overlap calculation. +#' @param N Integer vector. Number of top values to consider for overlap +#' calculation. #' @param N_len Integer. Length of the `N` vector. #' @param ssq Numeric. A small constant added to standard errors for stability. #' @param B Integer. Number of bootstrap samples or resampling iterations. -#' @param overlaps Numeric matrix. Matrix to store overlap results for observed data. -#' @param overlaps_P Numeric matrix. Matrix to store overlap results for permuted data. +#' @param overlaps Numeric matrix. Matrix to store overlap results for observed +#' data. +#' @param overlaps_P Numeric matrix. Matrix to store overlap results for +#' permuted data. #' #' @details -#' The function calculates overlaps for two sets of comparisons: one for observed data (res1/res2) and one for permuted data (pres1/pres2). -#' For each bootstrap sample, the function orders the two vectors being compared, then calculates the proportion of overlap for the top `N` values. +#' The function calculates overlaps for two sets of comparisons: one for +#' observed data (res1/res2) and one for permuted data (pres1/pres2).For each +#' bootstrap sample, the function orders the two vectors being compared, then +#' calculates the proportion of overlap for the top `N` values. #' -#' @return A list containing two matrices: \code{overlaps} for observed data and \code{overlaps_P} for permuted data. +#' @return A list containing two matrices: \code{overlaps} for observed data and +#' \code{overlaps_P} for permuted data. #' #' @@ -46,12 +55,20 @@ calOverlaps <- function(D, idx_offset <- B D <- abs(D) for (b in idx_b) { - res1 <- abs(D[((b - 1) * nrow + 1):(b * nrow)] / (S[((b - 1) * nrow + 1):(b * nrow)] + ssq)) - res2 <- abs(D[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * nrow)] / - (S[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * nrow)] + ssq)) - pres1 <- abs(pD[((b - 1) * nrow + 1):(b * nrow)] / (pS[((b - 1) * nrow + 1):(b * nrow)] + ssq)) - pres2 <- abs(pD[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * nrow)] / - (pS[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * nrow)] + ssq)) + res1 <- + abs(D[((b - 1) * nrow + 1):(b * nrow)] / + (S[((b - 1) * nrow + 1):(b * nrow)] + ssq)) + res2 <- + abs(D[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * nrow)] / + (S[((b + idx_offset - 1) * + nrow + 1):((b + idx_offset) * nrow)] + ssq)) + pres1 <- + abs(pD[((b - 1) * nrow + 1):(b * nrow)] / + (pS[((b - 1) * nrow + 1):(b * nrow)] + ssq)) + pres2 <- + abs(pD[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * nrow)] + / (pS[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * + nrow)] + ssq)) sorted_res <- sort2_1R(res1, res2) res1 <- sorted_res$a diff --git a/R/calOverlaps.slr.R b/R/calOverlaps_slr.R similarity index 77% rename from R/calOverlaps.slr.R rename to R/calOverlaps_slr.R index 7661790..2e52b6a 100644 --- a/R/calOverlaps.slr.R +++ b/R/calOverlaps_slr.R @@ -1,22 +1,30 @@ #' Calculate Overlaps for Single-Label Replicates (SLR) #' -#' This function computes the overlap between two sets of observed and permuted values for single-label replicates (SLR). -#' It calculates the proportion of overlap between pairs of vectors (res1/res2 and pres1/pres2) after sorting them. +#' This function computes the overlap between two sets of observed and permuted +#' 'values for single-label replicates (SLR). +#' It calculates the proportion of overlap between pairs of vectors (res1/res2 +#' and pres1/pres2) after sorting them. #' #' @param D Numeric vector. Observed data values (e.g., differences). #' @param pD Numeric vector. Permuted data values. #' @param nrow Integer. Number of rows in each block of data. -#' @param N Integer vector. Number of top values to consider for overlap calculation. +#' @param N Integer vector. Number of top values to consider for overlap +#' calculation. #' @param N_len Integer. Length of the `N` vector. #' @param B Integer. Number of bootstrap samples or resampling iterations. -#' @param overlaps Numeric matrix. Matrix to store overlap results for observed data. -#' @param overlaps_P Numeric matrix. Matrix to store overlap results for permuted data. +#' @param overlaps Numeric matrix. Matrix to store overlap results for +#' observed data. +#' @param overlaps_P Numeric matrix. Matrix to store overlap results for +#' permuted data. #' #' @details -#' The function calculates the overlap for two sets of comparisons: one for observed data (`res1`/`res2`) and one for permuted data (`pres1`/`pres2`). -#' For each bootstrap sample, the function orders the two vectors being compared, then computes the proportion of overlap for the top `N` values. +#' The function calculates the overlap for two sets of comparisons: one for +#' observed data (`res1`/`res2`) and one for permuted data (`pres1`/`pres2`). +#' For each bootstrap sample, the function orders the two vectors being +#' compared, then computes the proportion of overlap for the top `N` values. #' -#' @return A list containing two matrices: \code{overlaps} for observed data and \code{overlaps_P} for permuted data. +#' @return A list containing two matrices: \code{overlaps} for observed data and +#' \code{overlaps_P} for permuted data. #' #' @@ -34,9 +42,11 @@ calOverlaps.slr <- function(D, pD, nrow, N, N_len, B, overlaps, overlaps_P) { D <- abs(D) for (b in idx_b) { res1 <- abs(D[((b - 1) * nrow + 1):(b * nrow)]) - res2 <- abs(D[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * nrow)]) + res2 <- + abs(D[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * nrow)]) pres1 <- abs(pD[((b - 1) * nrow + 1):(b * nrow)]) - pres2 <- abs(pD[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * nrow)]) + pres2 <- + abs(pD[((b + idx_offset - 1) * nrow + 1):((b + idx_offset) * nrow)]) sorted_res <- sort2_1R(res1, res2) res1 <- sorted_res$a res2 <- sorted_res$b diff --git a/R/calculateFalseDiscoveryRate.R b/R/calculateFalseDiscoveryRate.R index 8de8b7b..2a52e79 100644 --- a/R/calculateFalseDiscoveryRate.R +++ b/R/calculateFalseDiscoveryRate.R @@ -1,25 +1,35 @@ #' Calculate False Discovery Rate (FDR) Using Permuted Values (Adjusted) #' -#' This function calculates the false discovery rate (FDR) by comparing observed values to permuted values. -#' The function sorts observed values, compares them against permuted data, and computes FDR using the median of permutation results. +#' This function calculates the false discovery rate (FDR) by comparing +#' observed values to permuted values.The function sorts observed values, +#' compares them against permuted data, and computes FDR using the median of +#' permutation results. #' -#' @param observedValues Numeric vector. The observed test statistics or values to be evaluated for significance. -#' @param permutedValues Numeric matrix. The permuted test statistics or values, with rows corresponding to the same values as in `observedValues` and columns representing different permutations. -#' @param showProgress Logical. If `TRUE`, a progress bar will be shown during the computation. +#' @param observedValues Numeric vector. The observed test statistics or values +#' to be evaluated for significance. +#' @param permutedValues Numeric matrix. The permuted test statistics or values, +#' with rows corresponding to the same values as in `observedValues` and +#' columns representing different permutations. +#' @param showProgress Logical. If `TRUE`, a progress bar will be shown during +#' the computation. #' -#' @return A numeric vector of the same length as `observedValues`, containing the estimated FDR for each observed value. +#' @return A numeric vector of the same length as `observedValues`, containing +#' the estimated FDR for each observed value. #' @importFrom stats median #' #' @examples #' observedValues <- c(2.5, 1.8, 3.1, 0.7, 2.9) #' set.seed(123) #' permutedValues <- matrix(rnorm(5 * 5, mean = 2, sd = 1), nrow = 5) -#' fdr <- calculateFalseDiscoveryRate(observedValues, permutedValues, showProgress = FALSE) +#' fdr <- calculateFalseDiscoveryRate(observedValues, permutedValues, +#' showProgress = FALSE +#' ) #' print(fdr) #' #' @export -calculateFalseDiscoveryRate <- function(observedValues, permutedValues, showProgress = FALSE) { +calculateFalseDiscoveryRate <- function(observedValues, permutedValues, + showProgress = FALSE) { observedAbs <- abs(observedValues) permutedAbs <- abs(permutedValues) ord.obs <- order(observedAbs, decreasing = TRUE, na.last = TRUE) @@ -42,18 +52,28 @@ calculateFalseDiscoveryRate <- function(observedValues, permutedValues, showProg } falseDiscoveryRate <- apply(FDRmatrix, 1, median) falseDiscoveryRate[falseDiscoveryRate > 1] <- 1 - falseDiscoveryRate[ord.obs] <- rev(sapply(length(falseDiscoveryRate):1, function(x) return(min(falseDiscoveryRate[ord.obs][x:length(falseDiscoveryRate)])))) + falseDiscoveryRate[ord.obs] <- + rev(sapply( + length(falseDiscoveryRate):1, + function(x) + return(min(falseDiscoveryRate + [ord.obs][x:length(falseDiscoveryRate)])) + )) return(falseDiscoveryRate) } #' Count Larger Permuted Values (Modified) #' -#' This helper function compares observed values against permuted values and counts the number of permuted values that are greater than or equal to each observed value. +#' This helper function compares observed values against permuted values and +#' counts the number of permuted values that are greater than or equal to each +#' observed value. #' #' @param observedVec Numeric vector. The observed values. -#' @param permutedVec Numeric vector. The permuted values to compare against the observed values. +#' @param permutedVec Numeric vector. The permuted values to compare against +#' the observed values. #' -#' @return A numeric vector containing the counts of permuted values greater than or equal to the corresponding observed values. +#' @return A numeric vector containing the counts of permuted values greater +#' than or equal to the corresponding observed values. #' #' @@ -69,7 +89,10 @@ countLargerThan <- function(observedVec, permutedVec) { observedInPermuted <- observedVec %in% permutedVec # Combine observed and permuted into a single sorted vector - combinedSorted <- sort(c(observedVec, permutedVec), decreasing = TRUE, na.last = TRUE) + combinedSorted <- sort(c(observedVec, permutedVec), + decreasing = TRUE, + na.last = TRUE + ) # Match observed elements to the combined vector positions combinedPos <- match(observedVec, combinedSorted) diff --git a/R/proteogenomicPD.r b/R/proteogenomicPD.r index deb054e..75e68ab 100644 --- a/R/proteogenomicPD.r +++ b/R/proteogenomicPD.r @@ -1,10 +1,15 @@ #' proteogenomicPD Dataset #' -#' A \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} object +#' A \code{ +#' \link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +#' object #' #' @name proteogenomicPD #' @docType data -#' @format An instance of the \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} class with the following assays: +#' @format An instance of the +#' \code{ +#' \link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +#' class with the following assays: #' NULL diff --git a/R/testStatOptimized.R b/R/testStatOptimized.R index 16ff41c..746cfb7 100644 --- a/R/testStatOptimized.R +++ b/R/testStatOptimized.R @@ -1,13 +1,23 @@ #' Optimized Test Statistic Calculation #' -#' This function calculates the test statistic (mean differences and standard deviations) for comparing two or more groups of samples, with support for paired or unpaired samples. +#' This function calculates the test statistic (mean differences and standard +#' deviations) for comparing two or more groups of samples, with support for +#' paired or unpaired samples. #' -#' @param isPaired Logical. If TRUE, calculates test statistics for paired samples. If FALSE, for unpaired samples. -#' @param x List of matrices or data frames. Each element represents a group of samples (columns) with the same set of features (rows). +#' @param isPaired Logical. If TRUE, calculates test statistics for paired +#' samples. If FALSE, for unpaired samples. +#' @param x List of matrices or data frames. Each element represents a group of +#' samples (columns) with the same set of features (rows). #' #' @details -#' The function supports comparison between two groups or multiple groups of samples. For two groups, it computes the mean differences and pooled standard deviations for unpaired samples, or paired standard deviations for paired samples. -#' When comparing more than two groups, it calculates the mean differences and standard deviations across all groups. For unpaired samples, a scaling factor based on sample size is used. Paired comparisons are only supported for two groups. +#' The function supports comparison between two groups or multiple groups of +#' samples. For two groups, it computes the mean differences and pooled +#' standard deviations for unpaired samples, or paired standard deviations for +#' paired samples. +#' When comparing more than two groups, it calculates the mean differences and +#' standard deviations across all groups. For unpaired samples, a scaling factor +#' based on sample size is used. Paired comparisons are only supported for two +#' groups. #' #' @return A list containing: #' \itemize{ @@ -15,11 +25,6 @@ #' \item \code{s}: Standard deviations (pooled or paired). #' } #' -#' - - - - testStatOptimized <- function(isPaired, x) { sampleGroups <- x if (length(sampleGroups) == 2) { @@ -33,16 +38,24 @@ testStatOptimized <- function(isPaired, x) { nonNAcountA <- rowSums(!is.na(groupA)) nonNAcountB <- rowSums(!is.na(groupB)) meanDiff <- meanB - meanA - pooledSD <- sqrt(((sumSqA + sumSqB) / (nonNAcountA + nonNAcountB - 2)) * (1 / nonNAcountA + 1 / nonNAcountB)) + pooledSD <- sqrt(((sumSqA + sumSqB) / + (nonNAcountA + nonNAcountB - 2)) * + (1 / nonNAcountA + 1 / nonNAcountB)) insufficientSamples <- which(nonNAcountA < 2 | nonNAcountB < 2) meanDiff[insufficientSamples] <- 0 pooledSD[insufficientSamples] <- 1 return(list(d = meanDiff, s = pooledSD)) } else { - covarianceAB <- rowSums((groupA - meanA) * (groupB - meanB), na.rm = TRUE) + covarianceAB <- rowSums((groupA - meanA) * (groupB - meanB), + na.rm = TRUE + ) pairedCount <- rowSums(!is.na(groupA * groupB)) meanDiff <- meanB - meanA - pairedSD <- sqrt(((sumSqA + sumSqB) / (2 * pairedCount - 2)) * (2 / pairedCount) - 2 / (pairedCount * (pairedCount - 1)) * covarianceAB) + pairedSD <- sqrt(((sumSqA + sumSqB) / + (2 * pairedCount - 2)) * + (2 / pairedCount) - 2 / + (pairedCount * (pairedCount - 1)) * + covarianceAB) insufficientSamples <- which(pairedCount < 2) meanDiff[insufficientSamples] <- 0 pairedSD[insufficientSamples] <- 1 @@ -51,16 +64,21 @@ testStatOptimized <- function(isPaired, x) { } else if (length(sampleGroups) > 2) { allSamples <- do.call("cbind", sampleGroups) if (!isPaired) { - factorScaling <- sum(sapply(sampleGroups, ncol)) / prod(sapply(sampleGroups, ncol)) + factorScaling <- sum(sapply(sampleGroups, ncol)) / + prod(sapply(sampleGroups, ncol)) rowVariance <- rowSums(sapply(sampleGroups, function(group) ( - rowMeans(group, na.rm = TRUE) - rowMeans(allSamples, na.rm = TRUE) + rowMeans(group, na.rm = TRUE) - rowMeans(allSamples, + na.rm = TRUE + ) )^2)) meanDiff <- sqrt(factorScaling * rowVariance) scalingFactor <- 1 / sum(sapply(sampleGroups, ncol) - 1) * sum(1 / sapply(sampleGroups, ncol)) totalVariance <- rowSums(sapply(sampleGroups, function(group) - rowSums((group - rowMeans(group, na.rm = TRUE))^2, na.rm = TRUE))) + rowSums((group - rowMeans(group, na.rm = TRUE))^2, + na.rm = TRUE + ))) standardDev <- sqrt(scalingFactor * totalVariance) } else { stop("Multiple paired groups are not supported!") diff --git a/R/testStatSurvivalOptimized.R b/R/testStatSurvivalOptimized.R index fb90e4d..6866c7e 100644 --- a/R/testStatSurvivalOptimized.R +++ b/R/testStatSurvivalOptimized.R @@ -1,65 +1,74 @@ #' @title Test Statistics for Survival Data -#' @description This function calculates mean differences and standard deviations for survival data -#' across different sample groups, considering at-risk samples at each unique event time. +#' @description This function calculates mean differences and standard +#' deviations for survival data across different sample groups, considering +#' at-risk samples at each unique event time. #' -#' @param x A list of matrices or data frames, where each element represents a group of samples -#' (columns) with the same set of features (rows). -#' @param survivalTime A numeric vector containing the survival times for each sample. -#' @param survivalEvent A binary vector indicating the occurrence of an event (1 for event, 0 for censored) -#' for each sample. +#' @param x A list of matrices or data frames, where each element represents +#' a group of samples (columns) with the same set of features (rows). +#' @param survivalTime A numeric vector containing the survival times for +#' each sample. +#' @param survivalEvent A binary vector indicating the occurrence of an event +#' (1 for event, 0 for censored) for each sample. #' #' @return A list containing: -#' \item{d}{A numeric vector of mean differences for each feature across the groups.} +#' \item{d}{A numeric vector of mean differences for each feature across +#' the groups.} #' \item{s}{A numeric vector of standard deviations for the mean differences.} #' -#' @details The function identifies unique times from the survival data and computes the mean difference -#' for each time point by comparing samples that experienced the event against those at risk. -#' It calculates the standard deviation based on the variation among at-risk samples. +#' @details The function identifies unique times from the survival data and +#' computes the mean difference for each time point by comparing samples that +#' experienced the event against those at risk. It calculates the standard +#' deviation based on the variation among at-risk samples. #' #' -testStatSurvivalOptimized <- function(x, - survivalTime, - survivalEvent) { - sampleGroups <- x - allSamples <- do.call("cbind", sampleGroups) - uniqueTimes <- unique(survivalTime[survivalEvent == 1]) +testStatSurvivalOptimized <- + function(x, survivalTime, survivalEvent) { + sampleGroups <- x + allSamples <- do.call("cbind", sampleGroups) + uniqueTimes <- unique(survivalTime[survivalEvent == 1]) - meanDifference <- vector(mode = "numeric", length = nrow(allSamples)) + meanDifference <- vector(mode = "numeric", length = nrow(allSamples)) - for (currentTime in uniqueTimes) { - indicesAtRisk <- which(survivalTime >= currentTime) - indicesEvent <- which(survivalTime == currentTime) - eventIndices <- indicesEvent[which(survivalEvent[indicesEvent] == 1)] + for (currentTime in uniqueTimes) { + indicesAtRisk <- which(survivalTime >= currentTime) + indicesEvent <- which(survivalTime == currentTime) + eventIndices <- + indicesEvent[which(survivalEvent[indicesEvent] == 1)] - if (length(indicesAtRisk) > 1) { - meanDifference <- meanDifference + ( - rowSums(as.data.frame(allSamples[, eventIndices]), na.rm = TRUE) - - length(eventIndices) * rowMeans(allSamples[, indicesAtRisk], na.rm = TRUE) - ) + if (length(indicesAtRisk) > 1) { + meanDifference <- meanDifference + ( + rowSums(as.data.frame(allSamples[, eventIndices]), + na.rm = TRUE + ) - + length(eventIndices) * + rowMeans(allSamples[, indicesAtRisk], na.rm = TRUE) + ) + } } - } - survivalSD <- vector(mode = "numeric", length = nrow(allSamples)) + survivalSD <- vector(mode = "numeric", length = nrow(allSamples)) - for (currentTime in uniqueTimes) { - indicesAtRisk <- which(survivalTime >= currentTime) - indicesEvent <- which(survivalTime == currentTime) - eventIndices <- indicesEvent[which(survivalEvent[indicesEvent] == 1)] + for (currentTime in uniqueTimes) { + indicesAtRisk <- which(survivalTime >= currentTime) + indicesEvent <- which(survivalTime == currentTime) + eventIndices <- + indicesEvent[which(survivalEvent[indicesEvent] == 1)] - if (length(indicesAtRisk) > 1) { - survivalSD <- survivalSD + ((length(eventIndices) / length(indicesAtRisk)) * - rowSums(( - allSamples[, indicesAtRisk] - - rowMeans(allSamples[, indicesAtRisk], na.rm = TRUE) - )^2, na.rm = TRUE)) + if (length(indicesAtRisk) > 1) { + survivalSD <- survivalSD + ((length(eventIndices) / + length(indicesAtRisk)) * + rowSums(( + allSamples[, indicesAtRisk] - + rowMeans(allSamples[, indicesAtRisk], na.rm = TRUE) + )^2, na.rm = TRUE)) + } } - } - survivalSD <- sqrt(survivalSD) + survivalSD <- sqrt(survivalSD) - return(list(d = meanDifference, s = survivalSD)) -} + return(list(d = meanDifference, s = survivalSD)) + } diff --git a/man/LimROTS.Rd b/man/LimROTS.Rd index 8e96657..869e992 100644 --- a/man/LimROTS.Rd +++ b/man/LimROTS.Rd @@ -1,8 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LimROTS.r +% Please edit documentation in R/LimROTS.R \name{LimROTS} \alias{LimROTS} -\title{LimROTS: A Hybrid Method Integrating Empirical Bayes and Reproducibility-Optimized Statistics for Robust Analysis of Proteomics and Metabolomics Data} +\title{LimROTS: A Hybrid Method Integrating Empirical Bayes and +Reproducibility-Optimized Statistics for Robust Analysis of Proteomics and +Metabolomics Data} \usage{ LimROTS( x, @@ -27,43 +29,72 @@ LimROTS( ) } \arguments{ -\item{x}{A \code{SummarizedExperiment} object or a matrix where rows represent features (e.g., genes, proteins) and columns represent samples. The values should be log-transformed.} +\item{x}{A \code{SummarizedExperiment} object or a matrix where rows +represent features (e.g., genes, proteins) and columns represent samples. +The values should be log-transformed.} -\item{B}{An integer specifying the number of bootstrap iterations. Default is 1000.} +\item{B}{An integer specifying the number of bootstrap iterations. +Default is 1000.} -\item{K}{An optional integer representing the top list size for ranking. If not specified, it is set to one-fourth of the number of features.} +\item{K}{An optional integer representing the top list size for ranking. +If not specified, it is set to one-fourth of the number of features.} -\item{a1}{Optional numeric value used in the optimization process. If defined by the user, no optimization occurs.} +\item{a1}{Optional numeric value used in the optimization process. +If defined by the user, no optimization occurs.} -\item{a2}{Optional numeric value used in the optimization process. If defined by the user, no optimization occurs.} +\item{a2}{Optional numeric value used in the optimization process. +If defined by the user, no optimization occurs.} -\item{log}{Logical, indicating whether the data is already log-transformed. Default is \code{TRUE}.} +\item{log}{Logical, indicating whether the data is already log-transformed. +Default is \code{TRUE}.} -\item{progress}{Logical, indicating whether to display a progress bar during bootstrap sampling. Default is \code{FALSE}.} +\item{progress}{Logical, indicating whether to display a progress bar during +bootstrap sampling. Default is \code{FALSE}.} -\item{verbose}{Logical, indicating whether to display messages during the function's execution. Default is \code{TRUE}.} +\item{verbose}{Logical, indicating whether to display messages during the +function's execution. Default is \code{TRUE}.} -\item{meta.info}{A data frame containing sample-level metadata, where each row corresponds to a sample. It should include the grouping variable specified in \code{group.name}. If \code{x} is a \code{SummarizedExperiment} object, \code{meta.info} must be a vector of the metadata needed for the model to run and can be retrieved using \code{colData()}.} +\item{meta.info}{A data frame containing sample-level metadata, where each +row corresponds to a sample. It should include the grouping variable +specified in \code{group.name}. If \code{x} is a \code{SummarizedExperiment} +object, \code{meta.info} must be a vector of the metadata needed for the +model to run and can be retrieved using \code{colData()}.} -\item{cluster}{A parallel cluster object for distributed computation, e.g., created by \code{makeCluster()}. Default is 2.} +\item{cluster}{A parallel cluster object for distributed computation, +e.g., created by \code{makeCluster()}. Default is 2.} -\item{group.name}{A string specifying the column in \code{meta.info} that represents the groups or conditions for comparison.} +\item{group.name}{A string specifying the column in \code{meta.info} that +represents the groups or conditions for comparison.} -\item{formula.str}{A formula string used when covariates are present in meta.info for modeling. It should include "~ 0 + ..." to exclude the intercept from the model.} +\item{formula.str}{A formula string used when covariates are present in meta. +info for modeling. It should include "~ 0 + ..." to exclude the +intercept from the model.} -\item{survival}{Logical, indicating whether to enable survival analysis. If \code{TRUE}, then \code{meta.info} should contain \code{time} and \code{event} columns.} +\item{survival}{Logical, indicating whether to enable survival analysis. +If \code{TRUE}, then \code{meta.info} should contain \code{time} and +\code{event} columns.} -\item{paired}{Logical, indicating whether the data represent paired samples. Default is \code{FALSE}.} +\item{paired}{Logical, indicating whether the data represent paired samples. +'Default is \code{FALSE}.} -\item{n.ROTS}{Logical. If \code{TRUE}, all parameters related to \code{LimROTS} will be ignored, and the original \code{ROTS} analysis will run. This must be \code{TRUE} when \code{survival} or \code{paired} is set to \code{TRUE}.} +\item{n.ROTS}{Logical. If \code{TRUE}, all parameters related to +\code{LimROTS} will be ignored, and the original +\code{ROTS} analysis will run. This must be \code{TRUE} when +\code{survival} or \code{paired} is set to \code{TRUE}.} -\item{seed.cl}{An integer specifying the seed for randomization; if not provided, the default is 1234.} +\item{seed.cl}{An integer specifying the seed for randomization; +if not provided, the default is 1234.} -\item{robust}{indicating whether robust fitting should be used. Default is TRUE, see \link{eBayes}.} +\item{robust}{indicating whether robust fitting should be used. +Default is TRUE, see \link{eBayes}.} -\item{trend}{indicating whether to include trend fitting in the differential expression analysis. Default is TRUE. see \link{eBayes}.} +\item{trend}{indicating whether to include trend fitting in the +differential expression analysis. Default is TRUE. see \link{eBayes}.} -\item{permutating.group}{Logical, If \code{TRUE}, the permutation for calculating the null distribution is performed by permuting the target group only specified in \code{group.name}. If FALSE, the entire \code{meta.info} will be permuted (recommended to be set to TRUE).} +\item{permutating.group}{Logical, If \code{TRUE}, the permutation for +calculating the null distribution is performed by permuting the target +group only specified in \code{group.name}. If FALSE, the entire +\code{meta.info} will be permuted (recommended to be set to TRUE).} } \value{ A list of class \code{"list"} with the following elements: @@ -81,30 +112,65 @@ A list of class \code{"list"} with the following elements: \item{BH.pvalue}{Benjamini-Hochberg adjusted p-values.} } \description{ -The \code{LimROTS} function employs a reproducibility-optimized test statistic utilising the limma and ROTS methodology to simulate complex experimental designs. +The \code{LimROTS} function employs a reproducibility-optimized test statistic +utilising the limma and ROTS methodology to simulate complex experimental +designs. } \details{ -The \strong{LimROTS} approach initially uses the \link{limma} package to simulate the intensity data of proteins and metabolites. A linear model is subsequently fitted using the design matrix. Empirical Bayes variance shrinking is then implemented. To obtain the moderated t-statistics, the adjusted standard error \eqn{SEpost = √(s2.post) \times unscaled SD} for each feature is computed, along with the regression coefficient for each feature (indicating the impact of variations in the experimental settings). The \link{ROTS} approach establishes optimality based on the largest overlap of top-ranked features within group-preserving bootstrap datasets, Finally based on the optimized parameters \eqn{\alpha1} and \eqn{\alpha2} this equation used to calculates the final statistics: \deqn{t_{\alpha}(g) = \frac{\beta}{\alpha1 + \alpha2 \times SEpost}} -where \eqn{t_{\alpha}(g)} is the final statistics for each feature, \eqn{\beta} is the coefficient, and SEpost is the the adjusted standard error. LimROTS generates p-values from permutation samples using the implementation available in \link{qvalue} package, along with internal implementation of FDR adapted from ROTS package. Additionally, the qvalue package is used to calculate q-values, were the proportion of true null p-values is set to the bootstrap method \link{pi0est}. We recommend using permutation-derived p-values and qvalues. +The \strong{LimROTS} approach initially uses the +\link{limma} package to simulate the intensity data of proteins and +metabolites. A linear model is subsequently fitted using the design matrix. +Empirical Bayes variance shrinking is then implemented. To obtain the +moderated t-statistics, the adjusted standard error +\eqn{SEpost = √(s2.post) +\times unscaled SD} for each feature is computed, along with the regression +coefficient for each feature (indicating the impact of variations in the +experimental settings). The \link[ROTS]{ROTS} approach establishes optimality +based on the largest overlap of top-ranked features within group-preserving +bootstrap datasets, Finally based on the optimized parameters +\eqn{\alpha1} and +\eqn{\alpha2} this equation used to calculates the final statistics: +\deqn{t_{\alpha}(g) = \frac{\beta}{\alpha1 + \alpha2 \times SEpost}}where +\eqn{t_{\alpha}(g)} is the final statistics for each feature, +\eqn{\beta} is the coefficient, and SEpost is the the adjusted +standard error. LimROTS generates p-values from permutation samples +using the implementation available in +\link{qvalue} package, along with internal implementation of FDR +adapted from ROTS package. Additionally, the qvalue package is used +to calculate q-values, were the proportion of true null p-values is +set to the bootstrap method \link{pi0est}. We recommend using +permutation-derived p-values and qvalues. } \examples{ # Example usage: -data <- data.frame(matrix(rnorm(500), nrow = 100, ncol = 10)) # Simulated data -meta.info <- data.frame(group = factor(rep(1:2, each = 5)), row.names = colnames(data)) +data <- data.frame(matrix(rnorm(500), nrow = 100, ncol = 10)) +# Simulated data +meta.info <- data.frame( + group = factor(rep(1:2, each = 5)), + row.names = colnames(data) +) formula.str <- "~ 0 + group" -result <- LimROTS(data, meta.info = meta.info, group.name = "group", - formula.str = formula.str, B = 10) +result <- LimROTS(data, + meta.info = meta.info, group.name = "group", + formula.str = formula.str, B = 10 +) } \references{ -Ritchie, M.E., Phipson, B., Wu, D., Hu, Y., Law, C.W., Shi, W., and Smyth, G.K. (2015). limma powers differential -expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Research 43(7), e47 - -Suomi T, Seyednasrollah F, Jaakkola M, Faux T, Elo L (2017). “ROTS: An R package for reproducibility-optimized -statistical testing.” \emph{PLoS computational biology}, \emph{13}(5), e1005562. \url{doi:10.1371/journal.pcbi.1005562} -\url{https://doi.org/10.1371/journal.pcbi.1005562}, \url{http://www.ncbi.nlm.nih.gov/pubmed/28542205} - -Elo LL, Filen S, Lahesmaa R, Aittokallio T. Reproducibility-optimized test statistic for ranking genes in microarray studies. -IEEE/ACM Trans Comput Biol Bioinform. 2008;5(3):423-431. \url{doi:10.1109/tcbb.2007.1078} +Ritchie, M.E., Phipson, B., Wu, D., Hu, Y., Law, C.W., Shi, W., and Smyth, +G.K. (2015). limma powers differential expression analyses for +RNA-sequencing and microarray studies. Nucleic Acids Research 43(7), e47 + +Suomi T, Seyednasrollah F, Jaakkola M, Faux T, Elo L (2017). “ROTS: An +R package for reproducibility-optimized statistical testing. +” \emph{PLoS computational biology}, \emph{13}(5), e1005562. +\url{doi:10.1371/journal.pcbi.1005562} +\url{https://doi.org/10.1371/journal.pcbi.1005562}, +\url{http://www.ncbi.nlm.nih.gov/pubmed/28542205} + +Elo LL, Filen S, Lahesmaa R, Aittokallio T. Reproducibility-optimized test +statistic for ranking genes in microarray studies. +IEEE/ACM Trans Comput Biol Bioinform. 2008;5(3):423-431. +\url{doi:10.1109/tcbb.2007.1078} } diff --git a/man/Optimizing.Rd b/man/Optimizing.Rd index 8016c59..b82cca8 100644 --- a/man/Optimizing.Rd +++ b/man/Optimizing.Rd @@ -11,7 +11,8 @@ Optimizing(B, ssq, N, D, S, pD, pS, verbose, progress) \item{ssq}{Numeric vector. Smoothing constants to be evaluated.} -\item{N}{Integer vector. Number of top values to consider for overlap calculation.} +\item{N}{Integer vector. Number of top values to consider for overlap +calculation.} \item{D}{Numeric matrix. Observed data values.} @@ -19,7 +20,8 @@ Optimizing(B, ssq, N, D, S, pD, pS, verbose, progress) \item{pD}{Numeric matrix. Permuted data values.} -\item{pS}{Numeric matrix. Standard errors or related values for permuted data.} +\item{pS}{Numeric matrix. Standard errors or related values for +permuted data.} \item{verbose}{Logical. If \code{TRUE}, progress messages will be displayed.} @@ -29,7 +31,8 @@ Optimizing(B, ssq, N, D, S, pD, pS, verbose, progress) A list containing the optimal parameters: \itemize{ \item \code{a1}: Optimal smoothing constant or 1 for SLR. -\item \code{a2}: SLR flag (1 if smoothing constant is optimal, 0 if SLR is optimal). +\item \code{a2}: SLR flag (1 if smoothing constant is optimal, +0 if SLR is optimal). \item \code{k}: Optimal number of top values to consider for overlap. \item \code{R}: Optimal overlap value. \item \code{Z}: Optimal z-score. @@ -37,9 +40,15 @@ A list containing the optimal parameters: } } \description{ -This function optimizes parameters by calculating overlaps between observed and permuted data for multiple values of a smoothing constant (\code{ssq}) and a single-label replicate (SLR) comparison. +This function optimizes parameters by calculating overlaps between observed +and permuted data for multiple values of a smoothing constant (\code{ssq}) and a +single-label replicate (SLR) comparison. } \details{ -The function calculates overlaps for a range of smoothing constants and identifies the optimal set of parameters by maximizing a z-score-based metric, which compares the overlap of observed data to permuted data. -It computes overlap matrices for both observed (\code{D} and \code{S}) and permuted (\code{pD} and \code{pS}) data and returns the optimal parameters based on the highest z-score. +The function calculates overlaps for a range of smoothing constants and +identifies the optimal set of parameters by maximizing a z-score-based +metric, which compares the overlap of observed data to permuted data. +It computes overlap matrices for both observed (\code{D} and \code{S}) and permuted +(\code{pD} and \code{pS}) data and returns the optimal parameters based on the +highest z-score. } diff --git a/man/SanityChecK.Rd b/man/SanityChecK.Rd index c396aa7..d49b7db 100644 --- a/man/SanityChecK.Rd +++ b/man/SanityChecK.Rd @@ -21,29 +21,41 @@ SanityChecK( ) } \arguments{ -\item{x}{A matrix-like object or a \code{SummarizedExperiment} containing the data to be analyzed.} +\item{x}{A matrix-like object or a \code{SummarizedExperiment} containing the +data to be analyzed.} -\item{B}{Integer. Number of bootstrap samples or resampling iterations. Default is 1000.} +\item{B}{Integer. Number of bootstrap samples or resampling iterations. +Default is 1000.} -\item{K}{Integer. Top list size. If NULL, it will be set to a quarter of the number of rows in the data matrix. Default is NULL.} +\item{K}{Integer. Top list size. If NULL, it will be set to a quarter of +the number of rows in the data matrix. Default is NULL.} \item{a1, a2}{Optional numeric parameters related to optimization.} -\item{meta.info}{Data frame. Metadata associated with the samples (columns of \code{data.exp}). If \code{data.exp} is a \code{SummarizedExperiment}, \code{meta.info} can be a vector of \code{colData} column names to use.} +\item{meta.info}{Data frame. Metadata associated with the samples +(columns of \code{data.exp}). If \code{data.exp} is a \code{SummarizedExperiment}, +\code{meta.info} can be a vector of \code{colData} column names to use.} -\item{group.name}{Character. Column name in \code{meta.info} that defines the groups or conditions for comparison.} +\item{group.name}{Character. Column name in \code{meta.info} that defines the +groups or conditions for comparison.} -\item{formula.str}{Optional character string representing the formula for the model.} +\item{formula.str}{Optional character string representing the formula for +the model.} -\item{survival}{Logical. If TRUE, survival analysis is used, requiring \code{time} and \code{event} columns in \code{meta.info}. Default is FALSE.} +\item{survival}{Logical. If TRUE, survival analysis is used, requiring +\code{time} and \code{event} columns in \code{meta.info}. Default is FALSE.} -\item{paired}{Logical. If TRUE, indicates paired test setup. Default is FALSE.} +\item{paired}{Logical. If TRUE, indicates paired test setup. +Default is FALSE.} -\item{n.ROTS}{Logical. If TRUE, uses the ROTS method instead of LimROTS. Default is FALSE.} +\item{n.ROTS}{Logical. If TRUE, uses the ROTS method instead of LimROTS. +Default is FALSE.} -\item{verbose}{Logical, indicating whether to display messages during the function's execution. Default is TRUE.} +\item{verbose}{Logical, indicating whether to display messages during the +function's execution. Default is TRUE.} -\item{log}{Logical, indicating whether the data is already log-transformed. Default is TRUE.} +\item{log}{Logical, indicating whether the data is already log-transformed. +Default is TRUE.} } \value{ A list containing: @@ -56,8 +68,14 @@ A list containing: } } \description{ -This function performs a series of checks and initial setups for input data, metadata, and parameters, ensuring everything is correctly formatted for downstream analysis. +This function performs a series of checks and initial setups for input data, +metadata, and parameters, ensuring everything is correctly formatted for +downstream analysis. } \details{ -This function checks whether the input data and metadata are in the correct format, processes metadata from a \code{SummarizedExperiment} object if provided, and ensures that group information is correctly specified. If no top list size (\code{K}) is provided, it defaults to a quarter of the number of rows in the data. +This function checks whether the input data and metadata are in the correct +format, processes metadata from a \code{SummarizedExperiment} object if provided, +and ensures that group information is correctly specified. If no top list +size (\code{K}) is provided, it defaults to a quarter of the number of rows in +the data. } diff --git a/man/UPS1.Case4.Rd b/man/UPS1.Case4.Rd index c9eae0e..c41c009 100644 --- a/man/UPS1.Case4.Rd +++ b/man/UPS1.Case4.Rd @@ -1,13 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/UPS1.Case4.r +% Please edit documentation in R/UPS1.Case4.R \docType{data} \name{UPS1.Case4} \alias{UPS1.Case4} \title{Spectronaut and ScaffoldDIA UPS1 Spiked Dataset case 4} \format{ -An instance of the \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} class with the following assays: +An instance of the +\code{ +\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +class with the following assays: \describe{ -\item{norm}{This assay includes log2 protein intensities calculated by averaging the peptides derived from the same protein} +\item{norm}{This assay includes log2 protein intensities calculated by +averaging the peptides derived from the same protein} } The object also contains colData and rowData: \describe{ @@ -18,17 +22,26 @@ The object also contains colData and rowData: The \code{colData} contains the following columns: \describe{ \item{SampleID}{Unique identifier for each sample.} -\item{Conc}{Experimental condition or group for each sample , representing different conc. of UPS1-spiked proteins.} +\item{Conc}{Experimental condition or group for each sample , +representing different conc. of UPS1-spiked proteins.} \item{tool}{software tool used, Spectronaut or ScaffoldDIA.} \item{fake.batch}{A fake digestion batch.} } } \source{ -Generated with both spectronaut and ScaffoldDIA separately. using a mixed mode acquisition method and FASTA mode for demonstration purposes. +Generated with both spectronaut and ScaffoldDIA separately. +using a mixed mode acquisition method and FASTA mode for demonstration +purposes. } \description{ -A \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} object containing DIA proteomics data from a UPS1-spiked E. coli proteins, processed using both Spectronaut and ScaffoldDIA separately, then merged. +A \code{ +\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +object containing DIA proteomics data from a UPS1-spiked E. coli proteins, +processed using both Spectronaut and ScaffoldDIA separately, then merged. } \references{ -Gotti, C., Roux-Dalvai, F., Joly-Beauparlant, C., Mangnier, L., Leclercq, M., & Droit, A. (2022). DIA proteomics data from a UPS1-spiked E.coli protein mixture processed with six software tools. In Data in Brief (Vol. 41, p. 107829). Elsevier BV. https://doi.org/10.1016/j.dib.2022.107829 +Gotti, C., Roux-Dalvai, F., Joly-Beauparlant, C., Mangnier, L., Leclercq, M., +& Droit, A. (2022). DIA proteomics data from a UPS1-spiked E.coli protein +mixture processed with six software tools. In Data in Brief +(Vol. 41, p. 107829). Elsevier BV. https://doi.org/10.1016/j.dib.2022.107829 } diff --git a/man/bootstrapS.Rd b/man/bootstrapS.Rd index 8fb6045..db28517 100644 --- a/man/bootstrapS.Rd +++ b/man/bootstrapS.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Bootstrap_functions.r +% Please edit documentation in R/Bootstrap_functions.R \name{bootstrapS} \alias{bootstrapS} \title{Generate Bootstrap Samples with Optional Pairing} @@ -9,30 +9,47 @@ bootstrapS(B, meta.info, group.name, paired) \arguments{ \item{B}{Integer. The number of bootstrap samples to generate.} -\item{meta.info}{Data frame. Metadata containing sample information, where each row corresponds to a sample.} +\item{meta.info}{Data frame. Metadata containing sample information, where +each row corresponds to a sample.} -\item{group.name}{Character. The name of the column in \code{meta.info} that defines the grouping variable for the samples.} +\item{group.name}{Character. The name of the column in \code{meta.info} that +defines the grouping variable for the samples.} -\item{paired}{Logical. If \code{TRUE}, the function ensures the bootstrap samples are paired between two groups.} +\item{paired}{Logical. If \code{TRUE}, the function ensures the bootstrap samples +are paired between two groups.} } \value{ -A matrix of dimension \code{B} x \code{n}, where \code{n} is the number of samples. Each row corresponds -to a bootstrap sample, and each entry is a resampled row name from the metadata. +A matrix of dimension \code{B} x \code{n}, where \code{n} is the +number of samples. Each row corresponds to a bootstrap sample, and each +entry is a resampled row name from the metadata. } \description{ -This function generates bootstrap samples from the input metadata. It samples with replacement -within each group defined in the metadata, and optionally adjusts for paired groups. +This function generates bootstrap samples from the input metadata. It samples +with replacement within each group defined in the metadata, and optionally +adjusts for paired groups. } \details{ -The function works by resampling the row names of the metadata for each group separately. If \code{paired} is \code{TRUE}, -it assumes there are exactly two groups and samples the second group based on the positions of the first group to maintain pairing. +The function works by resampling the row names of the metadata for each group +separately. If \code{paired} is \code{TRUE}, it assumes there are exactly two groups +and samples the second group based on the positions of the first group to +maintain pairing. } \examples{ # Example usage: set.seed(123) -meta.info <- data.frame(group = rep(c("A", "B"), each = 5), row.names = paste0("Sample", 1:10)) -bootstrapS(B = 10, meta.info = meta.info, group.name = "group", paired = FALSE) +meta.info <- data.frame( + group = rep(c("A", "B"), each = 5), + row.names = paste0("Sample", 1:10) +) +bootstrapS( + B = 10, meta.info = meta.info, group.name = "group", + paired = FALSE +) # Paired bootstrap sampling -bootstrapS(B = 10, meta.info = meta.info, group.name = "group", paired = TRUE) +bootstrapS( + B = 10, meta.info = meta.info, group.name = "group", + paired = TRUE +) + } diff --git a/man/bootstrapSamples.limRots.Rd b/man/bootstrapSamples.limRots.Rd index 5fbc527..1a0f0fe 100644 --- a/man/bootstrapSamples.limRots.Rd +++ b/man/bootstrapSamples.limRots.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Bootstrap_functions.r +% Please edit documentation in R/Bootstrap_functions.R \name{bootstrapSamples.limRots} \alias{bootstrapSamples.limRots} \title{Generate Stratified Bootstrap Samples for limRots} @@ -9,29 +9,40 @@ bootstrapSamples.limRots(B, meta.info, group.name) \arguments{ \item{B}{Integer. The number of bootstrap samples to generate.} -\item{meta.info}{Data frame. Metadata containing sample information, where each row corresponds to a sample. Factor columns in \code{meta.info} are used to define strata for sampling.} +\item{meta.info}{Data frame. Metadata containing sample information, +where each row corresponds to a sample. Factor columns in \code{meta.info} +are used to define strata for sampling.} -\item{group.name}{Character. The name of the column in \code{meta.info} that defines the grouping variable for the samples.} +\item{group.name}{Character. The name of the column in \code{meta.info} that +defines the grouping variable for the samples.} } \value{ -A matrix of dimension \code{B} x \code{n}, where \code{n} is the number of samples. Each row corresponds -to a bootstrap sample, and each entry is a resampled row name from the metadata, stratified by group and additional factors. +A matrix of dimension \code{B} x \code{n}, where \code{n} is the +number of samples. Each row corresponds to a bootstrap sample, and each +entry is a resampled row name from the metadata, stratified by group and +additional factors. } \description{ -This function generates stratified bootstrap samples based on the groupings and additional factors in the metadata. -The function ensures that samples are drawn proportionally based on strata defined by the interaction of factor columns in the metadata. +This function generates stratified bootstrap samples based on the groupings +and additional factors in the metadata. The function ensures that samples +are drawn proportionally based on strata defined by the interaction of +factor columns in the metadata. } \details{ -The function works by first identifying the factors in the \code{meta.info} data frame that are used to create strata for sampling. -Within each group defined by \code{group.name}, the function samples according to the strata proportions, ensuring that samples are drawn -from the correct groups and strata in a proportional manner. +The function works by first identifying the factors in the \code{meta.info} data +frame that are used to create strata for sampling. Within each group defined +by \code{group.name}, the function samples according to the strata proportions, +ensuring that samples are drawn from the correct groups and strata in a +proportional manner. } \examples{ # Example usage: set.seed(123) -meta.info <- data.frame(group = rep(c(1, 2), each = 5), +meta.info <- data.frame( + group = rep(c(1, 2), each = 5), batch = rep(c("A", "B"), 5), - row.names = paste0("Sample", 1:10)) + row.names = paste0("Sample", 1:10) +) meta.info$batch <- as.factor(meta.info$batch) bootstrapSamples.limRots(B = 10, meta.info = meta.info, group.name = "group") } diff --git a/man/calOverlaps.Rd b/man/calOverlaps.Rd index 70424ce..68a54ce 100644 --- a/man/calOverlaps.Rd +++ b/man/calOverlaps.Rd @@ -9,15 +9,18 @@ calOverlaps(D, S, pD, pS, nrow, N, N_len, ssq, B, overlaps, overlaps_P) \arguments{ \item{D}{Numeric vector. Observed data values (e.g., differences).} -\item{S}{Numeric vector. Standard errors or related values associated with the observed data.} +\item{S}{Numeric vector. Standard errors or related values associated with +the observed data.} \item{pD}{Numeric vector. Permuted data values (e.g., differences).} -\item{pS}{Numeric vector. Standard errors or related values associated with the permuted data.} +\item{pS}{Numeric vector. Standard errors or related values associated with +the permuted data.} \item{nrow}{Integer. Number of rows in each block of data.} -\item{N}{Integer vector. Number of top values to consider for overlap calculation.} +\item{N}{Integer vector. Number of top values to consider for overlap +calculation.} \item{N_len}{Integer. Length of the \code{N} vector.} @@ -25,18 +28,24 @@ calOverlaps(D, S, pD, pS, nrow, N, N_len, ssq, B, overlaps, overlaps_P) \item{B}{Integer. Number of bootstrap samples or resampling iterations.} -\item{overlaps}{Numeric matrix. Matrix to store overlap results for observed data.} +\item{overlaps}{Numeric matrix. Matrix to store overlap results for observed +data.} -\item{overlaps_P}{Numeric matrix. Matrix to store overlap results for permuted data.} +\item{overlaps_P}{Numeric matrix. Matrix to store overlap results for +permuted data.} } \value{ -A list containing two matrices: \code{overlaps} for observed data and \code{overlaps_P} for permuted data. +A list containing two matrices: \code{overlaps} for observed data and +\code{overlaps_P} for permuted data. } \description{ -This function calculates the overlap between observed and permuted data for two sets of comparisons. -It computes the ratio of overlap between pairs of vectors (res1/res2 and pres1/pres2) after sorting the values. +This function calculates the overlap between observed and permuted data for +two sets of comparisons. It computes the ratio of overlap between pairs of +vectors (res1/res2 and pres1/pres2) after sorting the values. } \details{ -The function calculates overlaps for two sets of comparisons: one for observed data (res1/res2) and one for permuted data (pres1/pres2). -For each bootstrap sample, the function orders the two vectors being compared, then calculates the proportion of overlap for the top \code{N} values. +The function calculates overlaps for two sets of comparisons: one for +observed data (res1/res2) and one for permuted data (pres1/pres2).For each +bootstrap sample, the function orders the two vectors being compared, then +calculates the proportion of overlap for the top \code{N} values. } diff --git a/man/calOverlaps.slr.Rd b/man/calOverlaps.slr.Rd index b9a91f3..edc1734 100644 --- a/man/calOverlaps.slr.Rd +++ b/man/calOverlaps.slr.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/calOverlaps.slr.R +% Please edit documentation in R/calOverlaps_slr.R \name{calOverlaps.slr} \alias{calOverlaps.slr} \title{Calculate Overlaps for Single-Label Replicates (SLR)} @@ -13,24 +13,32 @@ calOverlaps.slr(D, pD, nrow, N, N_len, B, overlaps, overlaps_P) \item{nrow}{Integer. Number of rows in each block of data.} -\item{N}{Integer vector. Number of top values to consider for overlap calculation.} +\item{N}{Integer vector. Number of top values to consider for overlap +calculation.} \item{N_len}{Integer. Length of the \code{N} vector.} \item{B}{Integer. Number of bootstrap samples or resampling iterations.} -\item{overlaps}{Numeric matrix. Matrix to store overlap results for observed data.} +\item{overlaps}{Numeric matrix. Matrix to store overlap results for +observed data.} -\item{overlaps_P}{Numeric matrix. Matrix to store overlap results for permuted data.} +\item{overlaps_P}{Numeric matrix. Matrix to store overlap results for +permuted data.} } \value{ -A list containing two matrices: \code{overlaps} for observed data and \code{overlaps_P} for permuted data. +A list containing two matrices: \code{overlaps} for observed data and +\code{overlaps_P} for permuted data. } \description{ -This function computes the overlap between two sets of observed and permuted values for single-label replicates (SLR). -It calculates the proportion of overlap between pairs of vectors (res1/res2 and pres1/pres2) after sorting them. +This function computes the overlap between two sets of observed and permuted +'values for single-label replicates (SLR). +It calculates the proportion of overlap between pairs of vectors (res1/res2 +and pres1/pres2) after sorting them. } \details{ -The function calculates the overlap for two sets of comparisons: one for observed data (\code{res1}/\code{res2}) and one for permuted data (\code{pres1}/\code{pres2}). -For each bootstrap sample, the function orders the two vectors being compared, then computes the proportion of overlap for the top \code{N} values. +The function calculates the overlap for two sets of comparisons: one for +observed data (\code{res1}/\code{res2}) and one for permuted data (\code{pres1}/\code{pres2}). +For each bootstrap sample, the function orders the two vectors being +compared, then computes the proportion of overlap for the top \code{N} values. } diff --git a/man/calculateFalseDiscoveryRate.Rd b/man/calculateFalseDiscoveryRate.Rd index 4312be8..ce4bdad 100644 --- a/man/calculateFalseDiscoveryRate.Rd +++ b/man/calculateFalseDiscoveryRate.Rd @@ -11,24 +11,33 @@ calculateFalseDiscoveryRate( ) } \arguments{ -\item{observedValues}{Numeric vector. The observed test statistics or values to be evaluated for significance.} +\item{observedValues}{Numeric vector. The observed test statistics or values +to be evaluated for significance.} -\item{permutedValues}{Numeric matrix. The permuted test statistics or values, with rows corresponding to the same values as in \code{observedValues} and columns representing different permutations.} +\item{permutedValues}{Numeric matrix. The permuted test statistics or values, +with rows corresponding to the same values as in \code{observedValues} and +columns representing different permutations.} -\item{showProgress}{Logical. If \code{TRUE}, a progress bar will be shown during the computation.} +\item{showProgress}{Logical. If \code{TRUE}, a progress bar will be shown during +the computation.} } \value{ -A numeric vector of the same length as \code{observedValues}, containing the estimated FDR for each observed value. +A numeric vector of the same length as \code{observedValues}, containing +the estimated FDR for each observed value. } \description{ -This function calculates the false discovery rate (FDR) by comparing observed values to permuted values. -The function sorts observed values, compares them against permuted data, and computes FDR using the median of permutation results. +This function calculates the false discovery rate (FDR) by comparing +observed values to permuted values.The function sorts observed values, +compares them against permuted data, and computes FDR using the median of +permutation results. } \examples{ observedValues <- c(2.5, 1.8, 3.1, 0.7, 2.9) set.seed(123) permutedValues <- matrix(rnorm(5 * 5, mean = 2, sd = 1), nrow = 5) -fdr <- calculateFalseDiscoveryRate(observedValues, permutedValues, showProgress = FALSE) +fdr <- calculateFalseDiscoveryRate(observedValues, permutedValues, + showProgress = FALSE +) print(fdr) } diff --git a/man/countLargerThan.Rd b/man/countLargerThan.Rd index dcf694c..7a57952 100644 --- a/man/countLargerThan.Rd +++ b/man/countLargerThan.Rd @@ -9,11 +9,15 @@ countLargerThan(observedVec, permutedVec) \arguments{ \item{observedVec}{Numeric vector. The observed values.} -\item{permutedVec}{Numeric vector. The permuted values to compare against the observed values.} +\item{permutedVec}{Numeric vector. The permuted values to compare against +the observed values.} } \value{ -A numeric vector containing the counts of permuted values greater than or equal to the corresponding observed values. +A numeric vector containing the counts of permuted values greater +than or equal to the corresponding observed values. } \description{ -This helper function compares observed values against permuted values and counts the number of permuted values that are greater than or equal to each observed value. +This helper function compares observed values against permuted values and +counts the number of permuted values that are greater than or equal to each +observed value. } diff --git a/man/permutatedS.Rd b/man/permutatedS.Rd index 46ed7e5..2023f65 100644 --- a/man/permutatedS.Rd +++ b/man/permutatedS.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Bootstrap_functions.r +% Please edit documentation in R/Bootstrap_functions.R \name{permutatedS} \alias{permutatedS} \title{Generate Permutated Samples} @@ -7,24 +7,31 @@ permutatedS(meta.info, B) } \arguments{ -\item{meta.info}{Data frame. Metadata containing sample information, where each row corresponds to a sample.} +\item{meta.info}{Data frame. Metadata containing sample information, where +each row corresponds to a sample.} \item{B}{Integer. The number of permutations to generate.} } \value{ -A matrix of dimension \code{B} x \code{n}, where \code{n} is the number of samples (i.e., rows in \code{meta.info}). -Each row is a permutation of the row names of the metadata. +A matrix of dimension \code{B} x \code{n}, where \code{n} is the +number of samples (i.e., rows in \code{meta.info}).Each row is a permutation of +the row names of the metadata. } \description{ -This function generates permuted samples by shuffling the row names of the metadata. +This function generates permuted samples by shuffling the row names of the +metadata. } \details{ -The function creates a matrix where each row is a permuted version of the row names from \code{meta.info}. -This can be used to generate null distributions or perform randomization-based tests. +The function creates a matrix where each row is a permuted version of the +row names from \code{meta.info}.This can be used to generate null distributions +or perform randomization-based tests. } \examples{ # Example usage: set.seed(123) -meta.info <- data.frame(group = rep(c("A", "B"), each = 5), row.names = paste0("Sample", 1:10)) +meta.info <- data.frame( + group = rep(c("A", "B"), each = 5), + row.names = paste0("Sample", 1:10) +) permutatedS(meta.info = meta.info, B = 10) } diff --git a/man/proteogenomicPD.Rd b/man/proteogenomicPD.Rd index 68fde80..791c581 100644 --- a/man/proteogenomicPD.Rd +++ b/man/proteogenomicPD.Rd @@ -1,12 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/proteogenomicPD.r +% Please edit documentation in R/proteogenomicPD.R \docType{data} \name{proteogenomicPD} \alias{proteogenomicPD} \title{proteogenomicPD Dataset} \format{ -An instance of the \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} class with the following assays: +An instance of the +\code{ +\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +class with the following assays: } \description{ -A \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} object +A \code{ +\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} +object } diff --git a/man/testStatOptimized.Rd b/man/testStatOptimized.Rd index 7e0dc31..81aec4e 100644 --- a/man/testStatOptimized.Rd +++ b/man/testStatOptimized.Rd @@ -7,9 +7,11 @@ testStatOptimized(isPaired, x) } \arguments{ -\item{isPaired}{Logical. If TRUE, calculates test statistics for paired samples. If FALSE, for unpaired samples.} +\item{isPaired}{Logical. If TRUE, calculates test statistics for paired +samples. If FALSE, for unpaired samples.} -\item{x}{List of matrices or data frames. Each element represents a group of samples (columns) with the same set of features (rows).} +\item{x}{List of matrices or data frames. Each element represents a group of +samples (columns) with the same set of features (rows).} } \value{ A list containing: @@ -19,9 +21,17 @@ A list containing: } } \description{ -This function calculates the test statistic (mean differences and standard deviations) for comparing two or more groups of samples, with support for paired or unpaired samples. +This function calculates the test statistic (mean differences and standard +deviations) for comparing two or more groups of samples, with support for +paired or unpaired samples. } \details{ -The function supports comparison between two groups or multiple groups of samples. For two groups, it computes the mean differences and pooled standard deviations for unpaired samples, or paired standard deviations for paired samples. -When comparing more than two groups, it calculates the mean differences and standard deviations across all groups. For unpaired samples, a scaling factor based on sample size is used. Paired comparisons are only supported for two groups. +The function supports comparison between two groups or multiple groups of +samples. For two groups, it computes the mean differences and pooled +standard deviations for unpaired samples, or paired standard deviations for +paired samples. +When comparing more than two groups, it calculates the mean differences and +standard deviations across all groups. For unpaired samples, a scaling factor +based on sample size is used. Paired comparisons are only supported for two +groups. } diff --git a/man/testStatSurvivalOptimized.Rd b/man/testStatSurvivalOptimized.Rd index 90235a3..b0516f0 100644 --- a/man/testStatSurvivalOptimized.Rd +++ b/man/testStatSurvivalOptimized.Rd @@ -7,25 +7,29 @@ testStatSurvivalOptimized(x, survivalTime, survivalEvent) } \arguments{ -\item{x}{A list of matrices or data frames, where each element represents a group of samples -(columns) with the same set of features (rows).} +\item{x}{A list of matrices or data frames, where each element represents +a group of samples (columns) with the same set of features (rows).} -\item{survivalTime}{A numeric vector containing the survival times for each sample.} +\item{survivalTime}{A numeric vector containing the survival times for +each sample.} -\item{survivalEvent}{A binary vector indicating the occurrence of an event (1 for event, 0 for censored) -for each sample.} +\item{survivalEvent}{A binary vector indicating the occurrence of an event +(1 for event, 0 for censored) for each sample.} } \value{ A list containing: -\item{d}{A numeric vector of mean differences for each feature across the groups.} +\item{d}{A numeric vector of mean differences for each feature across +the groups.} \item{s}{A numeric vector of standard deviations for the mean differences.} } \description{ -This function calculates mean differences and standard deviations for survival data -across different sample groups, considering at-risk samples at each unique event time. +This function calculates mean differences and standard +deviations for survival data across different sample groups, considering +at-risk samples at each unique event time. } \details{ -The function identifies unique times from the survival data and computes the mean difference -for each time point by comparing samples that experienced the event against those at risk. -It calculates the standard deviation based on the variation among at-risk samples. +The function identifies unique times from the survival data and +computes the mean difference for each time point by comparing samples that +experienced the event against those at risk. It calculates the standard +deviation based on the variation among at-risk samples. } diff --git a/man/testStatistic_with_covariates.Rd b/man/testStatistic_with_covariates.Rd index fafac65..754895d 100644 --- a/man/testStatistic_with_covariates.Rd +++ b/man/testStatistic_with_covariates.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LM_with_limma.r +% Please edit documentation in R/LM_with_limma.R \name{testStatistic_with_covariates} \alias{testStatistic_with_covariates} \title{Perform Linear Modeling with Covariates using Limma} @@ -14,40 +14,51 @@ testStatistic_with_covariates( ) } \arguments{ -\item{x}{A list containing two or more data matrices where rows represent features (e.g., genes, proteins) -and columns represent samples. The list should contain at least two matrices for pairwise group comparison.} +\item{x}{A list containing two or more data matrices where rows represent +features (e.g., genes, proteins) and columns represent samples. The list +should contain at least two matrices for pairwise group comparison.} -\item{group.name}{A character string indicating the name of the group variable in \code{meta.info} to be used -in the analysis.} +\item{group.name}{A character string indicating the name of the group +variable in \code{meta.info} to be used in the analysis.} -\item{meta.info}{A data frame containing the metadata for the samples. This includes sample grouping and any -covariates to be included in the model.} +\item{meta.info}{A data frame containing the metadata for the samples. +This includes sample grouping and any covariates to be included in the model.} -\item{formula.str}{A string specifying the formula to be used in model fitting. It should follow the standard -R formula syntax (e.g., \code{~ covariate1 + covariate2}).} +\item{formula.str}{A string specifying the formula to be used in model +fitting. It should follow the standard R formula syntax +(e.g., \code{~ covariate1 + covariate2}).} -\item{trend}{A logical value indicating whether to allow for an intensity-dependent trend in the prior variance.} +\item{trend}{A logical value indicating whether to allow for an +intensity-dependent trend in the prior variance.} -\item{robust}{A logical value indicating whether to use a robust fitting procedure to protect against outliers.} +\item{robust}{A logical value indicating whether to use a robust +fitting procedure to protect against outliers.} } \value{ A list containing the following elements: -\item{d}{A vector of the test statistics (log-fold changes or F-statistics) for each feature.} -\item{s}{A vector of the standard deviations for each feature, adjusted by the empirical Bayes procedure.} +\item{d}{A vector of the test statistics (log-fold changes or F-statistics) +for each feature.} +\item{s}{A vector of the standard deviations for each feature, adjusted by +the empirical Bayes procedure.} } \description{ -This function performs linear modeling using the Limma package while accounting for covariates specified -in the \code{meta.info}. It supports two-group comparisons and multi-group analysis, incorporating covariates +This function performs linear modeling using the Limma package while +accounting for covariates specified +in the \code{meta.info}. It supports two-group comparisons and multi-group +analysis, incorporating covariates through a design matrix. } \details{ -This function first combines the data matrices from different groups and prepares a design matrix based on -the covariates specified in \code{meta.info} using the provided formula. It fits a linear model using Limma, -computes contrasts between groups, and applies empirical Bayes moderation. For two-group comparisons, the -function returns log-fold changes and associated statistics. In multi-group settings with a single covariate, +This function first combines the data matrices from different groups and +prepares a design matrix based on the covariates specified in \code{meta.info} +using the provided formula. It fits a linear model using Limma, +computes contrasts between groups, and applies empirical Bayes moderation. +For two-group comparisons, the function returns log-fold changes and +associated statistics. In multi-group settings with a single covariate, it calculates pairwise contrasts and moderated F-statistics. } \seealso{ -\code{\link[limma]{lmFit}}, \code{\link[limma]{eBayes}}, \code{\link[limma]{topTable}}, +\code{\link[limma]{lmFit}}, \code{\link[limma]{eBayes}}, +\code{\link[limma]{topTable}}, \code{\link[limma]{makeContrasts}} } diff --git a/man/testStatistic_with_covariates_Fit.Rd b/man/testStatistic_with_covariates_Fit.Rd index f55f5af..d9995d9 100644 --- a/man/testStatistic_with_covariates_Fit.Rd +++ b/man/testStatistic_with_covariates_Fit.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LM_with_limma.fit.r +% Please edit documentation in R/LM_with_limma_fit.R \name{testStatistic_with_covariates_Fit} \alias{testStatistic_with_covariates_Fit} \title{Perform Linear Modeling with Covariates using Limma} @@ -14,45 +14,51 @@ testStatistic_with_covariates_Fit( ) } \arguments{ -\item{x}{A list containing two or more data matrices where rows represent features -(e.g., genes, proteins) and columns represent samples. The list should contain at least -two matrices for pairwise group comparison.} +\item{x}{A list containing two or more data matrices where rows represent +features (e.g., genes, proteins) and columns represent samples. The list +should contain at least two matrices for pairwise group comparison.} -\item{group.name}{A character string indicating the name of the group variable in -\code{meta.info} to be used in the analysis.} +\item{group.name}{A character string indicating the name of the group +variable in\code{meta.info} to be used in the analysis.} -\item{meta.info}{A data frame containing the metadata for the samples. This includes -sample grouping and any covariates to be included in the model.} +\item{meta.info}{A data frame containing the metadata for the samples. +This includes sample grouping and any covariates to be included in the model.} -\item{formula.str}{A string specifying the formula to be used in model fitting. It should -follow the standard R formula syntax (e.g., \code{~ covariate1 + covariate2}).} +\item{formula.str}{A string specifying the formula to be used in model +fitting. It should follow the standard R formula syntax +(e.g., \code{~ covariate1 + covariate2}).} -\item{trend}{A logical value indicating whether to allow for an intensity-dependent trend in -the prior variance.} +\item{trend}{A logical value indicating whether to allow for an +intensity-dependent trend in the prior variance.} -\item{robust}{A logical value indicating whether to use a robust fitting procedure to protect -against outliers.} +\item{robust}{A logical value indicating whether to use a robust fitting +procedure to protect against outliers.} } \value{ A list containing the following elements: -\item{d}{A vector of the test statistics (log-fold changes or F-statistics) for each feature.} -\item{s}{A vector of the standard deviations for each feature, adjusted by the empirical Bayes -procedure.} -\item{corrected.logfc}{The log-fold changes for each feature after fitting the model.} +\item{d}{A vector of the test statistics (log-fold changes or F-statistics) +for each feature.} +\item{s}{A vector of the standard deviations for each feature, adjusted by t +he empirical Bayes procedure.} +\item{corrected.logfc}{The log-fold changes for each feature after fitting +the model.} } \description{ -This function performs linear modeling using the Limma package, incorporating covariates -in the model fitting process. It is designed to handle both two-group comparisons and -multi-group settings with covariates. +This function performs linear modeling using the Limma package, incorporating +covariates in the model fitting process. It is designed to handle both +two-group comparisons and multi-group settings with covariates. } \details{ -This function combines the data matrices from different groups and fits a linear model -using covariates provided in the \code{meta.info}. For two-group comparisons, the function -computes contrasts between the two groups and applies empirical Bayes moderation. For -multi-group analysis with a single covariate, pairwise contrasts are computed, and the -moderated F-statistic is calculated for each feature. +This function combines the data matrices from different groups and fits a +linear model using covariates provided in the \code{meta.info}. For two-group +comparisons, the function computes contrasts between the two groups and +applies empirical Bayes moderation. For multi-group analysis with a single +covariate, pairwise contrasts are computed, and the moderated F-statistic is +calculated for each feature. } \seealso{ -\code{\link[limma]{lmFit}}, \code{\link[limma]{eBayes}}, \code{\link[limma]{topTable}}, +\code{\link[limma]{lmFit}}, +\code{\link[limma]{eBayes}}, +\code{\link[limma]{topTable}}, \code{\link[limma]{makeContrasts}} } diff --git a/man/testStatistic_with_covariates_permutating.Rd b/man/testStatistic_with_covariates_permutating.Rd index c78df5d..9a919c0 100644 --- a/man/testStatistic_with_covariates_permutating.Rd +++ b/man/testStatistic_with_covariates_permutating.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LM_with_limma.permuting.r +% Please edit documentation in R/LM_with_limma_permuting.R \name{testStatistic_with_covariates_permutating} \alias{testStatistic_with_covariates_permutating} \title{Perform Permutation-Based Linear Modeling with Covariates using Limma} @@ -15,43 +15,55 @@ testStatistic_with_covariates_permutating( ) } \arguments{ -\item{x}{A list containing two or more data matrices where rows represent features (e.g., genes, proteins) -and columns represent samples. The list should contain at least two matrices for pairwise group comparison.} +\item{x}{A list containing two or more data matrices where rows represent +features (e.g., genes, proteins) and columns represent samples. The list +should contain at least two matrices for pairwise group comparison.} -\item{group.name}{A character string indicating the name of the group variable in \code{meta.info} to be used -in the analysis.} +\item{group.name}{A character string indicating the name of the group +variable in \code{meta.info} to be used in the analysis.} -\item{meta.info}{A data frame containing the metadata for the samples. This includes sample grouping and any -covariates to be included in the model.} +\item{meta.info}{A data frame containing the metadata for the samples. +This includes sample grouping and any covariates to be included in the model.} -\item{formula.str}{A string specifying the formula to be used in model fitting. It should follow the standard -R formula syntax (e.g., \code{~ covariate1 + covariate2}).} +\item{formula.str}{A string specifying the formula to be used in model +fitting. It should follow the standard R formula syntax +(e.g., \code{~ covariate1 + covariate2}).} -\item{trend}{A logical value indicating whether to allow for an intensity-dependent trend in the prior variance.} +\item{trend}{A logical value indicating whether to allow for an +intensity-dependent trend in the prior variance.} -\item{robust}{A logical value indicating whether to use a robust fitting procedure to protect against outliers.} +\item{robust}{A logical value indicating whether to use a robust +fitting procedure to protect against outliers.} -\item{permutating.group}{Logical, If \code{TRUE}, the permutation for calculating the null distribution is performed by permuting the target group only specified in \code{group.name}. If FALSE, the entire \code{meta.info} will be permuted (recommended to be set to TRUE).} +\item{permutating.group}{Logical, If \code{TRUE}, the permutation for +calculating the null distribution is performed by permuting the target group +only specified in \code{group.name}. If FALSE, the entire \code{meta.info} +will be permuted (recommended to be set to TRUE).} } \value{ A list containing the following elements: -\item{d}{A vector of the test statistics (log-fold changes or F-statistics) for each feature.} -\item{s}{A vector of the standard deviations for each feature, adjusted by the empirical Bayes procedure.} +\item{d}{A vector of the test statistics (log-fold changes or F-statistics) +for each feature.} +\item{s}{A vector of the standard deviations for each feature, adjusted by +the empirical Bayes procedure.} } \description{ -This function performs linear modeling using the Limma package with permutation of the covariates -to evaluate the test statistics under random assignments. It handles two-group comparisons and -multi-group settings. +This function performs linear modeling using the Limma package with +permutation of the covariates to evaluate the test statistics under random +assignments. It handles two-group comparisons and multi-group settings. } \details{ -This function combines the data matrices from different groups and permutes the covariates from \code{meta.info} -before fitting a linear model using Limma. Permutation helps assess how the covariates behave under random -conditions, providing a null distribution of the test statistics. For two-group comparisons, the function -computes contrasts between the two groups and applies empirical Bayes moderation. For multi-group analysis -with a single covariate, pairwise contrasts are computed, and the moderated F-statistic is calculated for -each feature. +This function combines the data matrices from different groups and permutes +the covariates from \code{meta.info}before fitting a linear model using Limma. +Permutation helps assess how the covariates behave under random conditions, +providing a null distribution of the test statistics. For two-group +comparisons, the function computes contrasts between the two groups and +applies empirical Bayes moderation. For multi-group analysis with a single +covariate, pairwise contrasts are computed, and the moderated F-statistic is +calculated for each feature. } \seealso{ -\code{\link[limma]{lmFit}}, \code{\link[limma]{eBayes}}, \code{\link[limma]{topTable}}, +\code{\link[limma]{lmFit}}, \code{\link[limma]{eBayes}}, +\code{\link[limma]{topTable}}, \code{\link[limma]{makeContrasts}} } diff --git a/vignettes/LimROTS.Rmd b/vignettes/LimROTS.Rmd index e9cb8de..de28d94 100644 --- a/vignettes/LimROTS.Rmd +++ b/vignettes/LimROTS.Rmd @@ -1,5 +1,7 @@ --- -title: "LimROTS: A Hybrid Method Integrating Empirical Bayes and Reproducibility-Optimized Statistics for Robust Analysis of Proteomics and Metabolomics Data" +title: "LimROTS: A Hybrid Method Integrating Empirical Bayes and + Reproducibility-Optimized Statistics for Robust Analysis of Proteomics + and Metabolomics Data" output: BiocStyle::html_document: fig_height: 7 @@ -9,76 +11,151 @@ output: toc_depth: 3 number_sections: true vignette: > - %\VignetteIndexEntry{mia} + %\VignetteIndexEntry{LimROTS} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} -bibliography: "reference.bib" +bibliography: reference.bib --- ```{r, include = FALSE} knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" + collapse = TRUE, + comment = "#>" ) ``` # How LimROTS Work? -The **LimROTS** approach initially uses the limma package @limma to simulate the intensity data of proteins and metabolites. A linear model is subsequently fitted using the design matrix. Empirical Bayes variance shrinking is then implemented. To obtain the moderated t-statistics, the adjusted standard error (SEpost = √(s2.post) \* unscaled SD) for each feature is computed, along with the regression coefficient for each feature (indicating the impact of variations in the experimental settings). The ROTS approach @rots establishes optimality based on the largest overlap of top-ranked features within group-preserving bootstrap datasets (refer to @elo2008reproducibility for further information on the reproducibility-optimization), Finally based on the optimized parameters ${\alpha1}$ and ${\alpha2}$ this equation used to calculates the final statistics: $$t_{\alpha}(g) = \frac{\beta}{\alpha1 + \alpha2 \times SEpost}$$\ - -where $t_{\alpha}(g)$ is the final statistics for each feature, ${\beta}$ is the coefficient, and SEpost is the the adjusted standard error. LimROTS generates p-values from permutation samples using the implementation available in qvalue package @qvalue, along with internal implementation of FDR adapted from ROTS package @rots. Additionally, the `qvalue` package is used to calculate q-values, were the proportion of true null p-values is set to the bootstrap method. We recommend using permutation-derived p-values and qvalues. +The **LimROTS** approach initially uses the limma package @limma to simulate +the intensity data of proteins and metabolites. A linear model is subsequently +fitted using the design matrix. Empirical Bayes variance shrinking is then +implemented. To obtain the moderated t-statistics, the adjusted standard error +(SEpost = √(s2.post) \* unscaled SD) for each feature is computed, along with +the regression coefficient for each feature (indicating the impact of +variations in the experimental settings). The ROTS approach @rots establishes +optimality based on the largest overlap of top-ranked features within +group-preserving bootstrap datasets (refer to @elo2008reproducibility for +further information on the reproducibility-optimization), Finally based on the +optimized parameters ${\alpha1}$ and ${\alpha2}$ this equation used to +calculates the final statistics: $$t_{\alpha}(g) = \frac{\beta}{\alpha1 + +\alpha2 \times SEpost}$$\ + +where $t_{\alpha}(g)$ is the final statistics for each feature, ${\beta}$ is +the coefficient, and SEpost is the the adjusted standard error. LimROTS +generates p-values from permutation samples using the implementation available +in qvalue package @qvalue, along with internal implementation of FDR adapted +from ROTS package @rots. Additionally, the `qvalue` package is used to +calculate q-values, were the proportion of true null p-values is set to the +bootstrap method. We recommend using permutation-derived p-values and qvalues. # Computational Power -The number of samples, features, bootstrap iterations, and k, which denotes the top list size for ranking, are the four primary elements that determine the amount of computing resources required for the optimisation process in LimROTS. It is therefore advised to use at least 4 cores to execute LimROTS since it uses a parallel processing implementation for the bootstrapping step. The optimisation process is sequential and also time-consuming; it is planned to be modified in order to make the upcoming LimROTS version more palatable. +The number of samples, features, bootstrap iterations, and k, which denotes +the top list size for ranking, are the four primary elements that determine +the amount of computing resources required for the optimisation process in +LimROTS. It is therefore advised to use at least 4 cores to execute LimROTS +since it uses a parallel processing implementation for the bootstrapping step. +The optimisation process is sequential and also time-consuming; it is planned +to be modified in order to make the upcoming LimROTS version more palatable. # Parameter Description for `LimROTS` -`LimROTS` takes several parameters, and it should be called correctly to obtain the desired output. The parameters are divided into three different sets: +`LimROTS` takes several parameters, and it should be called correctly to obtain +the desired output. The parameters are divided into three different sets: 1. **Parameters for `LimROTS` Implementation** -2. **Parameters for the Original `ROTS` Implementation without `limma` Integration** +2. **Parameters for the Original `ROTS` Implementation +without `limma` Integration** 3. **Common Parameters** ## Common Parameters -- **`x`**: The input data, which can be a `SummarizedExperiment` object or a matrix where rows represent features (e.g., genes, proteins) and columns represent samples. The values should be log-transformed. -- **`B`**: An integer specifying the number of bootstrap iterations. Default is 1000. -- **`K`**: An optional integer representing the top list size for ranking. If not specified, it is set to one-fourth of the number of features. -- **`a1`**: Optional numeric value used in the optimization process. If defined by the user, no optimization occurs. -- **`a2`**: Optional numeric value used in the optimization process. If defined by the user, no optimization occurs [0,1]. -- **`log`**: Logical, indicating whether the data is already log-transformed. Default is `TRUE`. -- **`progress`**: Logical, indicating whether to display a progress bar during bootstrap sampling. Default is `FALSE`. -- **`verbose`**: Logical, indicating whether to display messages during the function's execution. Default is `TRUE`. -- **`meta.info`**: A data frame containing sample-level metadata, where each row corresponds to a sample. It should include the grouping variable specified in `group.name`. If `x` (data) is a `SummarizedExperiment` object, `meta.info` must be a vector of the metadata needed for the model to run and can be retrieved using `colData()`. -- **`group.name`**: A string specifying the column in `meta.info` that represents the groups or conditions for comparison. -- **`seed.cl`**: An integer specifying the seed for randomization; if not provided, the default is 1234. -- **`cluster`**: A parallel cluster object for distributed computation, e.g., created by `makeCluster()`. Default is 2. +- **`x`**: The input data, which can be a `SummarizedExperiment` object or a +matrix where rows represent features (e.g., genes, proteins) and columns +represent samples. The values should be log-transformed. +- **`B`**: An integer specifying the number of bootstrap iterations. +Default is 1000. +- **`K`**: An optional integer representing the top list size for ranking. +If not specified, it is set to one-fourth of the number of features. +- **`a1`**: Optional numeric value used in the optimization process. If +defined by the user, no optimization occurs. +- **`a2`**: Optional numeric value used in the optimization process. If +defined by the user, no optimization occurs [0,1]. +- **`log`**: Logical, indicating whether the data is already log-transformed. +Default is `TRUE`. +- **`progress`**: Logical, indicating whether to display a progress bar +during bootstrap sampling. Default is `FALSE`. +- **`verbose`**: Logical, indicating whether to display messages during the +function's execution. Default is `TRUE`. +- **`meta.info`**: A data frame containing sample-level metadata, where each +row corresponds to a sample. It should include the grouping variable specified +in `group.name`. If `x` (data) is a `SummarizedExperiment` object, `meta.info` +must be a vector of the metadata needed for the model to run and can be +retrieved using `colData()`. +- **`group.name`**: A string specifying the column in `meta.info` that +represents the groups or conditions for comparison. +- **`seed.cl`**: An integer specifying the seed for randomization; if not +provided, the default is 1234. +- **`cluster`**: A parallel cluster object for distributed computation, e.g., +created by `makeCluster()`. Default is 2. ## Parameters for `LimROTS` Implementation -- **`formula.str`**: A formula string used when covariates are present in `meta.info` for modeling. It should include "\~ 0 + ..." to exclude the intercept from the model. -- **`robust`**: Logical, indicating whether robust fitting should be used. Default is `TRUE`. -- **`trend`**: Logical, indicating whether to include trend fitting in the differential expression analysis. Default is `TRUE`. -- **permutating.group**: Logical, If `TRUE`, the permutation for calculating the null distribution is performed by permuting the target group only specified in `group.name`. If `FALSE`, the entire `meta.info` will be permuted (recommended to be set to TRUE). +- **`formula.str`**: A formula string used when covariates are present in +`meta.info` for modeling. It should include "\~ 0 + ..." to exclude the +intercept from the model. +- **`robust`**: Logical, indicating whether robust fitting should be used. +Default is `TRUE`. +- **`trend`**: Logical, indicating whether to include trend fitting in the +differential expression analysis. Default is `TRUE`. +- **permutating.group**: Logical, If `TRUE`, the permutation for calculating +the null distribution is performed by permuting the target group only specified +in `group.name`. If `FALSE`, the entire `meta.info` will be permuted +(recommended to be set to TRUE). ## Parameters for the Original `ROTS` Implementation -- **`survival`**: Logical, indicating whether to enable survival analysis. If `TRUE`, then `meta.info` should contain `time` and `event` columns. -- **`paired`**: Logical, indicating whether the data represent paired samples. Default is `FALSE`. -- **`n.ROTS`**: Logical. If `TRUE`, all parameters related to `LimROTS` will be ignored, and the original `ROTS` analysis will run. This must be `TRUE` when `survival` or `paired` is set to `TRUE`. +- **`survival`**: Logical, indicating whether to enable survival analysis. +If `TRUE`, then `meta.info` should contain `time` and `event` columns. +- **`paired`**: Logical, indicating whether the data represent paired samples. +Default is `FALSE`. +- **`n.ROTS`**: Logical. If `TRUE`, all parameters related to `LimROTS` will +be ignored, and the original `ROTS` analysis will run. This must be `TRUE` when +`survival` or `paired` is set to `TRUE`. # UPS1 Case Study -To demonstrate LimROTS' ability to detect true negatives complex scenarios, we are using a DIA proteomics data from a UPS1-spiked E. coli protein mixture @GOTTI2022107829 includes 48 samples: 24 samples analyzed with Spectronaut and another 24 analyzed with ScaffoldDIA software, with total of 1656 proteins. Eight different concentrations of UPS1 were used (0.1 to 50 fmol), grouped into two categories: low concentrations (0.1–2.5 fmol, labeled as Conc. 2, 12 Samples from each software) and high concentrations (5–50 fmol, labeled as Conc. 1, 12 Samples from each software). +To demonstrate LimROTS' ability to detect true negatives complex scenarios, +we are using a DIA proteomics data from a UPS1-spiked E. coli protein mixture +@GOTTI2022107829 includes 48 samples: 24 samples analyzed with Spectronaut and +another 24 analyzed with ScaffoldDIA software, with total of 1656 proteins. +Eight different concentrations of UPS1 were used (0.1 to 50 fmol), grouped into +two categories: low concentrations (0.1–2.5 fmol, labeled as Conc. 2, 12 Samples +from each software) and high concentrations (5–50 fmol, labeled as Conc. 1, 12 +Samples from each software). -A synthetic, unbalanced fake batches assigned to the samples. The assignment follows the ratio of: `c(rep("M", 9), rep("F", 3), rep("M", 3), rep("F", 9), rep("M", 9), rep("F", 3), rep("M", 3), rep("F", 9))`. +A synthetic, unbalanced fake batches assigned to the samples. The assignment +follows the ratio of: +`Here is the rewritten version with a 50-character limit per line: -Additionally, 100 E. coli proteins were randomly selected, and an effect size of 10 was added to each. The expected outcome is that only the UPS1 human proteins will be identified as truly significant, while none of the remaining proteins should show significant differences between the concentration groups.\ +```{r} +c(rep("M", 9), rep("F", 3), rep("M", 3), rep("F", +9), rep("M", 9), rep("F", 3), rep("M", 3), rep("F", +9)) +``` + +Additionally, 100 E. coli proteins were randomly selected, and an effect size +of 10 was added to each. The expected outcome is that only the UPS1 human +proteins will be identified as truly significant, while none of the remaining +proteins should show significant differences between the concentration groups.\ \ -This scenario resembles a real-world case where the experiment involves unbalanced batch assignments or, for instance, an uneven gender ratio among the samples. +This scenario resembles a real-world case where the experiment involves +unbalanced batch assignments or, for instance, an uneven gender ratio among the +samples. -LimROTS can take a SummarizedExperiment object with all the metadata needed to run the model. In this example we importing UPS1.Case4 data available in LimROTS. +LimROTS can take a SummarizedExperiment object with all the metadata needed to +run the model. In this example we importing UPS1.Case4 data available +in LimROTS. The original source of the dataset can be found here @GOTTI2022107829 @@ -97,7 +174,9 @@ print(UPS1.Case4) ### Run LimROTS -Although `seed.cl` internally control the seed randomization during the parallel bootstrapping process. It is advisable to establish a consistent seed prior to executing `LimROTS` +Although `seed.cl` internally control the seed randomization during the parallel +bootstrapping process. It is advisable to establish a consistent seed prior to +executing `LimROTS` ```{r} set.seed(1234, kind = "default") @@ -115,30 +194,50 @@ group.name <- "Conc." formula.str <- "~ 0 + Conc. + tool + fake.batch" # Formula for group comparison # Run LimROTS analysis with trend and robust settings enabled -limrots.result <- LimROTS(x = UPS1.Case4, +limrots.result <- LimROTS( + x = UPS1.Case4, B = B, K = K, meta.info = meta.info, cluster = cluster, group.name = group.name, - formula.str = formula.str, trend = TRUE, robust = TRUE) + formula.str = formula.str, trend = TRUE, robust = TRUE +) ``` -**NOTE:** "In this instance, we configure the number of bootstrap iterations (B) and the count of top-ranked features for reproducibility optimization (K) to 100 both, in order to minimize the example's run-time. For actual analyses, it is advisable to utilize a greater number of bootstraps (e.g., 1000). Also, for the number of cores to use we recommend to use at least 4 cores. +**NOTE:** "In this instance, we configure the number of bootstrap iterations +(B) and the count of top-ranked features for reproducibility optimization (K) +to 100 both, in order to minimize the example's run-time. For actual analyses, +it is advisable to utilize a greater number of bootstraps (e.g., 1000). Also, +for the number of cores to use we recommend to use at least 4 cores. ### Volcano Plot with ggplot2 -Utilising a Volcano plot and mapping the human UPS1 proteins at q-values of 1% and 5%, it is evident that LimROTS accurately identified the majority of actual positive proteins while detecting a limited number of simulated E. coli proteins. +Utilising a Volcano plot and mapping the human UPS1 proteins at q-values of 1%  +and 5%, it is evident that LimROTS accurately identified the majority of  +actual positive proteins while detecting a limited number of simulated E. coli  +proteins. ```{r} # Create a data frame from the LimROTS results -limrots.result.df <- data.frame(proteins = row.names(limrots.result$data), - LimROTS.FC = limrots.result$corrected.logf, q.value = limrots.result$q_values$qvalues) +limrots.result.df <- data.frame( + proteins = row.names(limrots.result$data), + LimROTS.FC = limrots.result$corrected.logf, + q.value = limrots.result$q_values$qvalues +) # Mark proteins as true positives (HUMAN UPS1 proteins) -limrots.result.df$TP <- ifelse(grepl("HUMAN", limrots.result.df$proteins), "HUMAN_TP", "ECOLI_FP") +limrots.result.df$TP <- ifelse(grepl("HUMAN", limrots.result.df$proteins), + "HUMAN_TP", "ECOLI_FP" +) # Create a volcano plot -ggplot(limrots.result.df, aes(x = LimROTS.FC, y = -log10(q.value), color = factor(TP))) + +ggplot(limrots.result.df, aes( + x = LimROTS.FC, y = -log10(q.value), + color = factor(TP) +)) + geom_point(alpha = 0.8) + theme_bw() + - labs(title = "Volcano Plot", x = "Log Fold Change", y = "-Log10 q.value", color = "True Positive") + + labs( + title = "Volcano Plot", x = "Log Fold Change", y = "-Log10 q.value", + color = "True Positive" + ) + scale_color_manual(values = c("grey", "red")) + geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "blue") + geom_hline(yintercept = -log10(0.01), linetype = "dashed", color = "black") @@ -146,16 +245,27 @@ ggplot(limrots.result.df, aes(x = LimROTS.FC, y = -log10(q.value), color = facto ### Quality Control Plots -LimROTS generates p-values from permutation samples, along with FDR. Additionally, the `qvalue` package is used to calculate q-values and Benjamini-Hochberg adjusted p-values based on the permutation-derived p-values. These can be used as Quality Control for the LimROTS results. We recommend using permutation-derived p-values and qvalues, though they should generally be very similar to the FDR and Benjamini-Hochberg adjusted p-values. +LimROTS generates p-values from permutation samples, along with FDR. +Additionally, the `qvalue` package is used to calculate q-values and +Benjamini-Hochberg adjusted p-values based on the permutation-derived p-values. +These can be used as Quality Control for the LimROTS results. We recommend +using permutation-derived p-values and qvalues, though they should generally +be very similar to the FDR and Benjamini-Hochberg adjusted p-values. ```{r results="hide" , message=FALSE, warning=FALSE} ## Quality Control Plots # Plot of q-values -plot(limrots.result$q_values, main = "Q-values", xlab = "Index", ylab = "Q-value") +plot(limrots.result$q_values, + main = "Q-values", xlab = "Index", + ylab = "Q-value" +) # Histogram of q-values -hist(limrots.result$q_values, main = "Q-value Distribution", xlab = "Q-value", col = "lightgreen", border = "white") +hist(limrots.result$q_values, + main = "Q-value Distribution", + xlab = "Q-value", col = "lightgreen", border = "white" +) # Summary of q-values summary(limrots.result$q_values)