From 313f5060d6a27d752ab46f8cf5c598ba7ba3535b Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 28 May 2020 09:04:40 -0400 Subject: [PATCH 1/5] Create write_gct.R --- R/write_gct.R | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 R/write_gct.R diff --git a/R/write_gct.R b/R/write_gct.R new file mode 100644 index 0000000..79fcef2 --- /dev/null +++ b/R/write_gct.R @@ -0,0 +1,127 @@ +#' Write CellProfiler data to gct file. +#' +#' @param x ... +#' @param path ... +#' @param channels ... +#' +#' +#' @return The input \code{x}, invisibly. +#' +write_gct <- function(x, path, channels = NULL, feature_regex = "^Nuclei_|^Cells_|^Cytoplasm_", create_row_annotations = TRUE) { + stopifnot(is.data.frame(x)) + path <- normalizePath(path, mustWork = FALSE) + + if(file.exists(path)) { + file.remove(path) + } + + # id is hash of metadata columns + x %<>% + tidyr::unite("id", matches("Metadata_"), remove = F) %>% + dplyr::rowwise() %>% + dplyr::mutate(id = digest::digest(id)) %>% + dplyr::ungroup() + + # change hash to an sequential id because some sig tools fail if not + x %<>% + dplyr::mutate(id = paste0("SAMPLE_", dense_rank(id))) + + # write.gctx does not handle Date + x %<>% dplyr::mutate_if(is.numeric.Date, as.character) + + feature_cols <- + colnames(x) %>% + stringr::str_subset(feature_regex) + + measurements <- + x %>% + dplyr::select_(.dots = feature_cols) %>% + data.matrix() %>% + t() + + column_annotations <- + x %>% + dplyr::select(matches("^id$|^Metadata_")) + + row_annotations <- + tibble::data_frame(cp_feature_name = row.names(measurements)) + + if (create_row_annotations) { + row_annotations %<>% + tidyr::separate(col = "cp_feature_name", + into = c("compartment", "feature_group", "feature_name"), + sep = "_", extra = "merge", remove = FALSE) + } + + if (!is.null(channels)) { + # get all combinations of channels + channels <- stringr::str_split(channels, ",")[[1]] + + channels <- c(as.vector(outer(channels, channels, FUN = paste, sep = "_")), + channels) + + # get channel name + channel_name <- function(feature_name) { + name <- channels[which(stringr::str_detect(feature_name, channels))[1]] + + if (is.na(name)) { + name <- "None" + } + + name + } + + # add channel name to row annotations + row_annotations %<>% + dplyr::rowwise() %>% + dplyr::mutate(channel_name = channel_name(feature_name)) %>% + ungroup() + } + + column_annotations_df <- + column_annotations %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column() %>% + dplyr::mutate(rowname = stringr::str_replace(rowname, "Metadata_", "")) + + filler <- row_annotations %>% slice(0) + filler[1,] <- colnames(filler) + filler[2:nrow(column_annotations_df),] <- NA + + column_annotations_df <- + dplyr::bind_cols(column_annotations_df[1], + filler, + column_annotations_df[2:ncol(column_annotations_df)] + ) + + measurements_df <- + measurements %>% + as.data.frame() %>% + tibble::rownames_to_column() + + measurements_df <- + dplyr::bind_cols(measurements_df[1], + row_annotations, + measurements_df[2:ncol(measurements_df)] + ) + + f <- file(path, "w") + + writeLines("#1.3", con = f) + + writeLines(sprintf("%d\t%d\t%d\t%d", + nrow(measurements), + ncol(measurements), + ncol(row_annotations), + ncol(column_annotations) - 1), + con = f) + + close(f) + + readr::write_tsv(x = column_annotations_df, path = path, append = TRUE, col_names = FALSE) + + readr::write_tsv(x = measurements_df, path = path, append = TRUE) + + invisible(x) +} From 05c94348fa2428c52b843b9cb6bf28a7c023c35e Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 28 May 2020 09:11:56 -0400 Subject: [PATCH 2/5] Update docs --- DESCRIPTION | 2 +- man/aggregate.Rd | 12 +++++++++--- man/annotate.Rd | 13 ++++++++++--- man/normalize.Rd | 18 +++++++++++++----- man/preselect.Rd | 13 ++++++++++--- man/sample.Rd | 3 +-- 6 files changed, 44 insertions(+), 17 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b58287f..fd0a868 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,4 +33,4 @@ Imports: stringr (>= 1.2.0) Remotes: cytomining/cytominer VignetteBuilder: knitr -RoxygenNote: 6.1.1 +RoxygenNote: 7.1.0 diff --git a/man/aggregate.Rd b/man/aggregate.Rd index b11da6d..e8b608b 100644 --- a/man/aggregate.Rd +++ b/man/aggregate.Rd @@ -4,10 +4,16 @@ \alias{aggregate} \title{Aggregate data based on given grouping.} \usage{ -aggregate(sqlite_file, output_file, compartments = c("cells", - "cytoplasm", "nuclei"), operation = "mean", +aggregate( + sqlite_file, + output_file, + compartments = c("cells", "cytoplasm", "nuclei"), + operation = "mean", strata = c("Metadata_Plate", "Metadata_Well"), - image_variables = NULL, variables = "all", univariate = TRUE) + image_variables = NULL, + variables = "all", + univariate = TRUE +) } \arguments{ \item{sqlite_file}{SQLite database storing morphological profiles.} diff --git a/man/annotate.Rd b/man/annotate.Rd index aa19ca7..5fc0c5d 100644 --- a/man/annotate.Rd +++ b/man/annotate.Rd @@ -4,9 +4,16 @@ \alias{annotate} \title{Add plate and well metadata.} \usage{ -annotate(batch_id, plate_id, cell_id = NULL, external_metadata = NULL, - format_broad_cmap = FALSE, output = NULL, - perturbation_mode = "chemical", workspace_dir = ".") +annotate( + batch_id, + plate_id, + cell_id = NULL, + external_metadata = NULL, + format_broad_cmap = FALSE, + output = NULL, + perturbation_mode = "chemical", + workspace_dir = "." +) } \arguments{ \item{batch_id}{Batch ID.} diff --git a/man/normalize.Rd b/man/normalize.Rd index 02210c8..8859be3 100644 --- a/man/normalize.Rd +++ b/man/normalize.Rd @@ -4,14 +4,22 @@ \alias{normalize} \title{Normalize profiles based on given control subset} \usage{ -normalize(input_file = NULL, output_file = NULL, subset = NULL, - sample_single_cell = FALSE, input_sqlite_file = NULL, +normalize( + input_file = NULL, + output_file = NULL, + subset = NULL, + sample_single_cell = FALSE, + input_sqlite_file = NULL, compartments = c("cells", "cytoplasm", "nuclei"), - operation = "robustize", strata = c("Metadata_Plate"), - batch_id = NULL, plate_id = NULL, workspace_dir = ".", + operation = "robustize", + strata = c("Metadata_Plate"), + batch_id = NULL, + plate_id = NULL, + workspace_dir = ".", image_object_join_columns = c("TableNumber", "ImageNumber"), well_unique_id_columns = c("Metadata_Plate", "Metadata_Well"), - well_unique_id_columns_db_prefix = "") + well_unique_id_columns_db_prefix = "" +) } \arguments{ \item{input_file}{Input file with profiles to be normalized. If \code{NULL}, reads from \code{workspace_dir/backend/batch_id/plate_id/plate_id_augmented.csv}. default: \code{NULL}.} diff --git a/man/preselect.Rd b/man/preselect.Rd index 3557789..0d8bb07 100644 --- a/man/preselect.Rd +++ b/man/preselect.Rd @@ -4,9 +4,16 @@ \alias{preselect} \title{Create a list of variables using various variable selection methods} \usage{ -preselect(input, operations, replicates = NULL, batch_id = NULL, - subset = NULL, cores = NULL, output_dir = NULL, - workspace_dir = ".") +preselect( + input, + operations, + replicates = NULL, + batch_id = NULL, + subset = NULL, + cores = NULL, + output_dir = NULL, + workspace_dir = "." +) } \arguments{ \item{input}{Test data on which to perform variable selection operations. Must be CSV, rds, or feather.} diff --git a/man/sample.Rd b/man/sample.Rd index 70ffdfc..c1e5e2c 100644 --- a/man/sample.Rd +++ b/man/sample.Rd @@ -4,8 +4,7 @@ \alias{sample} \title{Sample selects replicates across specified plates.} \usage{ -sample(batch_id, pattern, output, replicates = NULL, - workspace_dir = ".") +sample(batch_id, pattern, output, replicates = NULL, workspace_dir = ".") } \arguments{ \item{batch_id}{Batch ID.} From 810e916e0b9ac0aac76e3efe75a338070688970f Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 28 May 2020 09:12:08 -0400 Subject: [PATCH 3/5] Update docs --- man/write_gct.Rd | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 man/write_gct.Rd diff --git a/man/write_gct.Rd b/man/write_gct.Rd new file mode 100644 index 0000000..66b2fee --- /dev/null +++ b/man/write_gct.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/write_gct.R +\name{write_gct} +\alias{write_gct} +\title{Write CellProfiler data to gct file.} +\usage{ +write_gct( + x, + path, + channels = NULL, + feature_regex = "^Nuclei_|^Cells_|^Cytoplasm_", + create_row_annotations = TRUE +) +} +\value{ +The input \code{x}, invisibly. +} +\description{ +@param x ... + @param path ... + @param channels ... +} From 8e6c0339d042892f96681b7dfc53e5ee4eea803d Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 28 May 2020 09:12:53 -0400 Subject: [PATCH 4/5] Format code --- R/write_gct.R | 214 +++++++++++++++++++++++++++----------------------- 1 file changed, 116 insertions(+), 98 deletions(-) diff --git a/R/write_gct.R b/R/write_gct.R index 79fcef2..143428b 100644 --- a/R/write_gct.R +++ b/R/write_gct.R @@ -7,121 +7,139 @@ #' #' @return The input \code{x}, invisibly. #' -write_gct <- function(x, path, channels = NULL, feature_regex = "^Nuclei_|^Cells_|^Cytoplasm_", create_row_annotations = TRUE) { - stopifnot(is.data.frame(x)) - path <- normalizePath(path, mustWork = FALSE) - - if(file.exists(path)) { - file.remove(path) - } +write_gct <- + function(x, + path, + channels = NULL, + feature_regex = "^Nuclei_|^Cells_|^Cytoplasm_", + create_row_annotations = TRUE) { + stopifnot(is.data.frame(x)) + path <- normalizePath(path, mustWork = FALSE) + + if (file.exists(path)) { + file.remove(path) + } - # id is hash of metadata columns - x %<>% - tidyr::unite("id", matches("Metadata_"), remove = F) %>% - dplyr::rowwise() %>% - dplyr::mutate(id = digest::digest(id)) %>% - dplyr::ungroup() + # id is hash of metadata columns + x %<>% + tidyr::unite("id", matches("Metadata_"), remove = F) %>% + dplyr::rowwise() %>% + dplyr::mutate(id = digest::digest(id)) %>% + dplyr::ungroup() - # change hash to an sequential id because some sig tools fail if not - x %<>% - dplyr::mutate(id = paste0("SAMPLE_", dense_rank(id))) + # change hash to an sequential id because some sig tools fail if not + x %<>% + dplyr::mutate(id = paste0("SAMPLE_", dense_rank(id))) - # write.gctx does not handle Date - x %<>% dplyr::mutate_if(is.numeric.Date, as.character) + # write.gctx does not handle Date + x %<>% dplyr::mutate_if(is.numeric.Date, as.character) - feature_cols <- - colnames(x) %>% - stringr::str_subset(feature_regex) + feature_cols <- + colnames(x) %>% + stringr::str_subset(feature_regex) - measurements <- - x %>% - dplyr::select_(.dots = feature_cols) %>% - data.matrix() %>% - t() + measurements <- + x %>% + dplyr::select_(.dots = feature_cols) %>% + data.matrix() %>% + t() - column_annotations <- - x %>% - dplyr::select(matches("^id$|^Metadata_")) + column_annotations <- + x %>% + dplyr::select(matches("^id$|^Metadata_")) - row_annotations <- - tibble::data_frame(cp_feature_name = row.names(measurements)) + row_annotations <- + tibble::data_frame(cp_feature_name = row.names(measurements)) - if (create_row_annotations) { + if (create_row_annotations) { row_annotations %<>% - tidyr::separate(col = "cp_feature_name", - into = c("compartment", "feature_group", "feature_name"), - sep = "_", extra = "merge", remove = FALSE) - } + tidyr::separate( + col = "cp_feature_name", + into = c("compartment", "feature_group", "feature_name"), + sep = "_", + extra = "merge", + remove = FALSE + ) + } + + if (!is.null(channels)) { + # get all combinations of channels + channels <- stringr::str_split(channels, ",")[[1]] - if (!is.null(channels)) { - # get all combinations of channels - channels <- stringr::str_split(channels, ",")[[1]] + channels <- + c(as.vector(outer( + channels, channels, FUN = paste, sep = "_" + )), + channels) - channels <- c(as.vector(outer(channels, channels, FUN = paste, sep = "_")), - channels) + # get channel name + channel_name <- function(feature_name) { + name <- + channels[which(stringr::str_detect(feature_name, channels))[1]] - # get channel name - channel_name <- function(feature_name) { - name <- channels[which(stringr::str_detect(feature_name, channels))[1]] + if (is.na(name)) { + name <- "None" + } - if (is.na(name)) { - name <- "None" + name } - name + # add channel name to row annotations + row_annotations %<>% + dplyr::rowwise() %>% + dplyr::mutate(channel_name = channel_name(feature_name)) %>% + ungroup() } - # add channel name to row annotations - row_annotations %<>% - dplyr::rowwise() %>% - dplyr::mutate(channel_name = channel_name(feature_name)) %>% - ungroup() - } - - column_annotations_df <- - column_annotations %>% - t() %>% - as.data.frame() %>% - tibble::rownames_to_column() %>% - dplyr::mutate(rowname = stringr::str_replace(rowname, "Metadata_", "")) - - filler <- row_annotations %>% slice(0) - filler[1,] <- colnames(filler) - filler[2:nrow(column_annotations_df),] <- NA - - column_annotations_df <- - dplyr::bind_cols(column_annotations_df[1], - filler, - column_annotations_df[2:ncol(column_annotations_df)] + column_annotations_df <- + column_annotations %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column() %>% + dplyr::mutate(rowname = stringr::str_replace(rowname, "Metadata_", "")) + + filler <- row_annotations %>% slice(0) + filler[1, ] <- colnames(filler) + filler[2:nrow(column_annotations_df), ] <- NA + + column_annotations_df <- + dplyr::bind_cols(column_annotations_df[1], + filler, + column_annotations_df[2:ncol(column_annotations_df)]) + + measurements_df <- + measurements %>% + as.data.frame() %>% + tibble::rownames_to_column() + + measurements_df <- + dplyr::bind_cols(measurements_df[1], + row_annotations, + measurements_df[2:ncol(measurements_df)]) + + f <- file(path, "w") + + writeLines("#1.3", con = f) + + writeLines(sprintf( + "%d\t%d\t%d\t%d", + nrow(measurements), + ncol(measurements), + ncol(row_annotations), + ncol(column_annotations) - 1 + ), + con = f) + + close(f) + + readr::write_tsv( + x = column_annotations_df, + path = path, + append = TRUE, + col_names = FALSE ) - measurements_df <- - measurements %>% - as.data.frame() %>% - tibble::rownames_to_column() - - measurements_df <- - dplyr::bind_cols(measurements_df[1], - row_annotations, - measurements_df[2:ncol(measurements_df)] - ) - - f <- file(path, "w") - - writeLines("#1.3", con = f) + readr::write_tsv(x = measurements_df, path = path, append = TRUE) - writeLines(sprintf("%d\t%d\t%d\t%d", - nrow(measurements), - ncol(measurements), - ncol(row_annotations), - ncol(column_annotations) - 1), - con = f) - - close(f) - - readr::write_tsv(x = column_annotations_df, path = path, append = TRUE, col_names = FALSE) - - readr::write_tsv(x = measurements_df, path = path, append = TRUE) - - invisible(x) -} + invisible(x) + } From e2628c6d81fb6cc6cd6cbf06e64edc8ee4e4fbdf Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 15 Nov 2020 13:05:41 -0500 Subject: [PATCH 5/5] unverified changes --- NAMESPACE | 1 + R/write_gct.R | 16 +++++++++------- man/write_gct.Rd | 6 +++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e529dd8..2aa0ea3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(normalize) export(preselect) export(regularize) export(sample) +export(write_gct) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(magrittr,extract2) diff --git a/R/write_gct.R b/R/write_gct.R index 143428b..63f42b2 100644 --- a/R/write_gct.R +++ b/R/write_gct.R @@ -1,12 +1,14 @@ #' Write CellProfiler data to gct file. #' -#' @param x ... -#' @param path ... -#' @param channels ... -#' +#' @param x A data frame of CellProfiler readouts to write to disk +#' @param path Path or connection to write to +#' @param channels Image channels present in the CellProfiler data frame #' #' @return The input \code{x}, invisibly. #' +#' @importFrom magrittr %<>% +#' @importFrom magrittr %>% +#' @export write_gct <- function(x, path, @@ -22,7 +24,7 @@ write_gct <- # id is hash of metadata columns x %<>% - tidyr::unite("id", matches("Metadata_"), remove = F) %>% + tidyr::unite("id", dplyr::matches("Metadata_"), remove = F) %>% dplyr::rowwise() %>% dplyr::mutate(id = digest::digest(id)) %>% dplyr::ungroup() @@ -46,7 +48,7 @@ write_gct <- column_annotations <- x %>% - dplyr::select(matches("^id$|^Metadata_")) + dplyr::select(dplyr::matches("^id$|^Metadata_")) row_annotations <- tibble::data_frame(cp_feature_name = row.names(measurements)) @@ -98,7 +100,7 @@ write_gct <- tibble::rownames_to_column() %>% dplyr::mutate(rowname = stringr::str_replace(rowname, "Metadata_", "")) - filler <- row_annotations %>% slice(0) + filler <- row_annotations %>% dplyr::slice(0) filler[1, ] <- colnames(filler) filler[2:nrow(column_annotations_df), ] <- NA diff --git a/man/write_gct.Rd b/man/write_gct.Rd index 66b2fee..5549ce7 100644 --- a/man/write_gct.Rd +++ b/man/write_gct.Rd @@ -16,7 +16,7 @@ write_gct( The input \code{x}, invisibly. } \description{ -@param x ... - @param path ... - @param channels ... +@param x A data frame of CellProfiler readouts to write to disk + @param path Path or connection to write to + @param channels Image channels present in the CellProfiler data frame }