diff --git a/.travis.yml b/.travis.yml index 4f46c5f5..b89dfd96 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ warnings_are_errors: true language: r before_install: - sudo apt-get install -y libudunits2-dev + - sudo apt-get install -y gdal-bin r: - bioc-devel - bioc-release diff --git a/DESCRIPTION b/DESCRIPTION index 8144a567..90af2308 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: monocle3 Title: Clustering, differential expression, and trajectory analysis for single- cell RNA-Seq -Version: 0.1.1 +Version: 0.1.2 Authors@R: person(given = "Hannah", family = "Pliner", diff --git a/NAMESPACE b/NAMESPACE index 258dcdd1..8b222ec8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(choose_graph_segments) export(cluster_cells) export(clusters) export(coefficient_table) +export(combine_cds) export(compare_models) export(detect_genes) export(estimate_size_factors) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 00000000..b3c7a3a8 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,9 @@ +# monocle3 0.1.2 + +### Changes +* Added a `NEWS.md` file to track changes to the package. +* Added a combine_cds function that will combine cell_data_sets . +* Added a message when duplicate markers are present in generate_garnett_marker_file and option to exclude them. + +### Bug fixes +* Fixed a bug that made expression matrix the wrong class with load_cellranger. diff --git a/R/cell_data_set.R b/R/cell_data_set.R index 04c4e22b..dcf7de65 100644 --- a/R/cell_data_set.R +++ b/R/cell_data_set.R @@ -110,7 +110,7 @@ new_cell_data_set <- function(expression_data, cds <- methods::new("cell_data_set", assays = SummarizedExperiment::Assays( - list(counts=expression_data)), + list(counts=as(expression_data, "dgCMatrix"))), colData = colData(sce), int_elementMetadata =sce@int_elementMetadata, int_colData = sce@int_colData, diff --git a/R/find_markers.R b/R/find_markers.R index c93e02fa..f468eedc 100644 --- a/R/find_markers.R +++ b/R/find_markers.R @@ -338,24 +338,55 @@ test_marker_for_cell_group = function(gene_id, cell_group, cell_group_df, cds, #' "./marker_file.txt". #' @param max_genes_per_group Numeric, the maximum number of genes to output #' per cell type entry. Default is 10. +#' @param remove_duplicate_genes Logical indicating whether marker genes that +#' mark multiple cell groups should be excluded. Default is FALSE. When +#' FALSE, a message will be emitted when duplicates are present. #' #' @return None, marker file is written to \code{file} parameter location. #' @export #' generate_garnett_marker_file <- function(marker_test_res, file = "./marker_file.txt", - max_genes_per_group = 10) { + max_genes_per_group = 10, + remove_duplicate_genes = FALSE) { marker_test_res <- as.data.frame(marker_test_res) if(is.null(marker_test_res$group_name)) { marker_test_res$group_name <- paste("Cell type", marker_test_res$cell_group) } group_list <- unique(marker_test_res$group_name) - #good_markers <- marker_test_res[marker_test_res$marker_test_q_value <= qval_cutoff,] - good_markers = marker_test_res %>% dplyr::group_by(group_name) %>% dplyr::top_n(max_genes_per_group, marker_score) + good_markers <- marker_test_res %>% dplyr::group_by(group_name) %>% + dplyr::top_n(max_genes_per_group, marker_score) + dups <- good_markers$gene_id[duplicated(good_markers$gene_id)] + + if ("gene_short_name" %in% colnames(good_markers)) { + dups_gsn <- good_markers$gene_short_name[duplicated(good_markers$gene_short_name)] + } + + + if(remove_duplicate_genes) { + good_markers <- good_markers[!good_markers$gene_id %in% dups,] + } else { + if (length(dups) > 0) { + if("gene_short_name" %in% colnames(good_markers)) { + message(paste("The following marker genes mark multiple cell groups.", + "Prior to using Garnett, we recommend either excluding", + "these genes using remove_duplicate_genes = TRUE, or", + "modifying your marker file to make the cell types with", + "the shared marker subtypes in a hierarchy.", + paste(dups_gsn, collapse = ", "))) + } else { + message(paste("The following marker genes mark multiple cell groups.", + "Prior to using Garnett, we recommend either excluding", + "these genes using remove_duplicate_genes = TRUE, or", + "modifying your marker file to make the cell types with", + "the shared marker subtypes in a hierarchy.", + paste(dups, collapse = ", "))) + } + } + } - #good_markers = good_markers %>% dplyr::ungr output <- list() for (group in group_list) { diff --git a/R/utils.R b/R/utils.R index 18b733d5..473e09a0 100644 --- a/R/utils.R +++ b/R/utils.R @@ -569,3 +569,135 @@ load_mtx_data <- function(mat_path, cds <- cds[,colData(cds)$n.umi >= umi_cutoff] return(cds) } + + +#' Combine a list of cell_data_set objects +#' +#' This function will combine a list of cell_data_set objects into a new +#' cell_data_set object. +#' +#' @param cds_list List of cds objects to be combined. +#' @param keep_all_genes Logical indicating what to do if there is a mismatch +#' in the gene sets of the CDSs. If TRUE, all genes are kept and cells from +#' CDSs missing a given gene will be filled in with zeroes. If FALSE, only +#' the genes in common among all of the CDSs will be kept. Default is TRUE. +#' @param cell_names_unique Logical indicating whether all of the cell IDs +#' across all of the CDSs are unique. If FALSE, the CDS name is appended to +#' each cell ID to prevent collisions. Default is FALSE. +#' +#' @return A combined cell_data_set object. +#' @export +#' +combine_cds <- function(cds_list, + keep_all_genes = TRUE, + cell_names_unique = FALSE) { + + assertthat::assert_that(is.list(cds_list), + msg=paste("cds_list must be a list.")) + + assertthat::assert_that(all(sapply(cds_list, class) == "cell_data_set"), + msg=paste("All members of cds_list must be", + "cell_data_set class.")) + + if(length(cds_list) == 1) return(cds_list[[1]]) + + assertthat::assert_that(is.logical(keep_all_genes)) + assertthat::assert_that(is.logical(cell_names_unique)) + + list_named <- TRUE + if(is.null(names(cds_list))) { + list_named <- FALSE + } + + exprs_list <- list() + fd_list <- list() + pd_list <- list() + gene_list <- c() + overlap_list <- c(row.names(fData(cds_list[[1]]))) + pdata_cols <- c() + fdata_cols <- c() + all_cells <- c() + + for(cds in cds_list) { + gene_list <- c(gene_list, row.names(fData(cds))) + overlap_list <- intersect(overlap_list, row.names(fData(cds))) + if (!keep_all_genes) { + gene_list <- overlap_list + } + + pdata_cols <- c(pdata_cols, names(pData(cds))) + fdata_cols <- c(fdata_cols, names(fData(cds))) + all_cells <- c(all_cells, row.names(pData(cds))) + } + + gene_list <- unique(gene_list) + if(length(overlap_list) == 0) { + if (keep_all_genes) { + warning(paste("No genes are shared amongst all the CDS objects.")) + } else { + stop(paste("No genes are shared amongst all the CDS objects. To generate", + "a combined CDS with all genes, use keep_all_genes = TRUE")) + } + } + pdata_cols <- unique(pdata_cols) + fdata_cols <- unique(fdata_cols) + if (sum(duplicated(all_cells)) != 0 & cell_names_unique) { + stop(paste("Cell names are not unique across CDSs - cell_names_unique", + "must be TRUE.")) + } + all_cells <- unique(all_cells) + for(i in 1:length(cds_list)) { + pd <- as.data.frame(pData(cds_list[[i]])) + exp <- exprs(cds_list[[i]]) + exp <- exp[intersect(row.names(exp), gene_list),] + if (!cell_names_unique) { + if(list_named) { + row.names(pd) <- paste(row.names(pd), names(cds_list)[[i]], sep="_") + pd$sample <- names(cds_list)[[i]] + } else { + row.names(pd) <- paste(row.names(pd), i, sep="_") + pd$sample <- i + } + colnames(exp) <- row.names(pd) + } + not_in <- pdata_cols[!pdata_cols %in% names(pd)] + for (n in not_in) { + pd[,n] <- NA + } + + fd <- as.data.frame(fData(cds_list[[i]])) + fd <- fd[intersect(row.names(fd), gene_list),] + not_in <- fdata_cols[!fdata_cols %in% names(fd)] + for (n in not_in) { + fd[,n] <- NA + } + not_in_g <- gene_list[!gene_list %in% row.names(fd)] + for (n in not_in_g) { + fd[n,] <- NA + } + if (length(not_in_g) > 0) { + extra_rows <- Matrix::Matrix(0, ncol=ncol(exp), + sparse=TRUE, + nrow=length(not_in_g)) + row.names(extra_rows) <- not_in_g + colnames(extra_rows) <- colnames(exp) + exp <- rbind(exp, extra_rows) + } + + + exprs_list[[i]] <- exp + fd_list[[i]] <- fd + pd_list[[i]] <- pd + + } + + all_fd <- do.call(cbind, fd_list) + all_fd <- all_fd[,fdata_cols] + all_pd <- do.call(rbind, pd_list) + all_exp <- do.call(cbind, exprs_list) + + all_exp <- all_exp[row.names(all_fd), row.names(all_pd)] + + + new_cell_data_set(all_exp, cell_metadata = all_pd, gene_metadata = all_fd) +} diff --git a/examples/c_elegans_L2.R b/examples/c_elegans_L2.R index 1f12f49d..cc582c70 100644 --- a/examples/c_elegans_L2.R +++ b/examples/c_elegans_L2.R @@ -35,7 +35,7 @@ cds = preprocess_cds(cds, num_dim = 100, residual_model_formula_str = "~ plate") cds = reduce_dimension(cds, umap.fast_sgd=FALSE, cores=1) plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE) + ggsave("L2_umap_corrected_plate.png", width=5, height=4, dpi = 600) -# exmaple of using t-SNE: +# example of using t-SNE: cds = reduce_dimension(cds, reduction_method="tSNE") plot_cells(cds, reduction_method="tSNE", color_cells_by="cao_cell_type") + ggsave("L2_tsne_corrected_cao_type.png", width=5, height=4, dpi = 600) @@ -205,8 +205,8 @@ garnett_markers = assigned_type_marker_test_res %>% filter(marker_test_q_value < 0.01 & specificity >= 0.5) %>% group_by(cell_group) %>% top_n(5, marker_score) -garnett_markers = garnett_markers %>% group_by(gene_short_name) %>% - filter(n() == 1) +garnett_markers = garnett_markers %>% group_by(gene_short_name) %>% + filter(n() == 1) plot_genes_by_group(cds, unique(as.character(garnett_markers$gene_id)), @@ -234,8 +234,8 @@ cds = classify_cells(cds, worm_classifier, cluster_extend = TRUE, cds_gene_id_type = "ENSEMBL") -plot_cells(cds, - group_cells_by="partition", +plot_cells(cds, + group_cells_by="partition", color_cells_by="cluster_ext_type") + ggsave("L2_umap_corrected_garnett_ext_type_our_annotations.png", width=5, height=4, dpi = 600) # Plot the overlap between clusters and annotated cell types: @@ -250,8 +250,8 @@ cds = classify_cells(cds, ceWhole, cluster_extend = TRUE, cds_gene_id_type = "ENSEMBL") -plot_cells(cds, - group_cells_by="cluster", +plot_cells(cds, + group_cells_by="cluster", color_cells_by="cluster_ext_type") + ggsave("L2_umap_corrected_garnett_ext_type_cao_annoutations.png", width=5, height=4, dpi = 600) # Plot the overlap between clusters and annotated cell types: diff --git a/man/combine_cds.Rd b/man/combine_cds.Rd new file mode 100644 index 00000000..ca3c910c --- /dev/null +++ b/man/combine_cds.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{combine_cds} +\alias{combine_cds} +\title{Combine a list of cell_data_set objects} +\usage{ +combine_cds(cds_list, keep_all_genes = TRUE, cell_names_unique = FALSE) +} +\arguments{ +\item{cds_list}{List of cds objects to be combined.} + +\item{keep_all_genes}{Logical indicating what to do if there is a mismatch +in the gene sets of the CDSs. If TRUE, all genes are kept and cells from +CDSs missing a given gene will be filled in with zeroes. If FALSE, only +the genes in common among all of the CDSs will be kept. Default is TRUE.} + +\item{cell_names_unique}{Logical indicating whether all of the cell IDs +across all of the CDSs are unique. If FALSE, the CDS name is appended to +each cell ID to prevent collisions. Default is FALSE.} +} +\value{ +A combined cell_data_set object. +} +\description{ +This function will combine a list of cell_data_set objects into a new +cell_data_set object. +} diff --git a/man/generate_garnett_marker_file.Rd b/man/generate_garnett_marker_file.Rd index f4af8744..a1356bfd 100644 --- a/man/generate_garnett_marker_file.Rd +++ b/man/generate_garnett_marker_file.Rd @@ -5,7 +5,7 @@ \title{Generate a Garnett marker file from top_markers output.} \usage{ generate_garnett_marker_file(marker_test_res, file = "./marker_file.txt", - max_genes_per_group = 10) + max_genes_per_group = 10, remove_duplicate_genes = FALSE) } \arguments{ \item{marker_test_res}{Tibble of top markers, output of @@ -16,6 +16,10 @@ generate_garnett_marker_file(marker_test_res, file = "./marker_file.txt", \item{max_genes_per_group}{Numeric, the maximum number of genes to output per cell type entry. Default is 10.} + +\item{remove_duplicate_genes}{Logical indicating whether marker genes that +mark multiple cell groups should be excluded. Default is FALSE. When +FALSE, a message will be emitted when duplicates are present.} } \value{ None, marker file is written to \code{file} parameter location. diff --git a/tests/testthat/test-combine_cds.R b/tests/testthat/test-combine_cds.R new file mode 100644 index 00000000..6b2a5ebb --- /dev/null +++ b/tests/testthat/test-combine_cds.R @@ -0,0 +1,126 @@ +testthat::skip_if_offline() +cds <- load_a549() +cds2 <- monocle3:::load_worm_embryo() + +test_that("combine_cds works", { + # all genes shared + comb <- combine_cds(list(cds, cds), + keep_all_genes = TRUE, + cell_names_unique = FALSE) + testthat::expect_equal(nrow(exprs(comb)), + nrow(exprs(cds))) + testthat::expect_equal(ncol(exprs(comb)), + ncol(exprs(cds)) + ncol(exprs(cds))) + + comb <- combine_cds(list(cds, cds), + keep_all_genes = FALSE, + cell_names_unique = FALSE) + testthat::expect_equal(nrow(exprs(comb)), + nrow(exprs(cds))) + testthat::expect_equal(ncol(exprs(comb)), + ncol(exprs(cds)) + ncol(exprs(cds))) + + testthat::expect_error(comb <- combine_cds(list(cds, cds), + keep_all_genes = TRUE, + cell_names_unique = TRUE), + "Cell names are not unique across CDSs - cell_names_unique must be TRUE.") + testthat::expect_error(comb <- combine_cds(list(cds, cds), + keep_all_genes = FALSE, + cell_names_unique = TRUE), + "Cell names are not unique across CDSs - cell_names_unique must be TRUE.") + + # some genes shared + cds3 <- cds[1:100,] + cds3 <- cds3[,Matrix::rowSums(exprs(cds3)) != 0] + expect_warning(comb <- combine_cds(list(cds, cds3), + keep_all_genes = TRUE, + cell_names_unique = FALSE)) + testthat::expect_equal(nrow(exprs(comb)), + nrow(exprs(cds))) + testthat::expect_equal(ncol(exprs(comb)), + ncol(exprs(cds)) + ncol(exprs(cds3))) + + expect_warning(comb <- combine_cds(list(cds, cds3), + keep_all_genes = FALSE, + cell_names_unique = FALSE)) + testthat::expect_equal(nrow(exprs(comb)), + nrow(exprs(cds3))) + testthat::expect_equal(ncol(exprs(comb)), + ncol(exprs(cds)) + ncol(exprs(cds3))) + + testthat::expect_error(comb <- combine_cds(list(cds, cds3), + keep_all_genes = TRUE, + cell_names_unique = TRUE), + "Cell names are not unique across CDSs - cell_names_unique must be TRUE.") + testthat::expect_error(comb <- combine_cds(list(cds, cds3), + keep_all_genes = FALSE, + cell_names_unique = TRUE), + "Cell names are not unique across CDSs - cell_names_unique must be TRUE.") + + + # no genes shared + expect_warning(comb <- combine_cds(list(cds, cds2), + keep_all_genes = TRUE, + cell_names_unique = FALSE), + "No genes are shared amongst all the CDS objects.") + testthat::expect_equal(nrow(exprs(comb)), + nrow(exprs(cds)) + nrow(exprs(cds2))) + testthat::expect_equal(ncol(exprs(comb)), + ncol(exprs(cds)) + ncol(exprs(cds2))) + + expect_error(comb <- combine_cds(list(cds, cds2), + keep_all_genes = FALSE, + cell_names_unique = FALSE), + paste("No genes are shared amongst all the CDS objects. To generate a", + "combined CDS with all genes, use keep_all_genes = TRUE")) + + expect_warning(comb <- combine_cds(list(cds, cds2), + keep_all_genes = TRUE, + cell_names_unique = TRUE), + "No genes are shared amongst all the CDS objects.") + + testthat::expect_equal(nrow(exprs(comb)), + nrow(exprs(cds)) + nrow(exprs(cds2))) + testthat::expect_equal(ncol(exprs(comb)), + ncol(exprs(cds)) + ncol(exprs(cds2))) + + expect_error(comb <- combine_cds(list(cds, cds2), + keep_all_genes = FALSE, + cell_names_unique = TRUE), + paste("No genes are shared amongst all the CDS objects. To generate a", + "combined CDS with all genes, use keep_all_genes = TRUE")) + + + # triples + expect_warning(expect_error(comb <- combine_cds(list(cds, cds2, cds3), + keep_all_genes = TRUE, + cell_names_unique = TRUE), + "Cell names are not unique across CDSs - cell_names_unique must be TRUE."), + "No genes are shared amongst all the CDS objects.") + + expect_error(comb <- combine_cds(list(cds, cds2, cds3), + keep_all_genes = FALSE, + cell_names_unique = TRUE), + paste("No genes are shared amongst all the CDS objects. To generate a", + "combined CDS with all genes, use keep_all_genes = TRUE")) + + expect_warning(comb <- combine_cds(list(cds, cds2, cds3), + keep_all_genes = TRUE, + cell_names_unique = FALSE), + "No genes are shared amongst all the CDS objects.") + testthat::expect_equal(nrow(exprs(comb)), + length(unique(c(row.names(exprs(cds)), + row.names(exprs(cds2)), + row.names(exprs(cds3)))))) + testthat::expect_equal(ncol(exprs(comb)), + ncol(exprs(cds)) + ncol(exprs(cds2)) + ncol(exprs(cds3))) + + expect_error(comb <- combine_cds(list(cds, cds2, cds3), + keep_all_genes = FALSE, + cell_names_unique = FALSE), + paste("No genes are shared amongst all the CDS objects. To generate a", + "combined CDS with all genes, use keep_all_genes = TRUE")) + + + +}) diff --git a/tests/testthat/test-find_markers.R b/tests/testthat/test-find_markers.R new file mode 100644 index 00000000..b4f7a1c8 --- /dev/null +++ b/tests/testthat/test-find_markers.R @@ -0,0 +1,3 @@ +test_that("generate_garnett_marker_file_works", { + expect_equal(2 * 2, 4) +})