Merge pull request #147 from cole-trapnell-lab/develop

Merge 0.1.2 into master
cole-trapnell-lab · Jul 6, 2019 · e212173 · e212173
2 parents 5962b74 + ac394a7
commit e212173
Show file tree

Hide file tree

Showing 12 changed files with 348 additions and 14 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -4,6 +4,7 @@ warnings_are_errors: true
 language: r
 before_install:
   - sudo apt-get install -y libudunits2-dev
+  - sudo apt-get install -y gdal-bin
 r:
 - bioc-devel
 - bioc-release

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: monocle3
 Title: Clustering, differential expression, and trajectory analysis for single-
     cell RNA-Seq
-Version: 0.1.1
+Version: 0.1.2
 Authors@R: 
     person(given = "Hannah",
            family = "Pliner",

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,6 +10,7 @@ export(choose_graph_segments)
 export(cluster_cells)
 export(clusters)
 export(coefficient_table)
+export(combine_cds)
 export(compare_models)
 export(detect_genes)
 export(estimate_size_factors)

diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,9 @@
+# monocle3 0.1.2
+
+### Changes
+* Added a `NEWS.md` file to track changes to the package.
+* Added a combine_cds function that will combine cell_data_sets .
+* Added a message when duplicate markers are present in generate_garnett_marker_file and option to exclude them.
+
+### Bug fixes
+* Fixed a bug that made expression matrix the wrong class with load_cellranger.
diff --git a/R/cell_data_set.R b/R/cell_data_set.R
@@ -110,7 +110,7 @@ new_cell_data_set <- function(expression_data,
 
   cds <- methods::new("cell_data_set",
              assays = SummarizedExperiment::Assays(
-               list(counts=expression_data)),
+               list(counts=as(expression_data, "dgCMatrix"))),
              colData = colData(sce),
              int_elementMetadata =sce@int_elementMetadata,
              int_colData = sce@int_colData,

diff --git a/R/find_markers.R b/R/find_markers.R
@@ -338,24 +338,55 @@ test_marker_for_cell_group = function(gene_id, cell_group, cell_group_df, cds,
 #'   "./marker_file.txt".
 #' @param max_genes_per_group Numeric, the maximum number of genes to output
 #'   per cell type entry. Default is 10.
+#' @param remove_duplicate_genes Logical indicating whether marker genes that
+#'   mark multiple cell groups should be excluded. Default is FALSE. When
+#'   FALSE, a message will be emitted when duplicates are present.
 #'
 #' @return None, marker file is written to \code{file} parameter location.
 #' @export
 #'
 generate_garnett_marker_file <- function(marker_test_res,
                                          file = "./marker_file.txt",
-                                         max_genes_per_group = 10) {
+                                         max_genes_per_group = 10,
+                                         remove_duplicate_genes = FALSE) {
   marker_test_res <- as.data.frame(marker_test_res)
   if(is.null(marker_test_res$group_name)) {
     marker_test_res$group_name <- paste("Cell type", marker_test_res$cell_group)
   }
   group_list <- unique(marker_test_res$group_name)
 
-  #good_markers <- marker_test_res[marker_test_res$marker_test_q_value <= qval_cutoff,]
-  good_markers = marker_test_res %>% dplyr::group_by(group_name) %>% dplyr::top_n(max_genes_per_group, marker_score)
+  good_markers <- marker_test_res %>% dplyr::group_by(group_name) %>%
+    dplyr::top_n(max_genes_per_group, marker_score)
 
+  dups <- good_markers$gene_id[duplicated(good_markers$gene_id)]
+
+  if ("gene_short_name" %in% colnames(good_markers)) {
+    dups_gsn <- good_markers$gene_short_name[duplicated(good_markers$gene_short_name)]
+  }
+
+
+  if(remove_duplicate_genes) {
+    good_markers <- good_markers[!good_markers$gene_id %in% dups,]
+  } else {
+    if (length(dups) > 0) {
+      if("gene_short_name" %in% colnames(good_markers)) {
+        message(paste("The following marker genes mark multiple cell groups.",
+                      "Prior to using Garnett, we recommend either excluding",
+                      "these genes using remove_duplicate_genes = TRUE, or",
+                      "modifying your marker file to make the cell types with",
+                      "the shared marker subtypes in a hierarchy.",
+                      paste(dups_gsn, collapse = ", ")))
+      } else {
+        message(paste("The following marker genes mark multiple cell groups.",
+                      "Prior to using Garnett, we recommend either excluding",
+                      "these genes using remove_duplicate_genes = TRUE, or",
+                      "modifying your marker file to make the cell types with",
+                      "the shared marker subtypes in a hierarchy.",
+                      paste(dups, collapse = ", ")))
+      }
+    }
+  }
 
-  #good_markers = good_markers %>% dplyr::ungr
   output <- list()
 
   for (group in group_list) {

diff --git a/R/utils.R b/R/utils.R
@@ -569,3 +569,135 @@ load_mtx_data <- function(mat_path,
   cds <- cds[,colData(cds)$n.umi >= umi_cutoff]
   return(cds)
 }
+
+
+#' Combine a list of cell_data_set objects
+#'
+#' This function will combine a list of cell_data_set objects into a new
+#' cell_data_set object.
+#'
+#' @param cds_list List of cds objects to be combined.
+#' @param keep_all_genes Logical indicating what to do if there is a mismatch
+#'   in the gene sets of the CDSs. If TRUE, all genes are kept and cells from
+#'   CDSs missing a given gene will be filled in with zeroes. If FALSE, only
+#'   the genes in common among all of the CDSs will be kept. Default is TRUE.
+#' @param cell_names_unique Logical indicating whether all of the cell IDs
+#'   across all of the CDSs are unique. If FALSE, the CDS name is appended to
+#'   each cell ID to prevent collisions. Default is FALSE.
+#'
+#' @return A combined cell_data_set object.
+#' @export
+#'
+combine_cds <- function(cds_list,
+                        keep_all_genes = TRUE,
+                        cell_names_unique = FALSE) {
+
+  assertthat::assert_that(is.list(cds_list),
+                          msg=paste("cds_list must be a list."))
+
+  assertthat::assert_that(all(sapply(cds_list, class) == "cell_data_set"),
+                          msg=paste("All members of cds_list must be",
+                                    "cell_data_set class."))
+
+  if(length(cds_list) == 1) return(cds_list[[1]])
+
+  assertthat::assert_that(is.logical(keep_all_genes))
+  assertthat::assert_that(is.logical(cell_names_unique))
+
+  list_named <- TRUE
+  if(is.null(names(cds_list))) {
+    list_named <- FALSE
+  }
+
+  exprs_list <- list()
+  fd_list <- list()
+  pd_list <- list()
+  gene_list <- c()
+  overlap_list <- c(row.names(fData(cds_list[[1]])))
+  pdata_cols <- c()
+  fdata_cols <- c()
+  all_cells <- c()
+
+  for(cds in cds_list) {
+    gene_list <-  c(gene_list, row.names(fData(cds)))
+    overlap_list <- intersect(overlap_list, row.names(fData(cds)))
+    if (!keep_all_genes) {
+      gene_list <- overlap_list
+    }
+
+    pdata_cols <- c(pdata_cols, names(pData(cds)))
+    fdata_cols <- c(fdata_cols, names(fData(cds)))
+    all_cells <- c(all_cells, row.names(pData(cds)))
+  }
+
+  gene_list <- unique(gene_list)
+  if(length(overlap_list) == 0) {
+    if (keep_all_genes) {
+      warning(paste("No genes are shared amongst all the CDS objects."))
+    } else {
+      stop(paste("No genes are shared amongst all the CDS objects. To generate",
+                 "a combined CDS with all genes, use keep_all_genes = TRUE"))
+    }
+  }
+  pdata_cols <- unique(pdata_cols)
+  fdata_cols <- unique(fdata_cols)
+  if (sum(duplicated(all_cells)) != 0 & cell_names_unique) {
+    stop(paste("Cell names are not unique across CDSs - cell_names_unique",
+               "must be TRUE."))
+  }
+  all_cells <- unique(all_cells)
+  for(i in 1:length(cds_list)) {
+    pd <- as.data.frame(pData(cds_list[[i]]))
+    exp <- exprs(cds_list[[i]])
+    exp <- exp[intersect(row.names(exp), gene_list),]
+    if (!cell_names_unique) {
+      if(list_named) {
+        row.names(pd) <- paste(row.names(pd), names(cds_list)[[i]], sep="_")
+        pd$sample <- names(cds_list)[[i]]
+      } else {
+        row.names(pd) <- paste(row.names(pd), i, sep="_")
+        pd$sample <- i
+      }
+      colnames(exp) <- row.names(pd)
+    }
+    not_in <- pdata_cols[!pdata_cols %in% names(pd)]
+    for (n in not_in) {
+      pd[,n] <- NA
+    }
+
+    fd <- as.data.frame(fData(cds_list[[i]]))
+    fd <- fd[intersect(row.names(fd), gene_list),]
+    not_in <- fdata_cols[!fdata_cols %in% names(fd)]
+    for (n in not_in) {
+      fd[,n] <- NA
+    }
+    not_in_g <- gene_list[!gene_list %in% row.names(fd)]
+    for (n in not_in_g) {
+      fd[n,] <- NA
+    }
+    if (length(not_in_g) > 0) {
+      extra_rows <- Matrix::Matrix(0, ncol=ncol(exp),
+                                   sparse=TRUE,
+                                   nrow=length(not_in_g))
+      row.names(extra_rows) <- not_in_g
+      colnames(extra_rows) <- colnames(exp)
+      exp <- rbind(exp, extra_rows)
+    }
+
+
+    exprs_list[[i]] <- exp
+    fd_list[[i]] <- fd
+    pd_list[[i]] <- pd
+
+  }
+
+  all_fd <- do.call(cbind, fd_list)
+  all_fd <- all_fd[,fdata_cols]
+  all_pd <- do.call(rbind, pd_list)
+  all_exp <- do.call(cbind, exprs_list)
+
+  all_exp <- all_exp[row.names(all_fd), row.names(all_pd)]
+
+
+  new_cell_data_set(all_exp, cell_metadata = all_pd, gene_metadata = all_fd)
+}
diff --git a/examples/c_elegans_L2.R b/examples/c_elegans_L2.R
@@ -35,7 +35,7 @@ cds = preprocess_cds(cds, num_dim = 100, residual_model_formula_str = "~ plate")
 cds = reduce_dimension(cds, umap.fast_sgd=FALSE, cores=1)
 plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE) + ggsave("L2_umap_corrected_plate.png", width=5, height=4, dpi = 600)
 
-# exmaple of using t-SNE:
+# example of using t-SNE:
 cds = reduce_dimension(cds, reduction_method="tSNE")
 plot_cells(cds, reduction_method="tSNE", color_cells_by="cao_cell_type") + ggsave("L2_tsne_corrected_cao_type.png", width=5, height=4, dpi = 600)
 
@@ -205,8 +205,8 @@ garnett_markers = assigned_type_marker_test_res %>%
     filter(marker_test_q_value < 0.01 & specificity >= 0.5) %>%
     group_by(cell_group) %>%
     top_n(5, marker_score)
-garnett_markers = garnett_markers %>% group_by(gene_short_name) %>% 
-    filter(n() == 1) 
+garnett_markers = garnett_markers %>% group_by(gene_short_name) %>%
+    filter(n() == 1)
 
 plot_genes_by_group(cds,
                     unique(as.character(garnett_markers$gene_id)),
@@ -234,8 +234,8 @@ cds = classify_cells(cds, worm_classifier,
                            cluster_extend = TRUE,
                            cds_gene_id_type = "ENSEMBL")
 
-plot_cells(cds, 
-           group_cells_by="partition", 
+plot_cells(cds,
+           group_cells_by="partition",
            color_cells_by="cluster_ext_type") + ggsave("L2_umap_corrected_garnett_ext_type_our_annotations.png", width=5, height=4, dpi = 600)
 
 # Plot the overlap between clusters and annotated cell types:
@@ -250,8 +250,8 @@ cds = classify_cells(cds, ceWhole,
                            cluster_extend = TRUE,
                            cds_gene_id_type = "ENSEMBL")
 
-plot_cells(cds, 
-           group_cells_by="cluster", 
+plot_cells(cds,
+           group_cells_by="cluster",
            color_cells_by="cluster_ext_type") + ggsave("L2_umap_corrected_garnett_ext_type_cao_annoutations.png", width=5, height=4, dpi = 600)
 
 # Plot the overlap between clusters and annotated cell types:

diff --git a/man/combine_cds.Rd b/man/combine_cds.Rd
diff --git a/man/generate_garnett_marker_file.Rd b/man/generate_garnett_marker_file.Rd