Skip to content

Commit

Permalink
Merge pull request #147 from cole-trapnell-lab/develop
Browse files Browse the repository at this point in the history
Merge 0.1.2 into master
  • Loading branch information
ctrapnell authored Jul 6, 2019
2 parents 5962b74 + ac394a7 commit e212173
Show file tree
Hide file tree
Showing 12 changed files with 348 additions and 14 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ warnings_are_errors: true
language: r
before_install:
- sudo apt-get install -y libudunits2-dev
- sudo apt-get install -y gdal-bin
r:
- bioc-devel
- bioc-release
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: monocle3
Title: Clustering, differential expression, and trajectory analysis for single-
cell RNA-Seq
Version: 0.1.1
Version: 0.1.2
Authors@R:
person(given = "Hannah",
family = "Pliner",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(choose_graph_segments)
export(cluster_cells)
export(clusters)
export(coefficient_table)
export(combine_cds)
export(compare_models)
export(detect_genes)
export(estimate_size_factors)
Expand Down
9 changes: 9 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# monocle3 0.1.2

### Changes
* Added a `NEWS.md` file to track changes to the package.
* Added a combine_cds function that will combine cell_data_sets .
* Added a message when duplicate markers are present in generate_garnett_marker_file and option to exclude them.

### Bug fixes
* Fixed a bug that made expression matrix the wrong class with load_cellranger.
2 changes: 1 addition & 1 deletion R/cell_data_set.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ new_cell_data_set <- function(expression_data,

cds <- methods::new("cell_data_set",
assays = SummarizedExperiment::Assays(
list(counts=expression_data)),
list(counts=as(expression_data, "dgCMatrix"))),
colData = colData(sce),
int_elementMetadata =sce@int_elementMetadata,
int_colData = sce@int_colData,
Expand Down
39 changes: 35 additions & 4 deletions R/find_markers.R
Original file line number Diff line number Diff line change
Expand Up @@ -338,24 +338,55 @@ test_marker_for_cell_group = function(gene_id, cell_group, cell_group_df, cds,
#' "./marker_file.txt".
#' @param max_genes_per_group Numeric, the maximum number of genes to output
#' per cell type entry. Default is 10.
#' @param remove_duplicate_genes Logical indicating whether marker genes that
#' mark multiple cell groups should be excluded. Default is FALSE. When
#' FALSE, a message will be emitted when duplicates are present.
#'
#' @return None, marker file is written to \code{file} parameter location.
#' @export
#'
generate_garnett_marker_file <- function(marker_test_res,
file = "./marker_file.txt",
max_genes_per_group = 10) {
max_genes_per_group = 10,
remove_duplicate_genes = FALSE) {
marker_test_res <- as.data.frame(marker_test_res)
if(is.null(marker_test_res$group_name)) {
marker_test_res$group_name <- paste("Cell type", marker_test_res$cell_group)
}
group_list <- unique(marker_test_res$group_name)

#good_markers <- marker_test_res[marker_test_res$marker_test_q_value <= qval_cutoff,]
good_markers = marker_test_res %>% dplyr::group_by(group_name) %>% dplyr::top_n(max_genes_per_group, marker_score)
good_markers <- marker_test_res %>% dplyr::group_by(group_name) %>%
dplyr::top_n(max_genes_per_group, marker_score)

dups <- good_markers$gene_id[duplicated(good_markers$gene_id)]

if ("gene_short_name" %in% colnames(good_markers)) {
dups_gsn <- good_markers$gene_short_name[duplicated(good_markers$gene_short_name)]
}


if(remove_duplicate_genes) {
good_markers <- good_markers[!good_markers$gene_id %in% dups,]
} else {
if (length(dups) > 0) {
if("gene_short_name" %in% colnames(good_markers)) {
message(paste("The following marker genes mark multiple cell groups.",
"Prior to using Garnett, we recommend either excluding",
"these genes using remove_duplicate_genes = TRUE, or",
"modifying your marker file to make the cell types with",
"the shared marker subtypes in a hierarchy.",
paste(dups_gsn, collapse = ", ")))
} else {
message(paste("The following marker genes mark multiple cell groups.",
"Prior to using Garnett, we recommend either excluding",
"these genes using remove_duplicate_genes = TRUE, or",
"modifying your marker file to make the cell types with",
"the shared marker subtypes in a hierarchy.",
paste(dups, collapse = ", ")))
}
}
}

#good_markers = good_markers %>% dplyr::ungr
output <- list()

for (group in group_list) {
Expand Down
132 changes: 132 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -569,3 +569,135 @@ load_mtx_data <- function(mat_path,
cds <- cds[,colData(cds)$n.umi >= umi_cutoff]
return(cds)
}


#' Combine a list of cell_data_set objects
#'
#' This function will combine a list of cell_data_set objects into a new
#' cell_data_set object.
#'
#' @param cds_list List of cds objects to be combined.
#' @param keep_all_genes Logical indicating what to do if there is a mismatch
#' in the gene sets of the CDSs. If TRUE, all genes are kept and cells from
#' CDSs missing a given gene will be filled in with zeroes. If FALSE, only
#' the genes in common among all of the CDSs will be kept. Default is TRUE.
#' @param cell_names_unique Logical indicating whether all of the cell IDs
#' across all of the CDSs are unique. If FALSE, the CDS name is appended to
#' each cell ID to prevent collisions. Default is FALSE.
#'
#' @return A combined cell_data_set object.
#' @export
#'
combine_cds <- function(cds_list,
keep_all_genes = TRUE,
cell_names_unique = FALSE) {

assertthat::assert_that(is.list(cds_list),
msg=paste("cds_list must be a list."))

assertthat::assert_that(all(sapply(cds_list, class) == "cell_data_set"),
msg=paste("All members of cds_list must be",
"cell_data_set class."))

if(length(cds_list) == 1) return(cds_list[[1]])

assertthat::assert_that(is.logical(keep_all_genes))
assertthat::assert_that(is.logical(cell_names_unique))

list_named <- TRUE
if(is.null(names(cds_list))) {
list_named <- FALSE
}

exprs_list <- list()
fd_list <- list()
pd_list <- list()
gene_list <- c()
overlap_list <- c(row.names(fData(cds_list[[1]])))
pdata_cols <- c()
fdata_cols <- c()
all_cells <- c()

for(cds in cds_list) {
gene_list <- c(gene_list, row.names(fData(cds)))
overlap_list <- intersect(overlap_list, row.names(fData(cds)))
if (!keep_all_genes) {
gene_list <- overlap_list
}

pdata_cols <- c(pdata_cols, names(pData(cds)))
fdata_cols <- c(fdata_cols, names(fData(cds)))
all_cells <- c(all_cells, row.names(pData(cds)))
}

gene_list <- unique(gene_list)
if(length(overlap_list) == 0) {
if (keep_all_genes) {
warning(paste("No genes are shared amongst all the CDS objects."))
} else {
stop(paste("No genes are shared amongst all the CDS objects. To generate",
"a combined CDS with all genes, use keep_all_genes = TRUE"))
}
}
pdata_cols <- unique(pdata_cols)
fdata_cols <- unique(fdata_cols)
if (sum(duplicated(all_cells)) != 0 & cell_names_unique) {
stop(paste("Cell names are not unique across CDSs - cell_names_unique",
"must be TRUE."))
}
all_cells <- unique(all_cells)
for(i in 1:length(cds_list)) {
pd <- as.data.frame(pData(cds_list[[i]]))
exp <- exprs(cds_list[[i]])
exp <- exp[intersect(row.names(exp), gene_list),]
if (!cell_names_unique) {
if(list_named) {
row.names(pd) <- paste(row.names(pd), names(cds_list)[[i]], sep="_")
pd$sample <- names(cds_list)[[i]]
} else {
row.names(pd) <- paste(row.names(pd), i, sep="_")
pd$sample <- i
}
colnames(exp) <- row.names(pd)
}
not_in <- pdata_cols[!pdata_cols %in% names(pd)]
for (n in not_in) {
pd[,n] <- NA
}

fd <- as.data.frame(fData(cds_list[[i]]))
fd <- fd[intersect(row.names(fd), gene_list),]
not_in <- fdata_cols[!fdata_cols %in% names(fd)]
for (n in not_in) {
fd[,n] <- NA
}
not_in_g <- gene_list[!gene_list %in% row.names(fd)]
for (n in not_in_g) {
fd[n,] <- NA
}
if (length(not_in_g) > 0) {
extra_rows <- Matrix::Matrix(0, ncol=ncol(exp),
sparse=TRUE,
nrow=length(not_in_g))
row.names(extra_rows) <- not_in_g
colnames(extra_rows) <- colnames(exp)
exp <- rbind(exp, extra_rows)
}


exprs_list[[i]] <- exp
fd_list[[i]] <- fd
pd_list[[i]] <- pd

}

all_fd <- do.call(cbind, fd_list)
all_fd <- all_fd[,fdata_cols]
all_pd <- do.call(rbind, pd_list)
all_exp <- do.call(cbind, exprs_list)

all_exp <- all_exp[row.names(all_fd), row.names(all_pd)]


new_cell_data_set(all_exp, cell_metadata = all_pd, gene_metadata = all_fd)
}
14 changes: 7 additions & 7 deletions examples/c_elegans_L2.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ cds = preprocess_cds(cds, num_dim = 100, residual_model_formula_str = "~ plate")
cds = reduce_dimension(cds, umap.fast_sgd=FALSE, cores=1)
plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE) + ggsave("L2_umap_corrected_plate.png", width=5, height=4, dpi = 600)

# exmaple of using t-SNE:
# example of using t-SNE:
cds = reduce_dimension(cds, reduction_method="tSNE")
plot_cells(cds, reduction_method="tSNE", color_cells_by="cao_cell_type") + ggsave("L2_tsne_corrected_cao_type.png", width=5, height=4, dpi = 600)

Expand Down Expand Up @@ -205,8 +205,8 @@ garnett_markers = assigned_type_marker_test_res %>%
filter(marker_test_q_value < 0.01 & specificity >= 0.5) %>%
group_by(cell_group) %>%
top_n(5, marker_score)
garnett_markers = garnett_markers %>% group_by(gene_short_name) %>%
filter(n() == 1)
garnett_markers = garnett_markers %>% group_by(gene_short_name) %>%
filter(n() == 1)

plot_genes_by_group(cds,
unique(as.character(garnett_markers$gene_id)),
Expand Down Expand Up @@ -234,8 +234,8 @@ cds = classify_cells(cds, worm_classifier,
cluster_extend = TRUE,
cds_gene_id_type = "ENSEMBL")

plot_cells(cds,
group_cells_by="partition",
plot_cells(cds,
group_cells_by="partition",
color_cells_by="cluster_ext_type") + ggsave("L2_umap_corrected_garnett_ext_type_our_annotations.png", width=5, height=4, dpi = 600)

# Plot the overlap between clusters and annotated cell types:
Expand All @@ -250,8 +250,8 @@ cds = classify_cells(cds, ceWhole,
cluster_extend = TRUE,
cds_gene_id_type = "ENSEMBL")

plot_cells(cds,
group_cells_by="cluster",
plot_cells(cds,
group_cells_by="cluster",
color_cells_by="cluster_ext_type") + ggsave("L2_umap_corrected_garnett_ext_type_cao_annoutations.png", width=5, height=4, dpi = 600)

# Plot the overlap between clusters and annotated cell types:
Expand Down
27 changes: 27 additions & 0 deletions man/combine_cds.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/generate_garnett_marker_file.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit e212173

Please sign in to comment.