-
Notifications
You must be signed in to change notification settings - Fork 9
/
R-batchJob_revision-n10_makeSCEs-nucleiCalling_MNT.R
98 lines (72 loc) · 3.69 KB
/
R-batchJob_revision-n10_makeSCEs-nucleiCalling_MNT.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
################################################################################
### LIBD 10x snRNA-seq [pilot] revision (n=10)
### STEP 01.batchJob: Read in SCEs and perform nuclei calling (`emptyDrops()`)
### Initiated: MNT 25Feb2021
################################################################################
library(SingleCellExperiment)
library(scRNAseq)
library(batchelor)
library(EnsDb.Hsapiens.v86)
library(scater)
library(scran)
library(uwot)
library(DropletUtils)
library(jaffelab)
library(Rtsne)
library(gridExtra)
library(rtracklayer)
#### AS SUBMITTED JOB ====
### Read in (2021) 'samples.manifest' for streamlining
samples.revision <- read.table("/dcs04/lieber/lcolladotor/rawDataTDSC_LIBD001/raw-data/2021-10-15_Tran2021_published/Feb2021/samples.manifest",
sep="\t", header=F)$V1
# Make list of paths
paths.rawCounts <- c(paste0("/dcl01/ajaffe/data/lab/singleCell/10x_pilot/premRNA/Feb2021/",
samples.revision,"/outs/raw_feature_bc_matrix"))
# Simpler names for individual SCEs
names(paths.rawCounts) <- gsub("_",".", tolower(samples.revision))
## Read in raw UMI x barcode matrix - **use pre-mRNA-aligned reads
pilot.data.2 <- lapply(paths.rawCounts, function(x){ read10xCounts(x, col.names=TRUE) })
names(pilot.data.2) <- names(paths.rawCounts)
### Gene annotation (from scater) ===
# Pull in GTF information
gtf = import("/dcl01/ajaffe/data/lab/singleCell/refdata-cellranger-GRCh38-3.0.0/genes/genes.gtf")
gtf = gtf[gtf$type == "gene"]
length(gtf)
names(gtf) = gtf$gene_id
table(names(gtf) == rowData(pilot.data.2[[1]])$ID)
seqlevels(gtf)[1:25] = paste0("chr", seqlevels(gtf)[1:25])
mcols(gtf) = mcols(gtf)[,c(5:9)]
for(i in 1:length(pilot.data.2)){
rowRanges(pilot.data.2[[i]]) <- gtf
# Because some gene names are the same:
rownames(pilot.data.2[[i]]) <- uniquifyFeatureNames(rowData(pilot.data.2[[i]])$gene_id, rowData(pilot.data.2[[i]])$gene_name)
rowData(pilot.data.2[[i]])$Symbol.uniq <- rownames(pilot.data.2[[i]])
}
# Preliminary save
save(pilot.data.2, file="rdas/revision/all-FACS-n10_2021rev_SCEs_processing-QC_MNTMar2021.rda")
### Quality control ============================================================
## - Going to ignore the adaptive NMAD-approach to outlier detection for UMI/feature count
# because this hasn't been as straightforward in past experience (might throw away neurons)
## - Vignette for the 10x PBMC dataset (OSCA Ch.24) only does mito & droplet QC anyhow
# - (mention that for a sample with very heterogeneous cell comp., don't want
# to drop potential cells with low RNA content)
## Cell detection (droplet exclusion, rather)
# Can use UMI count vs barcode rank (knee/inflection plot) to decide threshold, but
# "this unnecessarily discards libraries derived from cell types with low RNA content" (OSCA, Ch. 6)
# -> Instead should prefer this Monte Carlo-simulation-based empty droplet test:
# Additionally:
# For any Sig==FALSE & Limited==TRUE, may need to increase n iterations (default = 10000) with 'niters='
# - this field = whether "the computed p-value for a...barcode is bounded by the number of iterations"
# -> In exploratory phase (preprint), not all samples passed with niters=15000 (default 10,000), so use 20,000
e.out.2 <- list()
for(i in 1:length(pilot.data.2)){
cat(paste0("Simulating empty drops for: ",names(pilot.data.2)[i],"... \n"))
set.seed(109)
e.out.2[[i]] <- emptyDrops(counts(pilot.data.2[[i]]), niters=20000)
cat(paste0("\n\t...Simulations complete. \n\t", date(), "\n\n\n"))
date()
}
names(e.out.2) <- names(pilot.data.2)
save(pilot.data.2, e.out.2, file="rdas/revision/all-FACS-n10_2021rev_SCEs_processing-QC_MNTMar2021.rda")
rm(list=ls())
sessionInfo()