-
Notifications
You must be signed in to change notification settings - Fork 0
/
Stromal_Cells_Milo_2.Rmd
378 lines (310 loc) · 17.6 KB
/
Stromal_Cells_Milo_2.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
---
Analysis of stromal cells (all non-cancer cells) from experiments 2
This notebook provides example scripts for adapting Milo to detect differential phenotypic distribution of stromal cells from Experiment 2 (w/ transcription and translation inhibitors) and generates the major results in Figure S3.
Note that each run gives slightly different results (as index cells for the phenotypic neighborhoods are randomly selected), while data presented in the manuscript are also provided in the Data directory.
General guidelines on EdgeR, which MILO wraps: https://support.bioconductor.org/p/84338/
Clear space
```{r}
rm(list = ls())
```
Load packages
```{r}
library(miloR)
library(Matrix)
library(SingleCellExperiment)
library(scater)
library(dplyr)
library(patchwork)
library(Matrix)
library(igraph)
library(SingleCellExperiment)
library(scater)
library(dplyr)
library(patchwork)
```
Change directory to the desired path.
```{r}
dir = '/Users/gans/Documents/Computation/Sequencing/Breast_tumor/GitHub/'
OUT_RESULT_DIR = paste0(dir, 'Data/Stroma_2/')
OUT_FIGURE_DIR = paste0(OUT_RESULT_DIR, 'figures/')
# "Added" means transcription and translation inhibitors are added in Experiment 2
# Following input is generated by Python notebook and saved into this format accessible by the Milo script in R
mtx_file = paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.counts.mtx')
bc_file = paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.barcodes.csv')
g_file = paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.genes.csv')
pc_file = paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.PCs.csv') # Principal components
umap_file = paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.umap.csv') # UMAP embedding
sample_file = paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.sample.txt')
group_file = paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.group.txt')
group_refined_file = paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.group.refined.txt')
cellType_file = paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.CT.txt') # Cell type
hvg_file=paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.HVG.txt') # Highly variable genes
```
Load data and build sce.
```{r}
# Transpose matrix for python - R connection (sparse matrix mtx) (the value was read in as an array)
counts = t(as.matrix(readMM(mtx_file)))
# Read cell names
bc = read.csv(bc_file, header = F, col.names='Cell', stringsAsFactors = F)
# Read gene names
g = read.csv(g_file, header = F, col.names = 'Gene', stringsAsFactors = F)
# Read HVGs
hvg = read.table(hvg_file, header = F, sep = "\t", col.names = 'HVG', stringsAsFactors = F)
# Read samle info.
sample_info = read.table(sample_file, header = F, sep = "\t", col.names = 'sample', stringsAsFactors = F)
# Read group info.
group_info = read.table(group_file, header = F, sep = "\t", col.names = 'group', stringsAsFactors = F)
# Read refined group info.
group_refined_info = read.table(group_refined_file, header = F, sep = "\t", col.names = 'group_refined', stringsAsFactors = F)
# Cell type annotation
cellType_info = read.table(cellType_file, header = F, sep = "\t", col.names = 'CT', stringsAsFactors = F)
# PC
pc <- as.matrix(read.csv(pc_file, stringsAsFactors = F, row.names = 1))
# UMAP
umap <- as.matrix(read.csv(umap_file, stringsAsFactors = F, row.names = 1))
# Create a single cell experiment object
sce <- SingleCellExperiment(list(logcounts=counts),
colData=DataFrame(cbind(sample_info, group_info, group_refined_info, cellType_info)))
# Store the gene names in this object
rownames(sce) <- g$Gene
rowData(sce) <- "Gene"
# Store the gene names in this object
colnames(sce) <- bc$Cell
as(sce, "SingleCellExperiment")
```
To maintain consistency with the Scanpy analysis, use the PCs calculated from the top 5000 HVGs, with the number of PCs determined by the knee point of the total variance explained.
```{r}
# Load PCA and umap
reducedDim(sce, "PCA") <- t(pc)
reducedDim(sce, "umap") <- t(umap)
```
```{r}
plotReducedDim(sce, colour_by="sample",
dimred="umap", point_size=0.5)
```
Differential abundance testing
Create a Milo object:
For differential abundance analysis on graph neighbourhoods, we first construct a Milo object. This extends the SingleCellExperiment class to store information about neighbourhoods on the KNN graph.
```{r}
sce <- Milo(sce)
```
Construct KNN graph.
```{r}
sce <- buildGraph(sce, k = 15, d=ncol(reducedDim(sce, "PCA")), reduced.dim = "PCA")
```
Defining representative neighbourhoods on the KNN graph
"For sampling you need to define a few parameters:
prop: the proportion of cells to randomly sample to start with (usually 0.1 - 0.2 is sufficient)
k: the k to use for KNN refinement (we recommend using the same k used for KNN graph building)
d: the number of reduced dimensions to use for KNN refinement (we recommend using the same d used for KNN graph building)
refined: indicates whether you want to use the sampling refinement algorith, or just pick cells at random. The default and recommended way to go is to use refinement. The only situation in which you might consider using random instead, is if you have batch corrected your data with a graph based correction algorithm, such as BBKNN, but the results of DA testing will be suboptimal."
```{r}
sce <- makeNhoods(sce, prop=0.1, k = 15,
d=ncol(reducedDim(sce, "PCA")), refined = TRUE, reduced_dims = "PCA")
```
Once we have defined neighbourhoods, it’s good to take a look at how big the neighbourhoods are (i.e. how many cells form each neighbourhood). This affects the power of DA testing. We can check this out using the plotNhoodSizeHist function. Empirically, we found it’s best to have a distribution peaking above 20/distribution peaking between 50 and 100. Otherwise you might consider rerunning makeNhoods increasing k and/or prop
```{r}
plotNhoodSizeHist(sce)
```
Counting cells in neighbourhoods:
Milo leverages the variation in cell numbers between replicates for the same experimental condition to test for differential abundance. Therefore we have to count how many cells from each sample are in each neighbourhood. We need to use the cell metadata and specify which column contains the sample information.
```{r}
sce <- countCells(sce, meta.data = data.frame(colData(sce)), sample="sample")
```
This adds to the Milo object a 𝑛×𝑚 matrix, where 𝑛 is the number of neighbourhoods and 𝑚 is the number of experimental samples. Values indicate the number of cells from each sample counted in a neighbourhood. This count matrix will be used for DA testing.
```{r}
head(nhoodCounts(sce))
```
Reasoning for the tests:
1. Do not have the strict replicates (e.g., 2 samples for the HCC1954-BrM_Unlabeled cells)
2. Assume the Unlabeled cells of the 2 models are biologically identical (replicates)
3. Classify the samples into 3 groups: Unlabeled, TN-labeled, HER2+-labeled
4. Test if any neighborhood has differential abundance for labeled cells or HER2+- vs. TN-labeled (with p values adjusted)
5. Use the original common dispersion estimate
Step-wise implementation of testNhoods:
Implement hypothesis testing in a generalized linear model (GLM) framework, specifically using the Negative Binomial GLM implementation in edgeR.
Analogy to edgeR: "neighborhood = gene"
```{r}
dge <- DGEList(counts=nhoodCounts(sce), lib.size=log(colSums(nhoodCounts(sce))))
```
Experimental design
```{r}
sce_design <- data.frame(colData(sce))[,c("sample","group_refined")]
sce_design <- distinct(sce_design)
rownames(sce_design) <- sce_design$sample
# Add the info. of model (cell line) & mCherry labeling status s
sce_design$model <- sapply(strsplit(sce_design$sample,"_"), `[`, 1)
sce_design$mCherry <- sapply(strsplit(sce_design$sample,"_"), `[`, 2)
```
```{r}
sce_design$group <- relevel(factor(sce_design$group), ref='Unlabeled_inhib_added')
model <- model.matrix(~group, data=sce_design)
model
```
```{r}
dge <- estimateDisp(dge, model)
plotBCV(dge)
fit <- glmFit(dge, model, robust=TRUE)
```
Syntax for specifying contrasts:
mod.constrast <- makeContrasts(contrasts=model.contrasts, levels=model)
EdgeR's syntax:
```{r}
mod.constrast <- makeContrasts(TNvsHER2=groupTNBC_Labeled_inhib_added-groupHER2BC_Labeled_inhib_added,
TNvsCtrl=groupTNBC_Labeled_inhib_added, HER2vsCtrl=groupHER2BC_Labeled_inhib_added, levels=model)
# Use glmLRT rather than glmQLFTest
da_results_TNvsHER2 <- as.data.frame(topTags(glmLRT(fit, contrast=mod.constrast[,"TNvsHER2"]), sort.by='none', n=Inf))
# da_results_LabeledvsCtrl tests either TNvsCtrl or HER2vsCtrl differs from 0 but does not generate the logFC
da_results_TNvsCtrl <- as.data.frame(topTags(glmLRT(fit, contrast=mod.constrast[,"TNvsCtrl"]), sort.by='none', n=Inf))
da_results_HER2vsCtrl <- as.data.frame(topTags(glmLRT(fit, contrast=mod.constrast[,"HER2vsCtrl"]), sort.by='none', n=Inf))
da_results_LabeledvsCtrl <- as.data.frame(topTags(glmLRT(fit, coef=2:3), sort.by='none', n=Inf))
```
Computing neighbourhood connectivity:
Milo uses an adaptation of the Spatial FDR correction introduced by cydar, which accounts for the overlap between neighbourhoods. Specifically, each hypothesis test P-value is weighted by the reciprocal of the kth nearest neighbour distance. To use this statistic we first need to store the distances between nearest neighbors in the Milo object. This is done by the calcNhoodDistance function (N.B. this step is the most time consuming of the analysis workflow and might take a couple of minutes for large datasets).
```{r}
sce <- calcNhoodDistance(sce, d=ncol(reducedDim(sce, "PCA")), reduced.dim = "PCA")
```
Performing spatial FDR correction with", fdr.weighting[1], " weighting"):
```{r}
da_results_TNvsHER2$Nhood <- as.numeric(rownames(da_results_TNvsHER2))
mod.spatialfdr <- graphSpatialFDR(x.nhoods=nhoods(sce),
graph=graph(sce),
weighting="k-distance",
pvalues=da_results_TNvsHER2[order(da_results_TNvsHER2$Nhood), ]$PValue,
indices=nhoodIndex(sce),
distances=nhoodDistances(sce),
reduced.dimensions=reducedDim(sce, "PCA"),
k=15)
da_results_TNvsHER2$SpatialFDR[order(da_results_TNvsHER2$Nhood)] <- mod.spatialfdr
```
```{r}
da_results_TNvsCtrl$Nhood <- as.numeric(rownames(da_results_TNvsCtrl))
mod.spatialfdr <- graphSpatialFDR(x.nhoods=nhoods(sce),
graph=graph(sce),
weighting="k-distance",
pvalues=da_results_TNvsCtrl[order(da_results_TNvsCtrl$Nhood), ]$PValue,
indices=nhoodIndex(sce),
distances=nhoodDistances(sce),
reduced.dimensions=reducedDim(sce, "PCA"),
k=15)
da_results_TNvsCtrl$SpatialFDR[order(da_results_TNvsCtrl$Nhood)] <- mod.spatialfdr
```
```{r}
da_results_HER2vsCtrl$Nhood <- as.numeric(rownames(da_results_HER2vsCtrl))
mod.spatialfdr <- graphSpatialFDR(x.nhoods=nhoods(sce),
graph=graph(sce),
weighting="k-distance",
pvalues=da_results_HER2vsCtrl[order(da_results_HER2vsCtrl$Nhood), ]$PValue,
indices=nhoodIndex(sce),
distances=nhoodDistances(sce),
reduced.dimensions=reducedDim(sce, "PCA"),
k=15)
da_results_HER2vsCtrl$SpatialFDR[order(da_results_HER2vsCtrl$Nhood)] <- mod.spatialfdr
```
```{r}
da_results_LabeledvsCtrl$Nhood <- as.numeric(rownames(da_results_LabeledvsCtrl))
mod.spatialfdr <- graphSpatialFDR(x.nhoods=nhoods(sce),
graph=graph(sce),
weighting="k-distance",
pvalues=da_results_LabeledvsCtrl[order(da_results_LabeledvsCtrl$Nhood), ]$PValue,
indices=nhoodIndex(sce),
distances=nhoodDistances(sce),
reduced.dimensions=reducedDim(sce, "PCA"),
k=15)
da_results_LabeledvsCtrl$SpatialFDR[order(da_results_LabeledvsCtrl$Nhood)] <- mod.spatialfdr
```
Inspecting DA testing results:
Inspect the distribution of uncorrected P values, to verify that the test was balanced.
```{r}
ggplot(da_results_LabeledvsCtrl, aes(-log10(SpatialFDR))) + geom_histogram(bins=50)
```
Build an abstracted graph of neighbourhoods for visualization
```{r}
sce <- buildNhoodGraph(sce)
```
We might also be interested in visualizing wheather DA is particularly evident in certain cell types. To do this, we assign a cell type label to each neighbourhood by finding the most abundant cell type within cells in each neighbourhood. We can label neighbourhoods in the results data.frame using the function annotateNhoods. This also saves the fraction of cells harbouring the label.
```{r}
da_results_LabeledvsCtrl <- annotateNhoods(sce, da_results_LabeledvsCtrl,
coldata_col = "CT")
head(da_results_LabeledvsCtrl)
da_results_HER2vsCtrl <- annotateNhoods(sce, da_results_HER2vsCtrl,
coldata_col = "CT")
head(da_results_HER2vsCtrl)
da_results_TNvsCtrl <- annotateNhoods(sce, da_results_TNvsCtrl,
coldata_col = "CT")
head(da_results_TNvsCtrl)
# These cells don't exist in this dataset: 6,7,8,14,15,18
```
Note: Here, we apply a few "tricks" to facilitate setting the color bar to the desired range and assigning the order of cell types to match that of experiment 1.
By default, R's plotting functions automatically scale the color bar based on the minimum and maximum values in the data, without providing an easy way to manually adjust it. To ensure a consistent color bar across all four plots for the differential abundance tests (HER2BC and TNBC from experiments 1 and 2), we artificially add two data points representing the minimum and maximum log fold change values. These points are added to the minority cell types that were not captured in Experiment 2 (i.e., thus w/o neighborhoods), making them easy to track and remove later.
```{r}
nr <- nrow(da_results_HER2vsCtrl)
da_results_HER2vsCtrl[c(nr+1,nr+2,nr+3,nr+4,nr+5,nr+6),] <- da_results_HER2vsCtrl[1:6,]
da_results_HER2vsCtrl[c(nr+1,nr+2,nr+3,nr+4,nr+5,nr+6),'logFC'] <- c(-9.165263,-9.165263,-9.165263,9.165263,9.165263,9.165263)
da_results_HER2vsCtrl[c(nr+1,nr+2,nr+3,nr+4,nr+5,nr+6),'CT'] <- c('06_','07_','08_','14_','15_','18_')
nr <- nrow(da_results_TNvsCtrl)
da_results_TNvsCtrl[c(nr+1,nr+2,nr+3,nr+4,nr+5,nr+6),] <- da_results_TNvsCtrl[1:6,]
da_results_TNvsCtrl[c(nr+1,nr+2,nr+3,nr+4,nr+5,nr+6),'logFC'] <- c(-9.165263,-9.165263,-9.165263,9.165263,9.165263,9.165263)
da_results_TNvsCtrl[c(nr+1,nr+2,nr+3,nr+4,nr+5,nr+6),'CT'] <- c('06_','07_','08_','14_','15_','18_')
```
```{r}
library(ggplot2)
library(ggbeeswarm)
library(scales)
p <- plotDAbeeswarm(da_results_HER2vsCtrl, group.by = "CT", alpha = 0.5, )
p <- p + geom_quasirandom(size=1)
p <- p+scale_color_gradient2(low='navyblue', mid='gray74', high='darkred', guide = "colourbar") + guides(color = guide_colourbar())
#p <- p + scale_fill_continuous(limits = c(min(da_results_LabeledvsCtrl$FDR), max(da_results_LabeledvsCtrl$FDR)))
ggsave(paste0(OUT_RESULT_DIR,'Tr_tl_inhibitors_Added_milo_HER2vsCtrl.svg'), plot=p, width=10, height=20, dpi=300)
```
```{r}
library(ggplot2)
library(ggbeeswarm)
library(scales)
p <- plotDAbeeswarm(da_results_TNvsCtrl, group.by = "CT", alpha = 0.5, )
p <- p + geom_quasirandom(size=1)
p <- p+scale_color_gradient2(low='navyblue', mid='gray74', high='darkred', guide = "colourbar") + guides(color = guide_colourbar())
#p <- p + scale_fill_continuous(limits = c(min(da_results_LabeledvsCtrl$FDR), max(da_results_LabeledvsCtrl$FDR)))
ggsave(paste0(OUT_RESULT_DIR,'Tr_tl_inhibitors_Added_milo_TNvsCtrl.svg'), plot=p, width=10, height=20, dpi=300)
# group.by = "CT")
```
Save differential abundance results
1. Neighborhood indexes (remember that the R index starts from 1 and python from 0)
2. Cells associated with each neighborhood
3. p value
4. lfc (log fold change)
5. Adjacency matrix of the neighborhood indexes
```{r}
write.csv(nhoodIndex(sce), paste0(OUT_RESULT_DIR,'Tr_tl_inhibitors_Added_milo.nhoodIndex.csv'))
write.csv(as.matrix(nhoodCounts(sce)), paste0(OUT_RESULT_DIR,'Tr_tl_inhibitors_Added_milo.nhoodCounts.csv'))
write.csv(as.matrix(fit$coefficients), paste0(OUT_RESULT_DIR,'Tr_tl_inhibitors_Added_milo.nhoodCountsFit.csv'))
write.csv(da_results_TNvsCtrl, paste0(OUT_RESULT_DIR,'Tr_tl_inhibitors_Added_milo.da_TN.csv'))
write.csv(da_results_HER2vsCtrl, paste0(OUT_RESULT_DIR,'Tr_tl_inhibitors_Added_milo.da_HER2vsCtrl.csv'))
write.csv(da_results_LabeledvsCtrl, paste0(OUT_RESULT_DIR,'Tr_tl_inhibitors_Added_milo.da_LabeledvsCtrl.csv'))
```
```{r}
write.csv(as.matrix(nhoodAdjacency(sce)),
paste0(OUT_RESULT_DIR,'Tr_tl_inhibitors_Added_milo.nhoodIndex.adj.csv'))
```
nhoodDistances(sce) is a list of sparse matrices (symmetric):
1. The name of nhoodDistances(sce) corresponds to the index of the index cell in each neighborhood (stored in nhoodIndex(sce))
2. Each ".@ Dimnames" is a symmetric matrix storing the distance between cells within this neighborhood (cells not belonging to this neighborhood are all set to 0)
3. To access list, use [[]]
e.g.
nhoodIndex(sce)[1]
[[1]]
[1] 5389
nhoodDistances(sce)[1]
$`5389`
28 x 28 sparse Matrix of class "dgCMatrix"
names(nhoodDistances(sce)[1])
[1] "5389"
```{r}
for(i in 1:length(nhoodDistances(sce))) {
write.csv(as.matrix(nhoodDistances(sce)[[i]]),
paste0(OUT_RESULT_DIR,'Tr_tl_inhibitors_Added_milo.nhoodDistances_',names(nhoodDistances(sce)[i]),'.csv'))
}
```
```{r}
saveRDS(sce, paste0(OUT_RESULT_DIR, 'Tr_tl_inhibitors_Added_milo.rds'))
```