-
Notifications
You must be signed in to change notification settings - Fork 9
/
10x_miscPUBLICATIONstuff_furtherExploration_MNT.R
435 lines (350 loc) · 19.8 KB
/
10x_miscPUBLICATIONstuff_furtherExploration_MNT.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
### MNT 10x snRNA-seq workflow: (step 04?:)
### Miscellaneous lookings-into / finalizing graphics for manuscript
### - Brief neuron-specific clustering for DLPFC (Maynard-Collado-Torres et al.)
### - 10x pilot snRNA-seq paper (Tran-Maynard et al.)
##################################################################################
library(SingleCellExperiment)
library(EnsDb.Hsapiens.v86)
library(scater)
library(scran)
library(batchelor)
library(DropletUtils)
library(jaffelab)
### Palette taken from `scater`
tableau10medium = c("#729ECE", "#FF9E4A", "#67BF5C", "#ED665D",
"#AD8BC9", "#A8786E", "#ED97CA", "#A2A2A2",
"#CDCC5D", "#6DCCDA")
tableau20 = c("#1F77B4", "#AEC7E8", "#FF7F0E", "#FFBB78", "#2CA02C",
"#98DF8A", "#D62728", "#FF9896", "#9467BD", "#C5B0D5",
"#8C564B", "#C49C94", "#E377C2", "#F7B6D2", "#7F7F7F",
"#C7C7C7", "#BCBD22", "#DBDB8D", "#17BECF", "#9EDAE5")
#### Does sex impact cluster proportions? ================
## --> Logistic regression: Sex ~ cell type proportion [for each cell type]
### Velmeshev, et al (PFC & ACC) ===
load("/dcl01/ajaffe/data/lab/singleCell/velmeshev2019/analysis_MNT/SCE_asd-velmeshev-etal_MNT.rda",
verbose=T)
# *Since these regions were analyzed together, just keep them together
table(sce.asd$individual, sce.asd$sex)
# 31 individuals, 7 female
table(sce.asd$individual, sce.asd$BAregion)
# Basically individuals could have been sampled multiple times - BA24[ACC] and/or a PFC BAregion
apply(table(sce.asd$cluster, sce.asd$sex), 2, function(x){round(prop.table(x),3)})
# F M
# AST-FB 0.042 0.044
# AST-PP 0.098 0.067
# Endothelial 0.019 0.027
# IN-PV 0.028 0.041
# IN-SST 0.035 0.046
# IN-SV2C 0.018 0.018
# IN-VIP 0.056 0.058
# L2/3 0.133 0.126
# L4 0.069 0.065
# L5/6 0.032 0.035
# L5/6-CC 0.043 0.044
# Microglia 0.034 0.031
# Neu-mat 0.037 0.040
# Neu-NRGN-I 0.043 0.032
# Neu-NRGN-II 0.082 0.079
# Oligodendrocytes 0.130 0.151
# OPC 0.101 0.096 - broadly it looks pretty much the same
# Make cell proportion table per individual
propDat <- apply(table(sce.asd$cluster, sce.asd$individual), 2, function(x){signif(prop.table(x),6)})
# Matching sex, binarized
ind.idx <- colnames(propDat)
sex.idx <- sce.asd$sex[match(ind.idx, sce.asd$individual)]
sex.idx <- as.numeric(sex.idx=="F")
regrStats.sex <- list()
for(i in levels(sce.asd$cluster)){
regrStats.sex[[i]] <- glm(sex.idx ~ propDat[i, ], family="binomial")
}
# P-values for regression on cell type prop.
sapply(regrStats.sex, function(x){coef(summary(x))["propDat[i, ]", "Pr(>|z|)"]})
# AST-FB AST-PP Endothelial IN-PV
# 0.8598569 0.6190065 0.2571417 0.1365944
# IN-SST IN-SV2C IN-VIP L2/3
# 0.2652023 0.7407465 0.8813374 0.5832419
# L4 L5/6 L5/6-CC Microglia
# 0.7443358 0.7411243 0.9355193 0.9933819
# Neu-mat Neu-NRGN-I Neu-NRGN-II Oligodendrocytes
# 0.6790237 0.9039778 0.7741251 0.7656764
# OPC
# 0.7381165
# (Btw marker statistics computed on:
# mod <- with(colData(sce.asd.pfc), model.matrix(~ diagnosis + age + sex + Capbatch + RIN))
# mod <- mod[ ,-1])
# --> don't think I want to include these additional predictors tho
## By simpler `lm(prop ~ sex + region + Dx)` ===
coldat.asd <- data.frame(sample=colnames(propDat))
coldat.asd$sex <- sce.asd$sex[match(coldat.asd$sample, sce.asd$individual)]
coldat.asd$region <- sce.asd$region[match(coldat.asd$sample, sce.asd$individual)]
coldat.asd$Dx <- sce.asd$diagnosis[match(coldat.asd$sample, sce.asd$individual)]
regrStats.sex.lm <- list()
for(i in levels(sce.asd$cluster)){
#regrStats.sex.lm[[i]] <- lm(propDat[i, ] ~ sex.idx)
regrStats.sex.lm[[i]] <- lm(propDat[i, ] ~ sex + region + Dx, data=coldat.asd)
}
# Exploring some results
t(sapply(regrStats.sex.lm, function(x){signif(coef(summary(x)),3)["sexM", ]}))
# Estimate Std. Error t value Pr(>|t|)
# AST-FB -0.004110 0.00831 -0.4940 0.625
# AST-PP -0.009340 0.02860 -0.3270 0.746
# Endothelial 0.005510 0.00625 0.8820 0.386
# IN-PV 0.009420 0.00693 1.3600 0.186
# IN-SST 0.009490 0.00751 1.2600 0.217
# IN-SV2C 0.000296 0.00350 0.0846 0.933
# IN-VIP -0.001570 0.00995 -0.1580 0.876
# L2/3 -0.015300 0.02020 -0.7590 0.454
# L4 -0.004040 0.01970 -0.2050 0.839
# L5/6 -0.002470 0.00682 -0.3630 0.720
# L5/6-CC -0.001750 0.00926 -0.1890 0.852
# Microglia 0.006170 0.01250 0.4920 0.626
# Neu-mat -0.011100 0.02060 -0.5400 0.593
# Neu-NRGN-I -0.004480 0.01510 -0.2970 0.769
# Neu-NRGN-II 0.004500 0.02890 0.1560 0.877
# Oligodendrocytes 0.020900 0.04410 0.4750 0.638
# OPC -0.002140 0.01570 -0.1360 0.893
## These two seem to recapitulate the comment on AST-PP associated with Dx:
# (or other observations - see Fig. S1G and S1H)
t(sapply(regrStats.sex.lm, function(x){signif(coef(summary(x)),3)["DxControl", ]}))
# Estimate Std. Error t value Pr(>|t|)
# AST-FB -6.92e-04 0.00686 -0.1010 0.9200
# AST-PP -5.62e-02 0.02360 -2.3800 0.0244
# Endothelial 8.03e-03 0.00516 1.5600 0.1310
# IN-PV 3.75e-03 0.00572 0.6550 0.5180
# IN-SST 9.61e-03 0.00620 1.5500 0.1330
# IN-SV2C -4.19e-05 0.00289 -0.0145 0.9890
# IN-VIP -5.13e-03 0.00822 -0.6240 0.5380
# L2/3 -2.54e-02 0.01670 -1.5200 0.1390
# L4 -4.98e-03 0.01630 -0.3060 0.7620
# L5/6 -4.22e-04 0.00563 -0.0749 0.9410
# L5/6-CC -6.50e-03 0.00765 -0.8500 0.4030
# Microglia 6.13e-03 0.01030 0.5920 0.5590
# Neu-mat 1.08e-02 0.01700 0.6350 0.5310
# Neu-NRGN-I 5.29e-03 0.01250 0.4250 0.6740
# Neu-NRGN-II 1.67e-02 0.02390 0.6990 0.4910
# Oligodendrocytes 5.17e-02 0.03640 1.4200 0.1670
# OPC -1.26e-02 0.01290 -0.9750 0.3380
t(sapply(regrStats.sex.lm, function(x){signif(coef(summary(x)),3)["regionPFC", ]}))
# Estimate Std. Error t value Pr(>|t|)
# AST-FB -0.01330 0.00703 -1.900 0.06860
# AST-PP 0.05150 0.02420 2.130 0.04250
# Endothelial -0.01700 0.00529 -3.210 0.00337
# IN-PV -0.00567 0.00587 -0.965 0.34300
# IN-SST 0.00214 0.00636 0.337 0.73900
# IN-SV2C -0.00411 0.00297 -1.390 0.17700
# IN-VIP -0.01350 0.00842 -1.600 0.12200
# L2/3 -0.01360 0.01710 -0.797 0.43200
# L4 0.01190 0.01670 0.711 0.48300
# L5/6 -0.00184 0.00577 -0.319 0.75200
# L5/6-CC -0.01020 0.00784 -1.300 0.20400
# Microglia 0.02860 0.01060 2.700 0.01180
# Neu-mat -0.01910 0.01740 -1.100 0.28300
# Neu-NRGN-I -0.01590 0.01280 -1.250 0.22300
# Neu-NRGN-II -0.02300 0.02450 -0.942 0.35500
# Oligodendrocytes 0.02340 0.03730 0.628 0.53500
# OPC 0.01970 0.01330 1.490 0.14800
# Just print results for sex predictor:
asd.lm.sex <- t(sapply(regrStats.sex.lm, function(x){signif(coef(summary(x)),3)["sexM", ]}))
write.table(asd.lm.sex, file="tables/revision/suppForReviewers_Velmeshev-etal_clusterProp-lm-on-sex_MNT2021.tsv",
sep="\t", row.names=T, col.names=T)
### And for Mathys, et al ============
load("rdas/referenceDatasets/SCE_mathys-PFC-BA10_MNT.rda", verbose=T)
# sce.mathys
colnames(colData(sce.mathys))
# [1] "projid" "pre.cluster" "broad.cell.type" "Subcluster"
# [5] "individualID" "Dx" "age_death" "msex"
# [9] "race" "sizeFactor"
table(sce.mathys$Subcluster, sce.mathys$msex)
# Pretty even distribution, by eye
# Do the same thing as above
propDat <- apply(table(sce.mathys$Subcluster, sce.mathys$individualID), 2, function(x){signif(prop.table(x),6)})
# Matching sex, binarized
ind.idx <- colnames(propDat)
sex.idx <- sce.mathys$msex[match(ind.idx, sce.mathys$individualID)]
sex.idx <- as.numeric(sex.idx=="0")
regrStats.sex <- list()
for(i in sort(unique(sce.mathys$Subcluster))){
regrStats.sex[[i]] <- glm(sex.idx ~ propDat[i, ], family="binomial")
}
# P-values for regression on cell type prop.
sapply(regrStats.sex, function(x){signif(coef(summary(x)),6)["propDat[i, ]", "Pr(>|z|)"]})
# Ast0 Ast1 Ast2 Ast3 End1 End2 Ex0 Ex1
# 0.7961580 0.8174390 0.7589670 0.9583130 0.5789900 0.4910990 0.6330960 0.0545296
# Ex11 Ex12 Ex14 Ex2 Ex3 Ex4 Ex5 Ex6
# 0.4111320 0.6637870 0.2281180 0.7249140 0.1487160 0.0754658 0.1397110 0.4928440
# Ex7 Ex8 Ex9 In0 In1 In10 In11 In2
# 0.3383150 0.1877990 0.8266440 0.3886890 0.0700323 0.0788781 0.4623090 0.1488410
# In3 In4 In5 In6 In7 In8 In9 Mic0
# 0.2427360 0.2654090 0.1671240 0.7676490 0.2821590 0.6259190 0.9835870 0.7097980
# Mic1 Mic2 Mic3 Oli0 Oli1 Oli3 Oli4 Oli5
# 0.9702780 0.5932430 0.2823240 0.3986670 0.1447250 0.1929830 0.5829920 0.9261180
# Opc0 Opc1 Opc2 Per
# 0.8286980 0.1100380 0.5692740 0.7238730
ps.logisticReg <- sapply(regrStats.sex, function(x){signif(coef(summary(x)),6)["propDat[i, ]", "Pr(>|z|)"]})
## By simpler `lm(prop ~ sex)` =============
coldat.ad <- data.frame(sample=colnames(propDat))
coldat.ad$sex <- sce.mathys$msex[match(coldat.ad$sample, sce.mathys$individualID)]
coldat.ad$Dx <- sce.mathys$Dx[match(coldat.ad$sample, sce.mathys$individualID)]
regrStats.sex.lm.ad <- list()
for(i in sort(unique(sce.mathys$Subcluster))){
regrStats.sex.lm.ad[[i]] <- lm(propDat[i, ] ~ sex + Dx, data=coldat.ad)
}
# P-values for regression on cell type prop.
sapply(regrStats.sex.lm.ad, function(x){signif(coef(summary(x)),6)["sex", "Pr(>|t|)"]})
ps.lm.ad <- sapply(regrStats.sex.lm.ad, function(x){signif(coef(summary(x)),6)["sex", "Pr(>|t|)"]})
# All > 0.05 except 'Ex1': 0.0484782
# Exploring some results
t(sapply(regrStats.sex.lm.ad, function(x){signif(coef(summary(x)),3)["sex", ]}))
table(t(sapply(regrStats.sex.lm.ad, function(x){signif(coef(summary(x)),3)["sex", ]}))[, "Pr(>|t|)"] <= 0.05)
# none
# Dx effect
t(sapply(regrStats.sex.lm.ad, function(x){signif(coef(summary(x)),3)["DxAD", ]}))
which(t(sapply(regrStats.sex.lm.ad, function(x){signif(coef(summary(x)),3)["DxAD", ]}))[, "Pr(>|t|)"] <= 0.05)
# one, 'Ast2' (p-value = 0.027)
ps.Dx.ad <- t(sapply(regrStats.sex.lm.ad, function(x){signif(coef(summary(x)),3)["DxAD", ]}))[ ,"Pr(>|t|)"]
p.adjust(ps.Dx.ad, method="BH")
# none after FDR correction. Authors also do some interesting overrepresentation tests...
# not sure how much to expect this recaps that, but since we're just testing for sex:
# Print results for sex predictor:
ad.lm.sex <- t(sapply(regrStats.sex.lm.ad, function(x){signif(coef(summary(x)),3)["sex", ]}))
write.table(ad.lm.sex, file="tables/revision/suppForReviewers_Mathys-etal_clusterProp-lm-on-sex_MNT2021.tsv",
sep="\t", row.names=T, col.names=T)
## Median gene capture per cell type? ===
# NAc
load("rdas/regionSpecific_NAc-ALL-n5_cleaned-combined_SCE_MNTMar2020.rda", verbose=T)
# sce.nac.all, chosen.hvgs.nac.all, pc.choice.nac.all, clusterRefTab.nac.all, ref.sampleInfo
sce.nac.all <- sce.nac.all[ ,-which(sce.nac.all$cellType.final=="ambig.lowNtrxts")]
sce.nac.all$cellType.final <- droplevels(sce.nac.all$cellType.final)
cell.idx <- splitit(sce.nac.all$cellType.final)
sapply(cell.idx, function(x){quantile(sce.nac.all$detected[x])})
# Astro Inhib.1 Inhib.2 Inhib.3 Inhib.4 Micro MSN.D1.1 MSN.D1.2 MSN.D1.3
# 0% 473 2782 2917.00 1980.00 1047.0 836 1021.00 1508 1632.0
# 25% 2729 5205 4066.75 4872.50 4280.5 1706 4513.25 4385 4919.0
# 50% 3147 5974 4723.50 5480.00 4896.0 2190 5005.50 4933 5363.0
# 75% 3587 6823 6219.00 6226.25 5445.0 2720 5577.00 5342 5783.5
# 100% 6526 9332 8298.00 9561.00 9021.0 3792 7846.00 7743 8378.0
# MSN.D1.4 MSN.D2.1 MSN.D2.2 Oligo OPC
# 0% 1372 800.00 1240.0 303.0 910.0
# 25% 5480 5104.75 5258.0 1689.5 2884.0
# 50% 5972 5603.50 5724.0 2216.0 3300.0
# 75% 6468 6188.25 6186.5 2767.5 3708.5
# 100% 9874 9108.00 9917.0 6129.0 6325.0
apply(table(sce.nac.all$cellType.final, sce.nac.all$donor), 2, function(x){round(prop.table(x),2)})
# Br5161 Br5182 Br5207 Br5212 Br5287
# Astro 0.07 0.00 0.00 0.22 0.02
# Inhib.1 0.00 0.00 0.00 0.00 0.00
# Inhib.2 0.00 0.01 0.00 0.00 0.00
# Inhib.3 0.00 0.02 0.04 0.00 0.01
# Inhib.4 0.00 0.02 0.01 0.00 0.01
# Micro 0.04 0.00 0.00 0.04 0.05
# MSN.D1.1 0.00 0.03 0.00 0.00 0.00
# MSN.D1.2 0.00 0.07 0.00 0.00 0.00
# MSN.D1.3 0.01 0.09 0.07 0.00 0.01
# MSN.D1.4 0.09 0.35 0.41 0.10 0.11
# MSN.D2.1 0.00 0.03 0.03 0.00 0.00
# MSN.D2.2 0.02 0.38 0.42 0.07 0.01
# Oligo 0.71 0.00 0.00 0.49 0.73
# OPC 0.05 0.00 0.00 0.06 0.05
# AMY
load("rdas/regionSpecific_Amyg-n2_cleaned-combined_SCE_MNTFeb2020.rda", verbose=T)
# sce.amy, chosen.hvgs.amy, pc.choice.amy, clusterRefTab.amy, ref.sampleInfo
sce.amy <- sce.amy[ ,-which(sce.amy$cellType.split=="Ambig.lowNtrxts")]
sce.amy$cellType.split <- droplevels(sce.amy$cellType.split)
cell.idx <- splitit(sce.amy$cellType.split)
sapply(cell.idx, function(x){quantile(sce.amy$detected[x])})
# Astro Excit.1 Excit.2 Excit.3 Inhib.1 Inhib.2 Inhib.3 Inhib.4 Inhib.5
# 0% 460.00 634.00 1990.00 2722 1197.0 2566 3025.0 3233.00 629
# 25% 2926.25 7426.00 3257.75 4506 6623.5 6458 6809.0 5208.00 6480
# 50% 3567.00 8389.00 3672.50 4876 7420.0 7277 7378.0 6098.00 7342
# 75% 4383.50 9096.25 3992.00 6054 8177.5 7994 7946.5 7019.25 7853
# 100% 6609.00 11625.00 5072.00 8118 10176.0 9158 8734.0 7936.00 9368
# Micro Oligo OPC
# 0% 217 600 1223.0
# 25% 1800 2044 3382.5
# 50% 2370 2555 3921.0
# 75% 2749 3138 4422.5
# 100% 4271 5607 6378.0
# Broadly
quantile(sce.amy$detected)
# 0% 25% 50% 75% 100%
# 217.00 2225.00 2934.50 3911.75 11625.00
## AMY vs. mouse MeA ===
load("/dcl01/ajaffe/data/lab/singleCell/ucla_mouse-MeA/2019Cell/SCE_mouse-MeA_downstream-processing_MNT.rda", verbose=T)
# sce.amy.mm, chosen.hvgs.amy.mm
table(sce.amy.mm$subCluster)
## Dist of genes expressed?
sce.amy.mm$detected <- apply(assay(sce.amy.mm,"logcounts"), 2, function(x){table(x > 0)["TRUE"]})
subClust.idx <- splitit(sce.amy.mm$subCluster)
sapply(subClust.idx, function(x){quantile(sce.amy.mm$detected[x])[c("25%", "75%")]})
# AS EN MG MU N.1 N.2 N.3 N.4 N.5 N.6 N.7
# 25% 273 353.5 312 412.25 241.00 249 285.00 264.00 505.25 265.5 834.5
# 75% 1043 1106.5 1123 1210.75 813.25 705 888.75 765.75 1450.00 1148.0 2178.5
#
# N.8 N.9 N.10 N.11 N.12 N.13 N.14 N.15 N.16 OL OPC OPC.OL
# 25% 365.75 320.50 226 379.00 514 228 245.5 230 253 696 639.00 454.0
# 75% 1245.50 961.75 479 1802.25 2001 374 452.0 354 474 1864 2181.75 2708.5
# Broadly
quantile(sce.amy.mm$detected)
# 0% 25% 50% 75% 100%
# 177 299 612 1299 8547
# For some reason, these were actually somewhat ordered by n Genes detected...
head(sce.amy.mm$detected, n=40)
tail(sce.amy.mm$detected, n=40)
apply(assay(sce.amy.mm,"logcounts")[ ,1:20], 2, function(x){table(x > 0)["TRUE"]})
apply(assay(sce.amy.mm,"logcounts")[ ,43316:43345], 2, function(x){table(x > 0)["TRUE"]})
# oh they're not. They're ordered by $sample. So they're quite batch effect-y.....
table(sce.amy.mm$sample)
sce.amy.mm$sample[1:40]
sce.amy.mm$sample[3258:3269]
3261+ 3540 +2188 +4913+ 4149+ 2960 # n males
#[1] 21011
sce.amy.mm$sample[21000:21040]
# [1] "M6" "M6" "M6" "M6" "M6" "M6" "M6" "M6" "M6" "M6" "M6" "M6" "F1" "F1" "F1"
# [16] "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1"
# [31] "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1" "F1"
# This is strange:
sample.idx <- splitit(sce.amy.mm$sample)
sapply(sample.idx, function(x){quantile(sce.amy.mm$detected[x])})
# AHhhhhh:
sce.amy.mm$detected[21000:21040]
# ** so these data are ordered by $sample, and then ~ in descending order of detected # genes
# (or maybe capture). Curious.
## Concordance b/tw region-specific and pan-brain broad annotation ===
load("/dcs04/lieber/marmaypag/Tran_LIBD001/Matt/MNT_thesis/snRNAseq/10x_pilot_FINAL/rdas/all-FACS-homogenates_n12_PAN-BRAIN-Analyses_MNTFeb2020.rda",
verbose=T)
# sce.all.n12, chosen.hvgs.all.n12, pc.choice.n12, ref.sampleInfo, clusterRefTab.all.n12
# Getting rid of sub-cell types in sACC sample and the pan-brain assignments to compare
table(gsub("MSN","Inhib",ss(as.character(sce.all.n12$cellType.RS),"\\.",1)) ==
ss(as.character(sce.all.n12$cellType),"\\.",1))
# FALSE TRUE
# 866 33204 - 33204/(866+ 33204) = 97.5% concordant
# Region specific
table(ss(as.character(sce.all.n12$cellType.RS),"\\.",1))
# Ambig Astro Excit Inhib Micro MSN Oligo OPC Tcell
# 445 3864 2927 2019 2956 642 18664 2527 26
table( ss(as.character(sce.all.n12$cellType),"\\.",1))
#Ambig Astro Excit Inhib Micro Oligo OPC
# 32 3828 2848 3110 3077 18614 2561
# What's the broad assignment of the .RS 'Ambig's??
table(ss(as.character(sce.all.n12$cellType.RS),"\\.",1))
ambig.idx <- which(ss(as.character(sce.all.n12$cellType.RS),"\\.",1)=="Ambig")
table(sce.all.n12$cellType[ambig.idx])
# Ambig.hiVCAN Astro.1 Astro.2 Excit.1 Excit.2 Excit.3
# 0 0 2 0 0 0
# Excit.4 Excit.5 Excit.6 Excit.7 Excit.8 Inhib.1
# 0 0 0 0 0 368 - that's interesting lol
# Inhib.2 Inhib.3 Inhib.4 Inhib.5 Micro Oligo (out of 950)
# 2 0 0 0 66 1
# OPC
# 6
panClust.idx <- splitit(sce.all.n12$cellType)
sapply(panClust.idx, function(x){quantile(sce.all.n12$sum[x])})
# Inhib.1 median/IQR is lower than all the other neuronal subclusters...
# start to wonder if a lot of this cluster is driven by this technical artifact
# Let's remove those
sce.all.sub <- sce.all.n12[ ,-ambig.idx]
table(gsub("MSN","Inhib",ss(as.character(sce.all.sub$cellType.RS),"\\.",1)) ==
ss(as.character(sce.all.sub$cellType),"\\.",1))
# FALSE TRUE
# 421 33204
33204/(33204+421) # 98.7% (and if you remove that weird 32 'Ambig.hiVCAN' -> 98.8% concordant)