-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathFeaturePred.V2.0.R
723 lines (594 loc) · 32.1 KB
/
FeaturePred.V2.0.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
#!/usr/bin/Rscript
# This script was written by Oliver Pain.
start.time <- Sys.time()
suppressMessages(library("optparse"))
option_list = list(
make_option("--PLINK_prefix_chr", action="store", default=NA, type='character',
help="Path to per chromosome PLINK binaries [required]"),
make_option("--weights", action="store", default=NA, type='character',
help="Path for .pos file describing features [required]"),
make_option("--weights_dir", action="store", default=NA, type='character',
help="Directory containing the weights corresponding to the features in the .pos file [required]"),
make_option("--ref_ld_chr", action="store", default=NA, type='character',
help="Path to FUSION 1KG reference [required]"),
make_option("--n_cores", action="store", default=1, type='numeric',
help="Specify the number of cores available [required]"),
make_option("--score_files", action="store", default=NA, type='character',
help="Path to SCORE files corresponding to weights [optional]"),
make_option("--ref_expr", action="store", default=NA, type='character',
help="Path to reference expression data [optional]"),
make_option("--memory", action="store", default=2000, type='numeric',
help="RAM available in MB [required]"),
make_option("--plink", action="store", default=NA, type='character',
help="Path to PLINK software [required]"),
make_option("--save_score", action="store", default='T', type='logical',
help="Save SCORE files [optional]"),
make_option("--save_ref_expr", action="store", default='T', type='logical',
help="Save reference expression data [optional]"),
make_option("--output", action="store", default=NA, type='character',
help="Name of output directory [required]"),
make_option("--all_mod", action="store", default=F, type='logical',
help="Set to T for predictions based on all TWAS models for each gene, rather than the best model alone [required]"),
make_option("--pigz", action="store", default=NA, type='character',
help="Path to pigz binary [required]"),
make_option("--targ_pred", action="store", default=T, type='logical',
help="Set to F to create SCORE file and expression reference only [optional]"),
make_option("--ref_maf", action="store", default=NA, type='character',
help="Path to per chromosome PLINK .frq files [required]"),
make_option("--chr", action="store", default=NA, type='numeric',
help="Specify chromosome number [optional]")
)
opt = parse_args(OptionParser(option_list=option_list))
system(paste('mkdir -p ',opt$output))
if(is.na(opt$chr)){
CHROMS<-1:22
} else {
CHROMS<-opt$chr
}
sink(file = paste(opt$output,'/FeaturePredictions.log',sep=''), append = F)
cat(
'#################################################################
# FeaturePred V2.0
# V1.0 20/02/2019
#################################################################
Options are:\n')
print(opt)
cat('Analysis started at',as.character(start.time),'\n')
sink()
if(opt$targ_pred == T){
if(is.na(opt$PLINK_prefix_chr)){
stop('PLINK_prefix_chr has not been specified.\n')
}
missing_target_files <- paste0(opt$PLINK_prefix_chr, CHROMS, '.bed')[!file.exists(paste0(opt$PLINK_prefix_chr, CHROMS, '.bed'))]
if (length(missing_target_files) > 0) {
cat(paste0(missing_target_files, collapse = "\n"), "\n")
stop("The following --PLINK_prefix_chr files are missing:", paste0(missing_target_files, collapse= ', '), "\n")
}
}
if(is.na(opt$ref_expr)){
if(is.na(opt$ref_ld_chr)){
stop('--ref_ld_chr must be specified.')
}
missing_ref_files <- paste0(opt$ref_ld_chr, CHROMS, '.bed')[!file.exists(paste0(opt$ref_ld_chr, CHROMS, '.bed'))]
if (length(missing_ref_files) > 0) {
cat(paste0(missing_ref_files, collapse = "\n"), "\n")
stop("The following --ref_ld_chr files are missing:", paste0(missing_ref_files, collapse= ', '), "\n")
}
}
if(is.na(opt$weights)){
stop('--weights must be specified.')
}
if(!file.exists(opt$weights)){
stop('--weights files does not exist.')
}
if(is.na(opt$plink)){
stop('--plink must be specified.\n')
}
plink_error<-system(paste0(opt$plink),ignore.stdout=T, ignore.stderr=T)
if(plink_error == 127){
stop('--plink cannot be found. Check the path for plink software.\n')
} else {
plink_log<-system(paste0(opt$plink,' --help --noweb'),intern=T)
if(length(grep('PLINK v1.9',plink_log[1])) == 0) {
cat('\nWarning: Check you are using PLINK v1.9!\n\n')
system(paste0('rm plink.log'))
}
}
# Check for required frq files
if(opt$targ_pred == T | is.na(opt$ref_expr)){
missing_frq_files <- paste0(opt$ref_maf, CHROMS, '.frq')[!file.exists(paste0(opt$ref_maf, CHROMS, '.frq'))]
if (length(missing_frq_files) > 0) {
cat(paste0(missing_frq_files, collapse = "\n"), "\n")
stop("The following --ref_maf files are missing:", paste0(missing_frq_files, collapse= ', '), "\n")
}
}
if(is.na(opt$pigz)){
stop('--pigz must be specified.\n')
}
pigz_error<-system(paste0(opt$pigz, ' -h'),ignore.stdout=T, ignore.stderr=T)
if(pigz_error == 127){
stop('--pigz cannot be found. Check the path for pigz software.\n')
}
library(data.table)
library(foreach)
library(doMC)
registerDoMC(opt$n_cores)
###################################
# Check SNP overlap between FUSION reference and target dataset
###################################
if(opt$targ_pred == T){
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Harmonsising reference and target data...',sep='')
sink()
# Read in the SNPs in reference sample
Ref<-NULL
for(i in 1:22){
Ref<-rbind(Ref, fread(paste0(opt$ref_ld_chr,i,'.bim')))
}
if(sum(duplicated(Ref$V2)) > 0){
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Duplicates are present in the reference data.\n')
sink()
q()
}
Target<-NULL
for(i in 1:22){
Target<-rbind(Target, fread(paste0(opt$PLINK_prefix_chr,i,'.bim')))
}
if(sum(duplicated(Ref$V2)) > 0){
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Duplicates are present in the target data.\n')
sink()
q()
}
# Idenitfy SNPs that are in both datasets and have matching allele codes
Ref$IUPAC<-NA
Ref$IUPAC[Ref$V5 == 'A' & Ref$V6 =='T' | Ref$V5 == 'T' & Ref$V6 =='A']<-'W'
Ref$IUPAC[Ref$V5 == 'C' & Ref$V6 =='G' | Ref$V5 == 'G' & Ref$V6 =='C']<-'S'
Ref$IUPAC[Ref$V5 == 'A' & Ref$V6 =='G' | Ref$V5 == 'G' & Ref$V6 =='A']<-'R'
Ref$IUPAC[Ref$V5 == 'C' & Ref$V6 =='T' | Ref$V5 == 'T' & Ref$V6 =='C']<-'Y'
Ref$IUPAC[Ref$V5 == 'G' & Ref$V6 =='T' | Ref$V5 == 'T' & Ref$V6 =='G']<-'K'
Ref$IUPAC[Ref$V5 == 'A' & Ref$V6 =='C' | Ref$V5 == 'C' & Ref$V6 =='A']<-'M'
if(sum(is.na(Ref$IUPAC)) > 0){
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat(sum(is.na(Ref$IUPAC)),"non-SNP variants removed from reference data (Ref_non_SNPs.txt).\n")
sink()
write.table(Ref[is.na(Ref$IUPAC),], paste0(opt$output,'/Ref_non_SNPs.txt'), col.names=T, row.names=F, quote=F)
Ref<-Ref[!is.na(Ref$IUPAC),]
}
Target$IUPAC<-NA
Target$IUPAC[Target$V5 == 'A' & Target$V6 =='T' | Target$V5 == 'T' & Target$V6 =='A']<-'W'
Target$IUPAC[Target$V5 == 'C' & Target$V6 =='G' | Target$V5 == 'G' & Target$V6 =='C']<-'S'
Target$IUPAC[Target$V5 == 'A' & Target$V6 =='G' | Target$V5 == 'G' & Target$V6 =='A']<-'R'
Target$IUPAC[Target$V5 == 'C' & Target$V6 =='T' | Target$V5 == 'T' & Target$V6 =='C']<-'Y'
Target$IUPAC[Target$V5 == 'G' & Target$V6 =='T' | Target$V5 == 'T' & Target$V6 =='G']<-'K'
Target$IUPAC[Target$V5 == 'A' & Target$V6 =='C' | Target$V5 == 'C' & Target$V6 =='A']<-'M'
if(sum(is.na(Target$IUPAC)) > 0){
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat(sum(is.na(Target$IUPAC)),'non-SNP variants removed from target data (Targ_non_SNPs.txt).\n')
sink()
write.table(Target[is.na(Target$IUPAC),], paste0(opt$output,'/Targ_non_SNPs.txt'), col.names=T, row.names=F, quote=F)
Target<-Target[!is.na(Target$IUPAC),]
}
Ref_Target<-merge(Ref, Target, by='V2')
n_flip<-sum(Ref_Target$IUPAC.x == 'R' & Ref_Target$IUPAC.y == 'Y' |
Ref_Target$IUPAC.x == 'Y' & Ref_Target$IUPAC.y == 'R' |
Ref_Target$IUPAC.x == 'K' & Ref_Target$IUPAC.y == 'M' |
Ref_Target$IUPAC.x == 'M' & Ref_Target$IUPAC.y == 'K' )
if(n_flip > 0){
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat(n_flip,'SNPs will be flipped!\n')
sink()
flip_list<- Ref_Target$V2[Ref_Target$IUPAC.x == 'R' & Ref_Target$IUPAC.y == 'Y' |
Ref_Target$IUPAC.x == 'Y' & Ref_Target$IUPAC.y == 'R' |
Ref_Target$IUPAC.x == 'K' & Ref_Target$IUPAC.y == 'M' |
Ref_Target$IUPAC.x == 'M' & Ref_Target$IUPAC.y == 'K' ]
write.table(flip_list, paste0(opt$output,'/flip.snplist'), col.names=F, row.names=F, quote=F)
}
# Idenitfy variants that match
incl<-Ref_Target$V2[ Ref_Target$IUPAC.x == 'R' & Ref_Target$IUPAC.y == 'R' |
Ref_Target$IUPAC.x == 'Y' & Ref_Target$IUPAC.y == 'Y' |
Ref_Target$IUPAC.x == 'K' & Ref_Target$IUPAC.y == 'K' |
Ref_Target$IUPAC.x == 'M' & Ref_Target$IUPAC.y == 'M' ]
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat(paste0('Done!\n'))
cat(paste0('Number of SNPs in FUSION LD Reference = ',length(Ref$V2),'.\n'))
cat(paste0('Number of SNPs in target PLINK files = ',length(Target$V2),'.\n'))
cat(paste0('Number of SNPs in both = ',length(incl),'.\n'))
cat(paste0('Percentage of SNPs in FUSION LD Reference that are in the target PLINK files = ',round(length(incl)/length(Ref$V2)*100,2),'%.\n'))
sink()
# Save list of SNPs that intersect the target and ref
fwrite(list(incl),paste0(opt$output,'/intersect.snplist'),col.names=F, na='NA', quote=F)
rm(Ref)
rm(Target)
rm(Ref_Target)
rm(incl)
gc()
}
###################################
# Convert FUSION weights into SCORE files
###################################
# Read in the .pos file
pos<-data.frame(fread(opt$weights, nThread=opt$n_cores))
if(all(names(pos) != 'PANEL')){
stop("--weight file does not contain PANEL column.\n")
}
pos<-pos[pos$CHR %in% CHROMS,]
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('The .pos file contains ',dim(pos)[1],' features.\n',sep='')
sink()
# Attach weights directory to WGT values in pos file
pos$FILE<-paste0(opt$weights_dir,'/',pos$PANEL,'/',sub('.*/','',pos$WGT))
# Remove .wgt.RDat from the WGT values
pos$WGT<-gsub('.wgt.RDat','',pos$WGT)
pos$WGT<-gsub('.*/','',pos$WGT)
if(is.na(opt$score_files)){
# Read in the SNPs in reference sample
Ref<-NULL
for(i in 1:22){
Ref<-rbind(Ref, fread(paste0(opt$ref_ld_chr,i,'.bim')))
}
Ref$IUPAC<-NA
Ref$IUPAC[Ref$V5 == 'A' & Ref$V6 =='T' | Ref$V5 == 'T' & Ref$V6 =='A']<-'W'
Ref$IUPAC[Ref$V5 == 'C' & Ref$V6 =='G' | Ref$V5 == 'G' & Ref$V6 =='C']<-'S'
Ref$IUPAC[Ref$V5 == 'A' & Ref$V6 =='G' | Ref$V5 == 'G' & Ref$V6 =='A']<-'R'
Ref$IUPAC[Ref$V5 == 'C' & Ref$V6 =='T' | Ref$V5 == 'T' & Ref$V6 =='C']<-'Y'
Ref$IUPAC[Ref$V5 == 'G' & Ref$V6 =='T' | Ref$V5 == 'T' & Ref$V6 =='G']<-'K'
Ref$IUPAC[Ref$V5 == 'A' & Ref$V6 =='C' | Ref$V5 == 'C' & Ref$V6 =='A']<-'M'
opt$score_files<-paste0(opt$output,'/SCORE_FILES')
system(paste0('mkdir -p ',opt$score_files))
# Create SCORE file for each set of weights using FUSION make_score.R
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Converting weights files into PLINK SCORE files...',sep='')
sink()
tmp<-foreach(i=1:length(pos$FILE), .combine=c) %dopar% {
load(pos$FILE[i])
# Resolve strand flips with reference
snps$IUPAC<-NA
snps$IUPAC[snps$V5 == 'A' & snps$V6 =='T' | snps$V5 == 'T' & snps$V6 =='A']<-'W'
snps$IUPAC[snps$V5 == 'C' & snps$V6 =='G' | snps$V5 == 'G' & snps$V6 =='C']<-'S'
snps$IUPAC[snps$V5 == 'A' & snps$V6 =='G' | snps$V5 == 'G' & snps$V6 =='A']<-'R'
snps$IUPAC[snps$V5 == 'C' & snps$V6 =='T' | snps$V5 == 'T' & snps$V6 =='C']<-'Y'
snps$IUPAC[snps$V5 == 'G' & snps$V6 =='T' | snps$V5 == 'T' & snps$V6 =='G']<-'K'
snps$IUPAC[snps$V5 == 'A' & snps$V6 =='C' | snps$V5 == 'C' & snps$V6 =='A']<-'M'
snps_ref<-merge(snps, Ref[,c('V2','V5','V6','IUPAC')], by='V2')
flip_list<- snps_ref$V2[snps_ref$IUPAC.x == 'R' & snps_ref$IUPAC.y == 'Y' |
snps_ref$IUPAC.x == 'Y' & snps_ref$IUPAC.y == 'R' |
snps_ref$IUPAC.x == 'K' & snps_ref$IUPAC.y == 'M' |
snps_ref$IUPAC.x == 'M' & snps_ref$IUPAC.y == 'K' ]
snp_allele_comp<-function(x=NA){
x_new<-x
x_new[x == 'A']<-'T'
x_new[x == 'T']<-'A'
x_new[x == 'G']<-'C'
x_new[x == 'C']<-'G'
x_new[!(x %in% c('A','T','G','C'))]<-NA
return(x_new)
}
snps$IUPAC<-NULL
snps$V5[snps$V2 %in% flip_list]<-snp_allele_comp(snps$V5[snps$V2 %in% flip_list])
snps$V6[snps$V2 %in% flip_list]<-snp_allele_comp(snps$V6[snps$V2 %in% flip_list])
# Remove models with invalid (all NA or invariant weights)
valid_snp_columns <- apply(wgt.matrix, 2, function(col) all(!is.na(col)))
non_invariant_columns <- apply(wgt.matrix, 2, function(col) length(unique(col[!is.na(col)])) > 1)
valid_models <- valid_snp_columns & non_invariant_columns
valid_models <- names(valid_models[valid_models])
cv.performance<-cv.performance[,colnames(cv.performance) %in% valid_models]
wgt.matrix<-wgt.matrix[,colnames(wgt.matrix) %in% valid_models]
if(opt$all_mod == F){
best = which.min(cv.performance[2,])
if ( names(best) == "top1" ) {
keep = which.max(wgt.matrix[,best]^2)
} else {
keep = 1:nrow(wgt.matrix)
}
write.table(format(cbind( (snps[,c(2,5,6)]) , wgt.matrix[,best])[keep,], digits=3), paste0(opt$score_files,'/',pos$WGT[i],'.SCORE') , quote=F , row.names=F , col.names=F , sep='\t' )
# Write out a snplist for each SCORE file to reduce compution time for scoring
system(paste0('cut -f 1 ',opt$score_files,'/',pos$WGT[i],'.SCORE > ',opt$score_files,'/',pos$WGT[i],'.snplist'))
} else {
for(mod in colnames(wgt.matrix)){
if (mod == "top1" ) {
wgt.matrix[-which.max(wgt.matrix[,mod]^2),mod]<-0
}
write.table(format(cbind((snps[,c(2,5,6)]) , wgt.matrix[,mod]),digits=3), paste0(opt$score_files,'/',pos$WGT[i],'.',mod,'.SCORE') , quote=F , row.names=F , col.names=F , sep='\t' )
}
# Write out a snplist for each SCORE file to reduce compution time for scoring
write.table(snps$V2, paste0(opt$score_files,'/',pos$WGT[i],'.snplist') , quote=F , row.names=F , col.names=F , sep='\t' )
}
}
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Done!\n',sep='')
sink()
} else {
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Using precomputed score files.\n',sep='')
sink()
}
###################################
# Predict features in the reference sample
###################################
# Read in reference fam file
ref_fam<-fread(paste0(opt$ref_ld_chr,'1.fam'), nThread=opt$n_cores)
ref_fam<-ref_fam[,1:2]
names(ref_fam)<-c('FID','IID')
if(is.na(opt$ref_expr)){
# Create directory for the PROFILE files
system(paste0('mkdir ',opt$output,'/REF_PROFILE_FILES'))
# Calculate profile scores (i.e. feature predictions)
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Predicting features in reference sample...\n',sep='')
sink()
# Create column IDs to be combined to the feature predictions.
write.table(ref_fam, paste0(opt$output,'/REF_PROFILE_FILES/REF.IDs'), col.names=T, row.names=F, quote=F, sep='\t')
error_table_all<-NULL
error_table<-foreach(i=1:length(pos$FILE), .combine=rbind) %dopar% {
if(opt$all_mod == F){
# Calculate feature predictions
error<-system(paste0(opt$plink,' --bfile ',opt$ref_ld_chr,pos$CHR[i],' --extract ',opt$score_files,'/',pos$WGT[i],'.snplist --allow-no-sex --read-freq ',opt$ref_maf,pos$CHR[i],'.frq --score ',opt$score_files,'/',pos$WGT[i],'.SCORE 1 2 4 --out ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],' --memory ', floor((opt$memory*0.4)/opt$n_cores)),ignore.stdout=T, ignore.stderr=T)
# Delete temporary files and extract feature prediction column to reduce disk space
system(paste0("awk '{print $6}' ",opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.profile | tail -n +2 > ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.profile_mini'),intern=T)
system(paste0("echo ",pos$PANEL,'.',pos$WGT[i]," | cat - ",opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.profile_mini > ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.profile_mini_tmp && mv ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.profile_mini_tmp ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.profile_mini'),intern=T)
system(paste0('rm ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.profile'),ignore.stdout=T, ignore.stderr=T)
system(paste0('rm ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.nosex'),ignore.stdout=T, ignore.stderr=T)
data.frame(N=i,Error=error)
} else {
load(pos$FILE[i])
error_tmp<-NULL
for(mod in colnames(wgt.matrix)){
# Calculate feature predictions
error<-system(paste0(opt$plink,' --bfile ',opt$ref_ld_chr,pos$CHR[i],' --extract ',opt$score_files,'/',pos$WGT[i],'.snplist --allow-no-sex --read-freq ',opt$ref_maf,pos$CHR[i],'.frq --score ',opt$score_files,'/',pos$WGT[i],'.',mod,'.SCORE 1 2 4 --out ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.',mod,' --memory ', floor((opt$memory*0.4)/opt$n_cores)),ignore.stdout=T, ignore.stderr=T)
if(file.exists(paste0(opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.',mod,'.profile'))){
# Delete temporary files and extract feature prediction column to reduce disk space
system(paste0("awk '{print $6}' ",opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.',mod,'.profile | tail -n +2 > ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.',mod,'.profile_mini'),intern=T)
system(paste0("echo ",pos$PANEL,'.',pos$WGT[i],'.',mod," | cat - ",opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.',mod,'.profile_mini > ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.',mod,'.profile_mini_tmp && mv ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.',mod,'.profile_mini_tmp ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.',mod,'.profile_mini'),intern=T)
system(paste0('rm ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.',mod,'.profile'),ignore.stdout=T, ignore.stderr=T)
system(paste0('rm ',opt$output,'/REF_PROFILE_FILES/',pos$WGT[i],'.',mod,'.nosex'),ignore.stdout=T, ignore.stderr=T)
error_tmp<-rbind(error_tmp, data.frame(N=i,Model=mod,Error=error))
}
}
error_tmp
}
}
# Split feature predictions into list of <1000 files, then paste each list of files in batches, and then past all batches.
system(paste0("find ", opt$output,"/REF_PROFILE_FILES -type f -name '*.profile_mini' | split -l 500 -a 4 -d - ",opt$output,"/profile_mini_lists"))
system(paste0("echo ", opt$output, "/REF_PROFILE_FILES/REF.IDs | cat - ",opt$output,"/profile_mini_lists0000 > ",opt$output,"/profile_mini_lists0000_temp && mv ",opt$output,"/profile_mini_lists0000_temp ",opt$output,"/profile_mini_lists0000"))
tmp<-foreach(k=list.files(path=opt$output, pattern="^profile_mini_lists*"), .combine=c) %dopar% {
exit_status<-system(paste0("paste $(cat ",opt$output,"/",k,") > ", opt$output,"/merge_",k))
if(exit_status != 0){
stop()
}
}
exit_status<-system(paste0("paste ", opt$output,"/merge_profile_mini_lists* > ", opt$output,"/REF_expr.txt"))
if(exit_status != 0){
stop()
}
# Delete temporary files
system(paste0("rm ",opt$output,'/profile_mini_lists*'))
system(paste0("rm ",opt$output,'/merge_profile_mini_lists*'))
system(paste0("rm ",opt$output,"/REF_PROFILE_FILES/*.profile_mini"))
# Output file containing list of features that couldn't be predicted.
error_table$Error[error_table$Error > 0]<-1
error_table<-error_table[error_table$Error > 0,]
if(dim(error_table)[1] > 0){
for(i in 1:dim(error_table)[1]){
k<-error_table$N[i]
error_table$ID[i]<-pos$WGT[k]
tmp1<-read.table(paste0(opt$output,'/REF_PROFILE_FILES/',pos$WGT[k],'.log'),sep='*')
NoValid<-sum(grepl('Error: No valid entries in --score file.',tmp1$V1))
error_table$Reason[i]<-'Error: No valid entries in --score file.'
}
error_table$Error<-NULL
error_table<-error_table[c('ID','Reason')]
error_table_all<-rbind(error_table_all,error_table)
write.table(error_table_all, paste0(opt$output,'/Prediction_failed.txt'), col.names=T, row.names=F, quote=T)
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat(paste0(dim(error_table_all)[1],' feature/s cannot not be predicted (',opt$output,'/Prediction_failed.txt)\n'))
sink()
rm(error_table)
rm(error_table_all)
gc()
}
system(paste0("rm ",opt$output,"/REF_PROFILE_FILES/*.log"))
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Done!\n')
sink()
} else {
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Using precomputed reference expression data.\n',sep='')
sink()
}
###################################
# Calculate the mean and SD of feature predictions in the reference
###################################
if(is.na(opt$ref_expr)){
REF_expr<-fread(paste0(opt$output,"/REF_expr.txt"), nThread=opt$n_cores)
ref_scale<-data.frame( ID=names(REF_expr[,-1:-2]),
Mean=sapply(REF_expr[,-1:-2], function(x) mean(x)),
SD=sapply(REF_expr[,-1:-2], function(x) sd(x)))
if(opt$save_ref_expr == T){
system(paste0('mkdir ',opt$output,'/Reference_Expression'))
fwrite(ref_scale, paste0(opt$output,'/Reference_Expression/Scale.txt'), sep=' ', nThread=opt$n_cores, na='NA')
REF_expr_scaled<-REF_expr
REF_expr_scaled[ , (names(REF_expr_scaled)[-1:-2]) := lapply(.SD, function(x) round(as.numeric(scale(x)),3)), .SDcols = (names(REF_expr_scaled)[-1:-2])]
for(panel in unique(pos$PANEL)){
REF_expr_scaled_pan<-REF_expr_scaled[,c(T,T,grepl(panel, names(REF_expr_scaled)[-1:-2])), with=FALSE]
fwrite(REF_expr_scaled_pan, paste0(opt$output,'/Reference_Expression/Reference_Expression_',panel,'.txt'), nThread = opt$n_cores, sep=' ', na='NA')
system(paste0(opt$pigz,' ',opt$output,'/Reference_Expression/Reference_Expression_',panel,'.txt'))
}
}
system(paste0('rm ',opt$output,'/REF_expr.txt'))
rm(REF_expr)
rm(REF_expr_scaled)
gc()
} else {
ref_scale<-data.frame(fread(paste0(opt$ref_expr,'/Scale.txt'), nThread=opt$n_cores, fill=T))
}
if(opt$targ_pred == T){
###################################
# Extract single individual from reference to be merged with the target to insert missing SNPs.
###################################
write.table(ref_fam[1,], paste0(opt$output,'/ref_keep'), col.names=F, row.names=F, quote=F)
rm(ref_fam)
gc()
foreach(chr=CHROMS, .combine=c) %dopar% {
system(paste0(opt$plink,' --bfile ',opt$ref_ld_chr,chr,' --keep ',opt$output,'/ref_keep --make-bed --out ',opt$output,'/ref_indiv_chr',chr,' --memory ', floor((opt$memory*0.4)/opt$n_cores)),ignore.stdout=T, ignore.stderr=T)
# Update ref indiv ID to be distinct from target samples
ref_fam<-fread(paste0(opt$output,'/ref_indiv_chr',chr,'.fam'))
ref_fam$V1<-paste0('REF_',ref_fam$V1)
ref_fam$V2<-paste0('REF_',ref_fam$V2)
write.table(ref_fam, paste0(opt$output,'/ref_indiv_chr',chr,'.fam'), col.names=F, row.names=F, quote=F)
rm(ref_fam)
gc()
}
###################################
# Calculate feature predictions in the target sample
###################################
# Crate a directory to store the predicted expression
system(paste0('mkdir ',opt$output,'/TARG_PROFILE_FILES'))
# Create an file containing IDs to be merged with predicted expression
targ_fam<-fread(paste0(opt$PLINK_prefix_chr,'1.fam'), nThread=opt$n_cores)
targ_fam_id<-targ_fam[,1:2]
names(targ_fam_id)<-c('FID','IID')
write.table(targ_fam_id, paste0(opt$output,'/TARG_PROFILE_FILES/TARG.IDs'), col.names=T, row.names=F, quote=F, sep=' ')
rm(targ_fam)
rm(targ_fam_id)
gc()
# Calculate and process gene expression by chromosome to avoid large memory requirement.
for(chr in CHROMS[CHROMS %in% unique(pos$CHR)]){
# Create a directory for predicted expression of genes on chromosome
system(paste0('mkdir ',opt$output,'/TARG_PROFILE_FILES/chr',chr))
# Remove any duplicate or 3+ allele variants
system(paste0(opt$plink,' --bfile ',opt$PLINK_prefix_chr,chr,' --bmerge ',opt$PLINK_prefix_chr,chr,' --merge-mode 6 --make-bed --out ',opt$output,'/targ_chr',chr,'_noDup'))
if(file.exists(paste0(opt$output,'/targ_chr',chr,'_noDup.missnp'))){
missnp<-fread(paste0(opt$output,'/targ_chr',chr,'_noDup.missnp'), header=F)$V1
log_chr<-readLines(paste0(opt$output,'/targ_chr',chr,'_noDup.log'))
log_chr<-log_chr[grepl("Warning: Multiple positions seen for variant", log_chr)]
log_chr<-unique(gsub("'\\.","",gsub(".*variant '","",log_chr)))
missnp<-c(missnp, log_chr)
write.table(missnp, paste0(opt$output,'/targ_chr',chr,'_noDup.missnp'), col.names=F, row.names=F, quote=F)
# Extract SNPs in reference and flip SNPs if necessary.
# And remove any duplicate 3+ allele variants
system(paste0(opt$plink,' --bfile ',opt$PLINK_prefix_chr,chr,' --make-bed --exclude ',opt$output,'/targ_chr',chr,'_noDup.missnp --out ',opt$output,'/targ_chr',chr,' --memory ',floor((opt$memory*0.4))))
if(n_flip > 0){
system(paste0(opt$plink,' --bfile ',opt$output,'/targ_chr',chr,' --make-bed --extract ',opt$output,'/intersect.snplist --flip ',opt$output,'/flip.snplist --out ',opt$output,'/targ_chr',chr,' --memory ',floor((opt$memory*0.4))))
} else {
system(paste0(opt$plink,' --bfile ',opt$output,'/targ_chr',chr,' --make-bed --extract ',opt$output,'/intersect.snplist --out ',opt$output,'/targ_chr',chr,' --memory ',floor((opt$memory*0.4))))
}
} else {
# Extract SNPs in reference and flip SNPs if necessary.
if(n_flip > 0){
system(paste0(opt$plink,' --bfile ',opt$PLINK_prefix_chr,chr,' --make-bed --extract ',opt$output,'/intersect.snplist --flip ',opt$output,'/flip.snplist --out ',opt$output,'/targ_chr',chr,' --memory ',floor((opt$memory*0.4))))
} else {
system(paste0(opt$plink,' --bfile ',opt$PLINK_prefix_chr,chr,' --make-bed --extract ',opt$output,'/intersect.snplist --out ',opt$output,'/targ_chr',chr,' --memory ',floor((opt$memory*0.4))))
}
}
# Merge and remove single individual with the target data to insert missing reference SNPs.
system(paste0(opt$plink,' --bfile ',opt$output,'/targ_chr',chr,' --bmerge ',opt$output,'/ref_indiv_chr',chr,' --make-bed --out ',opt$output,'/ref_targ_chr',chr,' --memory ',floor((opt$memory*0.4))))
system(paste0(opt$plink,' --bfile ',opt$output,'/ref_targ_chr',chr,' --remove ',opt$output,'/ref_indiv_chr',chr,'.fam --make-bed --out ',opt$output,'/targ_chr',chr,' --memory ',floor((opt$memory*0.4))))
# Delete the temporary files
system(paste0('rm ',opt$output,'/ref_targ_chr*'))
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Predicting features in chromosome ',chr,'...',sep='')
sink()
# Subset features on the chromosome
pos_chr<-pos[pos$CHR == chr,]
# Predict each panel seperately to avoid large files when many panels in one run.
for(panel in unique(pos_chr$PANEL)){
pos_chr_panel<-pos_chr[pos_chr$PANEL == panel,]
TARG_expr<-foreach(i=1:length(pos_chr_panel$FILE), .combine=cbind) %dopar% {
if(opt$all_mod == F){
# Calculate feature predictions
tmp<-system(paste0(opt$plink,' --bfile ',opt$output,'/targ_chr',chr,' --extract ',opt$score_files,'/',pos_chr_panel$WGT[i],'.snplist --allow-no-sex --read-freq ',opt$ref_maf,chr,'.frq --score ',opt$score_files,'/',pos_chr_panel$WGT[i],'.SCORE 1 2 4 --out ',opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_',pos_chr_panel$WGT[i],' --memory ', floor((opt$memory*0.4)/opt$n_cores)),ignore.stdout=T, ignore.stderr=T)
if(tmp != 0){
# Delete temporary files
system(paste0('rm ',opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_',pos_chr_panel$WGT[i],'.*'),ignore.stdout=T, ignore.stderr=T)
return(NULL)
}
# Read in the predictions, extract SCORE column and change header.
feature<-fread(paste0(opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_',pos_chr_panel$WGT[i],'.profile'), nThread=1)
feature<-feature[,6]
names(feature)<-paste0(panel,'.',pos_chr_panel$WGT[i])
# Delete temporary files
system(paste0('rm ',opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_',pos_chr_panel$WGT[i],'.*'),ignore.stdout=T, ignore.stderr=T)
# Scale expression to the reference and round
feature<-feature-ref_scale$Mean[ref_scale$ID == names(feature)]
feature<-feature/ref_scale$SD[ref_scale$ID == names(feature)]
feature<-round(feature,3)
feature
} else {
load(pos_chr_panel$FILE[i])
feature_tmp<-NULL
for(mod in colnames(wgt.matrix)){
# Calculate feature predictions
tmp<-system(paste0(opt$plink,' --bfile ',opt$output,'/targ_chr',chr,' --extract ',opt$score_files,'/',pos_chr_panel$WGT[i],'.snplist --allow-no-sex --read-freq ',opt$ref_maf,chr,'.frq --score ',opt$score_files,'/',pos_chr_panel$WGT[i],'.',mod,'.SCORE 1 2 4 --out ',opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_',pos_chr_panel$WGT[i],'.',mod,' --memory ', floor((opt$memory*0.4)/opt$n_cores)),ignore.stdout=T, ignore.stderr=T)
if(tmp != 0){
# Delete temporary files
system(paste0('rm ',opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_',pos_chr_panel$WGT[i],'.',mod,'.*'),ignore.stdout=T, ignore.stderr=T)
return(NULL)
}
# Read in the predictions, extract SCORE column and change header.
feature<-fread(paste0(opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_',pos_chr_panel$WGT[i],'.',mod,'.profile'), nThread=1)
feature<-feature[,6]
names(feature)<-paste0(panel,'.',pos_chr_panel$WGT[i],'.',mod)
# Delete temporary files
system(paste0('rm ',opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_',pos_chr_panel$WGT[i],'.',mod,'.*'),ignore.stdout=T, ignore.stderr=T)
# Scale expression to the reference and round
feature<-feature-ref_scale$Mean[ref_scale$ID == names(feature)]
feature<-feature/ref_scale$SD[ref_scale$ID == names(feature)]
feature<-round(feature,3)
feature_tmp<-cbind(feature_tmp, feature)
}
feature_tmp
}
}
# Remove any columns containing NA
TARG_expr<-TARG_expr[,which(unlist(lapply(TARG_expr, function(x)!is.na(x[1])))),with=F]
# Save TARG_expr and compress
fwrite(TARG_expr, paste0(opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_expression.txt'), nThread = opt$n_cores, sep=' ', na='NA')
system(paste0(opt$pigz,' ',opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_expression.txt'))
# Delete TARG_expr and run garbage collection.
# Garbage collection turns out to be very important here as memory can spike in subsequent loop.
rm(TARG_expr)
rm(pos_chr_panel)
gc()
}
system(paste0("rm ",opt$output,"/targ_chr",chr,"*"),ignore.stdout=T, ignore.stderr=T)
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Done!\n')
sink()
rm(pos_chr)
gc()
}
# Combine per chromosome predicted expression values and insert FID and IID columns
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Combining per chromomsome files...')
sink()
for(panel in unique(pos$PANEL)){
pos_panel<-pos[pos$PANEL == panel,]
for(chr in unique(pos_panel$CHR)){
system(paste0(opt$pigz,' -d ',opt$output,'/TARG_PROFILE_FILES/chr',chr,'/',panel,'_expression.txt.gz'))
}
system(paste0("paste -d ' ' $(echo ",opt$output,'/TARG_PROFILE_FILES/TARG.IDs $(ls ',opt$output,'/TARG_PROFILE_FILES/chr*/',panel,'_expression.txt)) | ',opt$pigz,' > ',opt$output,'/FeaturePredictions_',panel,'.txt.gz'))
system(paste0('rm ',opt$output,'/TARG_PROFILE_FILES/chr*/',panel,'_expression.txt'))
}
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Done!\n')
sink()
system(paste0("rm -r ",opt$output,"/TARG_PROFILE_FILES"))
}
# Delete temporary files
system(paste0("rm ",opt$output,"/ref_indiv*"))
system(paste0("rm ",opt$output,"/ref_keep"))
system(paste0("rm ",opt$output,"/intersect.snplist"))
if(opt$score_files == paste0(opt$output,"/SCORE_FILES") & opt$save_score == F){
system(paste0("rm -r ",opt$output,"/SCORE_FILES"))
}
if(is.na(opt$ref_expr)){
system(paste0("rm -r ",opt$output,"/REF_PROFILE_FILES"))
}
end.time <- Sys.time()
time.taken <- end.time - start.time
sink(file = paste0(opt$output,'/FeaturePredictions.log'), append = T)
cat('Analysis finished at',as.character(end.time),'\n')
cat('Analysis duration was',as.character(round(time.taken,2)),attr(time.taken, 'units'),'\n')
sink()