-
Notifications
You must be signed in to change notification settings - Fork 1
/
1_WGS_Mutetc_single_hard.sh
212 lines (187 loc) · 7.68 KB
/
1_WGS_Mutetc_single_hard.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/bin/bash
###set parameter
fq1=$1
fq2=$2
speice_name=$3
###set env
out_dir=out
##define software path/jar file
trimmomatic=~/jiaqiLi/fqQC/software/Trimmomatic-0.38/trimmomatic-0.38.jar
bwa=/home/ggj/jiaqiLi/whole_genome_seq/software/bwa-0.7.15/bwa
samtools=/home/ggj/jiaqiLi/whole_genome_seq/software/samtools-1.3.1/samtools
gatk=/home/ggj/jiaqiLi/whole_genome_seq/software/gatk-4.0.5.1/gatk
#VEP=/your_path_to/ensembl-vep/vep
#ref_file
reference=/home/ggj/jiaqiLi/whole_genome_seq/zebrafish/reference/zebrafish_genomic.fna.gz
chromosome=( CM002885.2, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, \
chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, \
chr20, chr21, chr22, chr23, chr24, chr25 )
#export GATK_bundle=GATK_bundle_PATH
#sequence information
RGID=L005
#thread_num
thread_num=16
###prepare ref index
time $bwa index $reference && echo "bwa index done"
time $samtools faidx $reference && echo "samtools index done"
time $gatk CreateSequenceDictionary \
-R $reference -O ${refrence}.dict \
&& echo "gatk dict done"
###work flow
##mkdir
if [ ! -d "$out_dir" ]; then
mkdir $out_dir
fi
##fast quality control
echo "quality control begin"
if [ ! -d "$out_dir/cleanfq" ]; then
mkdir $out_dir/cleanfq
fi
#fastqc of original fq
time fastqc $fq1 $fq2 -o $out_dir
#trimmomatic set as illumina
#put the adapter file in the work dir&&change the shell
time java -jar ${trimmomatic} PE -phred33 -trimlog log_trim \
$fq1 $fq2 \
$out_dir/cleanfq/$speice_name.pair.1.fq $out_dir/cleanfq/$speice_name.trimUnpair.1.fq \
$out_dir/cleanfq/$speice_name.pair.2.fq $out_dir/cleanfq/$speice_name.trimUnpair.2.fq \
ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 \
SLIDINGWINDOW:5:20 LEADING:5 TRAILING:5 MINLEN:50 \
&& echo "trimmomatic done"
#fastqc of trimmed fq
time fastqc $out_dir/cleanfq/$speice_name.pair.1.fq $out_dir/cleanfq/$speice_name.pair.2.fq -o $out_dir/cleanfq \
&& echo "quality control done"
##bwa align with reference and sort to bam file
if [ ! -d "$out_dir/bwa" ]; then
mkdir $out_dir/bwa
fi
#align with ref_path genome
time $bwa mem -t $thread_num -M -R "@RG\tID:$RGID\tPL:ILLUMINA\tSM:$speice_name" \
$reference $out_dir/cleanfq/$speice_name.pair.1.fq $out_dir/cleanfq/$speice_name.pair.2.fq \
| $samtools view -b -o $out_dir/bwa/$speice_name.bam \
&& echo "bwa mem done"
time $samtools sort -@ $thread_num -m 4G -O bam -o $out_dir/bwa/$speice_name.sorted.bam $out_dir/bwa/$speice_name.bam \
&& echo "bwa sort done"
##gatk
#mark duplicate
time $gatk MarkDuplicates \
-I $out_dir/bwa/$speice_name.sorted.bam \
-M $out_dir/bwa/$speice_name.sorted.markdup_metrics.txt \
-O $out_dir/bwa/$speice_name.sorted.markdup.bam \
&& echo "** markdup done **"
##apply realiagnment
##gatk BQSR
time $gatk BaseRecalibrator \
-R $ref_path/$ref_file \
-I $out_dir/bwa/$speice_name.sorted.markdup.bam \
--known-sites $GATK_bundle/1000G_phase1.indels.hg38.vcf \
--known-sites $GATK_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf \
--known-sites $GATK_bundle/dbsnp_146.indels.hg38.vcf \
-O $out_dir/bwa/$speice_name.sorted.markdup.recal.table \
&& echo "$speice_name.sorted.markdup.recal.table done"
time $gatk ApplyBQSR \
--bqsr-recal-file $out_dir/bwa/$speice_name.sorted.markdup.recal.table \
-R $ref_path/$ref_file \
-I $out_dir/bwa/$speice_name.sorted.markdup.bam \
-O $out_dir/bwa/$speice_name.sorted.markdup.BQSR.bam \
&& echo "BQSR.bam done"
time $samtools sort -@ $thread_num -m 4G -O bam -o $out_dir/bwa/$speice_name.sorted.markdup.BQSR.bam \
&& echo "bwa sort done"
time $samtools index $out_dir/bwa/$speice_name.sorted.markdup.BQSR.bam \
&& echo "sorted bam index done"
rm $out_dir/bwa/$speice_name.bam
rm $out_dir/bwa/$speice_name.sorted.bam
##gatk single hard sort
if [! -d "$out_dir/gatk" ]; then
mkdir $out_dir/gatk_single_hard
fi
#use LINUX FIFO to Concurrent
#chromosome=( chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, \
chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, \
chr20, chr21, chr22, chr23, chr24, chr25 )
TMPFIFO=/tmp/$$.fifo #声明管道名称,'$$'表示脚本当前运行的进程PID
mkfifo $TMPFIFO #创建管道
exec 5<>${TMPFIFO} #创建文件标示符“5”,调用进程内部执行一个可执行文件
rm -rf ${TMPFIFO} #清除创建的管道文件
#为并发线程创建同样个数的占位
for((i=1;i<=$thread_num;i++))
do
echo ;
#借用read命令一次读取一行的特性,使用一个echo默认输出一个换行符,来确保每一行只有一个线程占位;
done >&5
#将占位信息写入管道
for i in ${chromosome[@]}; do
read -u5
#从文件描述符管道中,获取一个管道的线程占位然后开始执行操作;read中 -u 后面跟fd,表示从文件描述符中读入,该文件描述符可以是exec新开启的
{
time $gatk HaplotypeCaller \
-R $reference \
--emit-ref-confidence GVCF \
-I $out_dir/bwa/${specie}.sorted.markdup.bam \
-L $i \
-O $out_dir/gatk_single_hard/${specie}.${i}.gvcf \
&& echo "gvcf done" && \
#call variety
time $gatk GenotypeGVCFs \
-R $reference \
-V $out_dir/gatk_single_hard/${specie}.${i}.gvcf \
-O $out_dir/gatk_single_hard/${specie}.${i}.vcf \
&& echo "vcf done"
sleep 1
echo "" >&5
#任务执行完后在fd5中写入一个占位符,以保证这个线程执行完后,线程继续保持占位
} &
done
wait #等待父进程的子进程都执行结束后再结束父进程
exec 5>&- #关闭fd5的管道
#merge chromosome file
merge_vcfs=""
for i in ${chromosome[@]}; do
merge_vcfs=${merge_vcfs}" -I $out_dir/gatk_single_hard/${specie}.${i}.vcf \\"\n
done
time $gatk merge_vcfs ${merge_vcfs} -O $out_dir/gatk_single_hard/${specie}.vcf \
&& echo "merge vcfs done"
time bgzip -f $out_dir/gatk_single_hard/${specie}.vcf
time tabix -p vcf $out_dir/gatk_single_hard/${specie}.vcf.gz
##SNP INDEL select
# 使用SelectVariants,选出SNP
time $gatk SelectVariants \
-select-type SNP \
-V $out_dir/gatk_single_hard/${specie}.vcf.gz \
-O $out_dir/gatk_single_hard/${specie}.snp.vcf.gz
# 为SNP作硬过滤
time $gatk VariantFiltration \
-V $out_dir/gatk_single_hard/${specie}.snp.vcf.gz \
--filter-expression "QD < 2.0 || MQ < 40.0 || FS > 60.0 || SOR > 3.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" \
--filter-name "Filter" \
-O $out_dir/gatk_single_hard/${specie}.snp.fiter.vcf.gz
# 使用SelectVariants,选出Indel
time $gatk SelectVariants \
-select-type INDEL \
-V $out_dir/gatk_single_hard/${specie}.vcf.gz \
-O $out_dir/gatk_single_hard/${specie}.indel.vcf.gz
# 为Indel作过滤
time $gatk VariantFiltration \
-V $out_dir/gatk_single_hard/${specie}.vcf.gz \
--filter-expression "QD < 2.0 || FS > 200.0 || SOR > 10.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" \
--filter-name "Filter" \
-O $out_dir/gatk_single_hard/${specie}.fiter.indel.vcf.gz
# 重新合并过滤后的SNP和Indel
time $gatk MergeVcfs \
-I $out_dir/gatk_single_hard/${specie}.snp.fiter.vcf.gz \
-I $out_dir/gatk_single_hard/${specie}.fiter.indel.vcf.gz \
-O $out_dir/gatk_single_hard/${specie}.filter.vcf.gz
# 删除无用中间文件
#rm -f
##variety annotation
## VEP annotation
time $VEP --fasta $reference \
--vcf --merged --fork 10 --hgvs --force_overwrite --everything \
--offline --dir_cache /your_path_to/ensembl-vep/cachedir \
-i $out_dir/population/$out_name.snp.VQSR.vcf.gz \
-o $out_dir/population/$out_name.snp.VQSR.VEP.vcf.gz
time $VEP --fasta $reference \
--vcf --merged --fork 10 --hgvs --force_overwrite --everything \
--offline --dir_cache /your_path_to/ensembl-vep/cachedir \
-i $out_dir/population/$out_name.indel.VQSR.vcf.gz \
-o $out_dir/population/$out_name.indel.VQSR.VEP.vcf.gz