#!/bin/bash
# Script: low_coverage_consensus.sh
# This script re-runs certain commands of the ARTIC 'artic minion' pipeline to generate low-coverage consensus
# sequences (1X, 2X, 5X). The default pipeline adds N's to positions with less than 20X coverage.
# The script also exports the number of uncalled (N) bases for each consensus sequence and stores per-read-group depths for each sample.
# This script requires filter_pipeline.sh to be run first!
# The script takes two command-line arguments to navigate to the correct directory and identify the run:
# $1: full path to the run folder where reads are demultiplexed by Guppy, i.e. with sub-folders barcode01, barcode02, etc.
# $2: run number or ID
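# Example invocation (the run path and run ID below are illustrative, not real values):
#   ./low_coverage_consensus.sh /path/to/run_folder run01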
cd "$1" # change to the run folder where reads are demultiplexed by Guppy
for dir in */;
do
    echo "$dir"
    echo "${dir%/}"
    cd "${dir}"
    pwd
    echo "Moving to $dir folder"
    if [ -e sample_${dir%/}.consensus.fasta ] # only proceed if the default consensus file exists for this barcode
    then
        # First remake the depth mask with the --depth option: three commands, for three depth thresholds
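        # artic_make_depth_mask writes a mask of reference regions whose coverage falls below the requested depth;
        # with --store-rg-depths it also writes per-read-group depth files for the sample
        # (the standard 'artic minion' run does this step at its default 20X threshold)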
        artic_make_depth_mask --depth 1 --store-rg-depths ~/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta sample_${dir%/}.primertrimmed.rg.sorted.bam sample_${dir%/}.coverage_mask_1X.txt
        artic_make_depth_mask --depth 2 --store-rg-depths ~/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta sample_${dir%/}.primertrimmed.rg.sorted.bam sample_${dir%/}.coverage_mask_2X.txt
        artic_make_depth_mask --depth 5 --store-rg-depths ~/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta sample_${dir%/}.primertrimmed.rg.sorted.bam sample_${dir%/}.coverage_mask_5X.txt
        echo "Finished making depth masks"
        # Skipping the 'artic_plot_amplicon_depth', 'bgzip', and 'tabix' commands of the 'artic minion' pipeline as they have no bearing here
        # Make the masked reference, i.e. a preconsensus.fasta, for each coverage threshold
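        # artic_mask takes the reference, a coverage mask, and the fail VCF, and writes a copy of the reference
        # with low-coverage regions and failed-variant positions replaced by N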
        artic_mask ~/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta sample_${dir%/}.coverage_mask_1X.txt sample_${dir%/}.fail.vcf sample_${dir%/}.preconsensus_1X.fasta
        artic_mask ~/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta sample_${dir%/}.coverage_mask_2X.txt sample_${dir%/}.fail.vcf sample_${dir%/}.preconsensus_2X.fasta
        artic_mask ~/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta sample_${dir%/}.coverage_mask_5X.txt sample_${dir%/}.fail.vcf sample_${dir%/}.preconsensus_5X.fasta
        echo "Finished making preconsensus"
        # Make the consensus FASTA for each coverage threshold
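        # bcftools consensus applies the PASS variants to the masked reference; -m additionally hard-masks
        # the regions listed in the coverage mask with N in the output sequence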
        bcftools consensus -f sample_${dir%/}.preconsensus_1X.fasta sample_${dir%/}.pass.vcf.gz -m sample_${dir%/}.coverage_mask_1X.txt -o sample_${dir%/}.consensus_1X.fasta
        bcftools consensus -f sample_${dir%/}.preconsensus_2X.fasta sample_${dir%/}.pass.vcf.gz -m sample_${dir%/}.coverage_mask_2X.txt -o sample_${dir%/}.consensus_2X.fasta
        bcftools consensus -f sample_${dir%/}.preconsensus_5X.fasta sample_${dir%/}.pass.vcf.gz -m sample_${dir%/}.coverage_mask_5X.txt -o sample_${dir%/}.consensus_5X.fasta
        echo "Finished generating consensus files"
        # Relabel the FASTA headers to record the depth threshold used
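        # artic_fasta_header rewrites the FASTA header line to the supplied sample label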
        artic_fasta_header sample_${dir%/}.consensus_1X.fasta "sample_${dir%/}/ARTIC/nanopolish_1X"
        artic_fasta_header sample_${dir%/}.consensus_2X.fasta "sample_${dir%/}/ARTIC/nanopolish_2X"
        artic_fasta_header sample_${dir%/}.consensus_5X.fasta "sample_${dir%/}/ARTIC/nanopolish_5X"
        echo "Consensus files for minimum 1X, 2X, and 5X coverage done for $dir files"
        # Count the uncalled (N) bases in each of those files and in the original 20X consensus:
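        # The pipeline below linearises each FASTA record onto a single line, drops the header lines,
        # then counts N characters by splitting the sequence on 'N' (number of fields minus one)
        # and appends "<N count>,<depth label>" to n_bases.dat in the run folder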
        awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' sample_${dir%/}.consensus_1X.fasta | tail -n +2 | grep -v "^>" | awk -F 'N' '{print NF-1"," "1X"}' >> "$1"/n_bases.dat
        awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' sample_${dir%/}.consensus_2X.fasta | tail -n +2 | grep -v "^>" | awk -F 'N' '{print NF-1"," "2X"}' >> "$1"/n_bases.dat
        awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' sample_${dir%/}.consensus_5X.fasta | tail -n +2 | grep -v "^>" | awk -F 'N' '{print NF-1"," "5X"}' >> "$1"/n_bases.dat
        awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' sample_${dir%/}.consensus.fasta | tail -n +2 | grep -v "^>" | awk -F 'N' '{print NF-1"," "20X"}' >> "$1"/n_bases.dat
        echo "Output number of N bases for different coverages to file"
        # Output one run_sample label per consensus file, to pair with the rows in n_bases.dat
        for consensus_coverage in *.consensus*.fasta
        do
            echo "${2}_${dir%/}" >> "$1"/run_sample.dat
        done
    fi
    echo "Finished running pipeline on sample with $dir files"
    # Move back up one level to the run folder
    cd ..
done
cd "$1"
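# Pair each run_sample label with its N-base count row to build the per-run CSV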
paste -d "," run_sample.dat n_bases.dat >> "uncalled_bases_${2}.csv"
echo "Script complete for $2 files"