-
Notifications
You must be signed in to change notification settings - Fork 6
/
run_phylip.sh
executable file
·1173 lines (1005 loc) · 42.5 KB
/
run_phylip.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env bash
#-------------------------------------------------------------------------------------------------------
#: PROGRAM: run_phylip.sh
#: AUTHOR: Pablo Vinuesa, Center for Genomic Sciences, UNAM, Mexico
#: https://www.ccg.unam.mx/~vinuesa/ twitter: @pvinmex
#
#: PROJECT START: October 16th, 2013
# This program has been developed mainly for teaching purposes,
# with improvements/new features added as the script was used in
# diverse courses taught to undergrads at https://www.lcg.unam.mx
# (LCG-UNAM) and the International Workshops on Bioinformatics (TIB)
#: AIM: run PHYLIP's distance methods [NJ|UPGMA] for DNA and proteins (dnadist|protdist) with optional bootstrapping
# This script was written to teach intermediate Bash scripting to my students at the
# Bachelor's Program in Genome Sciences at the Center for Genome Sciences, UNAM, Mexico
# https://www.lcg.unam.mx
#
#: INPUT: multiple sequence alignments (PROT|DNA) with at least 4 distinct sequences in phylip format
#
#: OUTPUT: [NJ|UPGMA] phylogenies and, if requested, bootstrap consensus trees
#: SOURCE: Freely available on GitHub @ https://github.com/vinuesa/intro2linux
# Released under the GPLv3 License.
# http://www.gnu.org/copyleft/gpl.html
#### DEPENDENCIES
# Assumes that the following binaries and scripts are all in $PATH, checked by check_dependencies()
#
# 1) Binaries from the PHYLIP package:
# seqboot dnadist protdist neighbor consense
# NOTE: Linux-compatible PHYLIP binaries are supplied in the distro\'s bin/ directory
# https://github.com/vinuesa/intro2linux/tree/master/bin
#: TODO:
# implement also parsimony and ML analyses and parallelize bootstrapping
#: KNOWN BUGS: None.
# Please report any errors you may encounter through the GitHub issue pages
#-------------------------------------------------------------------------------------------------------
# make sure the user has at least bash version 4, since the script uses standard arrays (introduced in version 4),
# but future development may require hashes, introduced in version 4
[ "${BASH_VERSION%%.*}" -lt 4 ] && echo "$HOSTNAME is running an ancient bash: ${BASH_VERSION}; ${0##*/} requires bash version 4 or higher" && exit 1
# 0. Define strict bash settings
set -e # exit on non-zero exit status
set -u # exit if unset variables are encountered
set -o pipefail # exit after unsuccessful UNIX pipe command
# set and export LC_NUMERIC=en_US.UTF-8, to avoid problems with locales tha use 1,32
LC_NUMERIC=en_US.UTF-8
export LC_NUMERIC
args="$*"
progname=${0##*/} # run_phylip.sh
VERSION=2.0
# GLOBALS
#DATEFORMAT_SHORT="%d%b%y" # 16Oct13
#TIMESTAMP_SHORT=$(date +${DATEFORMAT_SHORT})
date_F=$(date +%F |sed 's/-/_/g')- # 2013_10_20
date_T=$(date +%T |sed 's/:/./g') # 23.28.22 (hr.min.secs)
start_time="$date_F$date_T"
#current_year=$(date +%Y)
wkdir=$(pwd)
# initialize variables
def_DNA_model=F84
def_prot_model=JTT
input_phylip=
runmode=
boot=100
CV=1
model=
DEBUG=0
sequential=0
TiTv=2
upgma=0
outgroup=0
gamma="0.0"
outgroup=1
nw_utils_ok=
declare -a outfiles # to collect the output files written to disk
#---------------------------------------------------------------------------------#
#>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTION DEFINITIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#
#---------------------------------------------------------------------------------#
function print_dev_history()
{
cat <<EOF_
$progname $VERSION has been developed mainly for teaching purposes,
with improvements/new features added as the script was used in
diverse courses taught to undergrads at https://www.lcg.unam.mx
and the International Workshops on Bioinformatics (TIB)
TODO: * implement also parsimony and ML analyses
* parallelize bootstrapping
# v2.0 2020-11-19; * fixed bug in write_protdist_params in the if [ $model == JTT ] || [ ...] conditional
* fixed an error in the model name input checks, changing PMG for PMB
* fixed grep in check_nw_utils_ok and made more robust conditional checking to call the function
* use variable expansion only once in naming outfiles outtress, saving it to a variable
to make the code more streamlined and possibly a tiny bit faster
* removed useless [ -s outtre|outfile ] tests for the orignal tress|outfiles in the bootstrapping block,
as these are already checked in the first code block computing tress from the original alignment
# v1.6 2020-11-19; * streamlined write_PHYLIP_param functions by grouping echo calls in { } > params; avoiding concatenation
# * improved description of write_PHYLIP_param functionality
# v1.5 2020-11-15; * added check_phylip_ok to validate input phylip file with -ge 4 sequences and -ge 4 characters
* added remove_phylip_param_files
* added option -I to call print_install_notes
* added [ "${BASH_VERSION%%.*}" -lt 4 ] && die
* makes more extensive use of internal Bash variables and variable expansions, including \${var/pattern/string}
# v1.4 2020-11-14; * added function check_nw_utils_ok, to check that nw_display and nw_support can be executed
as they may be in path but cannot find /lib64/libc.so.6: version GLIBC_2.14
when the binaries are compiled with dynamic linking to the libs
# v1.3 2020-11-14; * improved layout of output messages;
* improved regex in extract_tree_from_outfile (now also for NJ tree)
* if nw_support and nw_display are not available,
prints both the NJ and boot trees to screen if bootstrap analysis was performed
# v1.2 2020-11-11; set and export LC_NUMERIC=en_US.UTF-8, to avoid problems with locales that use 1,32 instead of 1.32
# v1.1 2020-11-11; added function extract_tree_from_oufile and calls it on NJ|UPGMA bootRepl consensus trees
if nw_display is not available in PATH
#v1.0 2020-11-09; This version has thorough error checking and reporting with improved code flow
* re-ingeneered the main code block. It now first runs the standard distance-matrix + clustering computations
before doing bootstrapping. This helps in rapidly detecting problematic model parameters, as reflected in negative distances
* displays NJ|UPGMA tree without bootstrap values, if no bootstrapping is requested
* added functions:
- check_matrix to check if they contain negative values, issuing an error message
- display_treeOI display trees with nw_display, only if they do not contain negative branch lengths, issuing an error message
- print_dev_history to clean up the script\'s header section
* made 100% shellcheck-compliant; prints all error messages to >&2 (STDERR)
* changed/fixed defaults for CV=1 & gamma="0.0";
* use outfiles+=() to capture outfiles for -R 1 and -R 2 when boot -eq 0; now runs with -i -R1|2 as single opts
* Changed default boot=100
* now accepts TiTv=real like 6.7 and checks for real numbers passed to -t
* checks if outtree contains negative branches before attempting to display with nw_display & prints proper message
# v0.5 2020-11-09; added functions:
* rdm_odd_int to compute random odd integers to pass to PHYLIP programs
* check_output (silently checks for expected output files, dying if not found)
#v0.4 2020-11-08; released to the public domain @ https://github.com/vinuesa/intro2linux
* added nw_support and nw_display calls to map and display bottstrap support values onto NJ or UPGMA trees
* fixed several warnings issued by shellcheck
* added strict bash interpreter calling with set -e; set -u; set -o pipefail
* localized variables received in functions
* explicitly set default_DNA_model=F84 and default_prot_model=JTT
* added option -v to print version and exit
#v0.3 September 15, 2017; improved checking of user input
#v0.2 September 16, 2015; basic options and parameter checking for protdist phylogenies
#v0.1 October 16, 2013; basic options and parameter checking for dnadist phylogenies
EOF_
exit 0
}
#---------------------------------------------------------------------------------#
function print_install_notes()
{
cat <<INSTALL
#1. PHYLIP PHYLogeny Inference Package by Joe Felensetein https://evolution.genetics.washington.edu/phylip.html
The bin/ directory of the intro2linux distribution provides precompiled Linux-x86_64 binaries
for seqboot dnadist protdist neighbor consense. Copy them into your \$HOME/bin directory or any other in your PATH
If you need executables for a different architecture, visit https://evolution.genetics.washington.edu/phylip/executables.html
#2. Newick Utilities http://cegg.unige.ch/newick_utils
The Newick Utilities have been described in an Open-Access paper (that is, available online for everyone):
The Newick Utilities: High-throughput Phylogenetic tree Processing in the UNIX Shell
Thomas Junier and Evgeny M. Zdobnov
Bioinformatics 2010 26:1669-1670
http://bioinformatics.oxfordjournals.org/content/26/13/1669.full
doi:10.1093/bioinformatics/btq243
The src/ dir contains the newick-utils-1.6-Linux-x86_64-disabled-extra.tar.gz source file
Visit http://cegg.unige.ch/newick_utils if you need other pre-compiled versions
Brief compilation notes
1. Copy the newick-utils-1.6-Linux-x86_64-disabled-extra.tar.gz to a suitable directory in your \$HOME
2. cd into the directory hoding the source *tar.gz file
3. unpack and compile with the following commands
tar -xvzf newick-utils-1.6-Linux-x86_64-disabled-extra.tar.gz
cd newick-utils-1.6
./configure --prefix=$HOME # if you do NOT have administrator privileges
#./configure # if you have superuser privileges
make
make check
make install # if you do NOT have administrator privileges
# sudo make install # if you have superuser privileges
INSTALL
exit 0
}
#---------------------------------------------------------------------------------#
function check_output()
{
local outfile=$1
if [ ! -s "$outfile" ]
then
echo
echo " >>> ERROR! The expected output file $outfile was not produced, will exit now!" >&2
echo
exit 2
fi
}
#---------------------------------------------------------------------------------#
function check_phylip_ok()
{
# implements a rudimentary format validation check
local phyfile=$1
local num_tax=
local num_char=
num_tax=$(awk 'NR == 1 {print $1}' "$phyfile")
num_char=$(awk 'NR == 1 {print $2}' "$phyfile")
if [[ ! "$num_tax" =~ ^([0-9]+)$ ]] || [[ ! "$num_char" =~ ^([0-9]+)$ ]]
then
echo "ERROR: $phyfile is not a phylip-formatted input file!"
echo
exit 1
elif [[ "$num_tax" =~ ^([0-9]+)$ ]] && [ "$num_tax" -lt 4 ]
then
echo "ERROR: $phyfile should contain at least 4 taxa"
exit 1
elif [[ "$num_char" =~ ^([0-9]+)$ ]] && [ "$num_char" -lt 5 ]
then
echo "ERROR: $phyfile should contain at least 5 aligned residues"
exit 1
else
echo "# File validation OK: $phyfile seems to be a standard phylip file ..."
echo '-------------------------------------------------------------------------------------------'
fi
}
#---------------------------------------------------------------------------------#
function check_dependencies
{
# check that the following PHYLIP binaries are in $PATH; die if not
# optional: the Newick_utilities src is found in the src/ dir
dependencies=(seqboot dnadist protdist neighbor consense)
for programname in "${dependencies[@]}"
do
local bin=
bin=$(type -P "$programname")
if [ -z "$bin" ]; then
echo
echo "# ERROR: $programname not in place!" >&2
echo "# ... you will need to install \"$programname\" first or include it in \$PATH" >&2
echo "# ... exiting" >&2
exit 1
fi
done
echo
echo '# Run check_dependencies() ... looks good: all required external dependencies are in place!'
echo '-------------------------------------------------------------------------------------------'
}
#---------------------------------------------------------------------------------#
function check_nw_utils_ok()
{
# This function checks that nw_display and nw_support can be executed
# as they may be in path but cannot find /lib64/libc.so.6: version GLIBC_2.14
# when the binaries are compiled with dynamic linking to the libs
# The function returns 0 when nw_support nw_display cannot be executed and 1 when they can
# The function does not run and return anything if nw_support nw_display are not in PATH
dependencies=(nw_support nw_display)
local nw_utils_ok=
local bin=
for programname in "${dependencies[@]}"
do
bin=$(type -P "$programname")
if [ -n "$bin" ]; then
# prints something like nw_support: /lib64/libc.so.6: version `GLIBC_2.14' not found
"$programname" 2>> "nw_utils_check.out.tmp"
fi
done
# check the contents of nw_utils_check.out.tmp and set proper flag
if [ -s nw_utils_check.out.tmp ]
then
# set the nw_utils_ok flag to 0 if the lib is not found, to check later in the code when nw_support and nw_display are called
grep -i error nw_utils_check.out.tmp &> /dev/null
nw_utils_ok=$?
[ "$nw_utils_ok" -eq 0 ] && echo "# WARNING: ${dependencies[*]} are in PATH but cannot find /lib64/libc.so.6: version GLIBC_2.14" >&2
rm nw_utils_check.out.tmp
echo "$nw_utils_ok"
fi
}
#---------------------------------------------------------------------------------#
function rdm_odd_int()
{
# generate a random odd integer for seqboot and neighbor
while true
do
i=$RANDOM
if (( "$i" % 2 )) # true when odd
then
echo "$i"
break
fi
done
}
#---------------------------------------------------------------------------------#
function check_matrix()
{
# check that the distance matrix does not contain negative values due to wrong model parameters
local matrix=$1
if grep -El ' \-[0-9\.]+' "$matrix"
then
echo "ERROR: computed negative distances!" >&2
[ "$runmode" -eq 1 ] && echo "You may need to ajust the model and|or gamma value; try lowering TiTv if > 6" >&2
[ "$runmode" -eq 2 ] && echo "You may need to ajust the matrix and|or gamma value" >&2
exit 3
fi
}
#---------------------------------------------------------------------------------#
function display_treeOK()
{
# checks that tree does not contain branches with negative lengths,
# as these cannot be displayed with nw_dsiplay
local tree=$1
# check that there are no negative branch lengths in the nj_tree
if ! grep -El '\-[0-9\.]+' "$tree"
then
echo "# displaying $tree ..."
echo
nw_display -w 80 -Ir "$tree"
echo
else
echo "ERROR: cannot display $nj_tree with nw_dsiplay, as it contains branches with negative lengths!" >&2
[ "$runmode" -eq 1 ] && echo "You may need to adjust (lower?) TiTv=$TiTv" >&2
[ "$runmode" -eq 2 ] && echo "You may need to use another matrix or adjunst gamma" >&2
exit 4
fi
}
#---------------------------------------------------------------------------------#
function extract_tree_from_outfile()
{
# grep out lines containing tree characters | - and print to STDOUT
local outfile=$1
grep --color=never -E '[[:blank:]]+\||-' "$outfile" | grep -Ev 'Neighbor-|^---'
}
#------------------------------------- PHYLIP FUNCTIONS ---------------------------------------#
# >>> these are fucntions to write the command files to pass parameters to PHYLIP programs <<< #
function remove_phylip_param_files()
{
[ -s dnadist.params ] && rm dnadist.params
[ -s protdist.params ] && rm protdist.params
[ -s seqboot.params ] && rm seqboot.params
[ -s neighbor.params ] && rm neighbor.params
[ -s consense.params ] && rm consense.params
return 0
}
#------------------------------------- PHYLIP FUNCTIONS ---------------------------------------#
function write_dnadist_params
{
# writes a parameter file to run dnadist, based on provided arguments
local model=$1
local boot=$2
local TiTv=$3
local sequential=$4
local gamma=$5
local CV=$6
# Runmode 1 = dnadist
if [ "$model" = 'F84' ] || [ -z "$model" ]
then
{
[ "$model" = 'Kimura' ] && echo "D"
[ "$model" = 'Jukes-Cantor' ] && echo -ne "D\nD\n"
[ "$model" = 'LogDet' ] && echo -ne "D\nD\nD\n"
[ "$TiTv" != "2" ] && echo -ne "T\n$TiTv\n"
[ "$gamma" != "0" ] && echo -ne "G\n"
[ "$boot" -gt 0 ] && echo -ne "M\nD\n$boot\n"
[ "$sequential" -eq 1 ] && echo -ne "I\n"
echo -ne "Y\n"
[ "$gamma" != "0" ] && echo -ne "$CV\n"
} > dnadist.params
fi
}
#---------------------------------------------------------------------------------#
function write_protdist_params
{
# write_protdist_params "$model" "$boot" "$sequential" "$gamma" "$CV"
# writes a parameter file to run protdist, based on provided arguments
# Models: JTT, PMB, PAM, Kimura
local model=$1
local boot=$2
local sequential=$3
local gamma=$4
local CV=$5
if [ "$model" = 'JTT' ] || [ -n "$model" ]
then
{ # Runmode 2 = protdist
[ "$model" = 'PMB' ] && echo "P"
[ "$model" = 'PAM' ] && echo -ne "P\nP\n"
[ "$model" = 'Kimura' ] && echo -ne "P\nP\nP\n"
[ "$gamma" != "0" ] && echo -ne "G\n"
[ "$boot" -gt 0 ] && echo -ne "M\nD\n$boot\n"
[ "$sequential" -eq 1 ] && echo -ne "I\n"
echo -ne "Y\n"
[ "$gamma" != "0" ] && echo -ne "$CV\n"
} > protdist.params
fi
}
#---------------------------------------------------------------------------------#
function write_seqboot_params
{
# writes a parameter file to run seqboot, based on provided arguments
local boot=$1
local sequential=$2
# Write Seqboot params
{
[ "$boot" -gt 0 ] && echo -ne "R\n$boot\n"
[ "$sequential" -eq 1 ] && echo -ne "I\n"
echo -ne "Y\n"
echo -ne "$ROI\n"
} > seqboot.params
}
#---------------------------------------------------------------------------------#
function write_neighbor_params
{
# writes a parameter file to run neighbor, based on provided arguments
local boot=$1
local upgma=$2
local outgroup=$3
{
# Write $ROI params
[ "$upgma" -gt 0 ] && echo -ne "N\n"
[ "$outgroup" -gt 1 ] && echo -ne "O\n$outgroup\n"
[ "$boot" -gt 0 ] && echo -ne "M\n$boot\n$ROI\n"
echo -ne "Y\n"
} > neighbor.params
}
#---------------------------------------------------------------------------------#
function write_consense_params
{
#writes a parameter file to run consense; very difficult ;)
[ -s consense.params ] && rm consense.params
{
[ "$outgroup" -gt 1 ] && echo -ne "O\n$outgroup\n"
echo -ne "Y\n"
} > consense.params
}
#---------------------------------------------------------------------------------#
# don't leave litter behind ... remove intermediate input/output files
function cleanup_dir
{
[ -s infile ] && rm infile
[ -s outfile ] && rm outfile
[ -s intree ] && rm intree
[ -s outtree ] && rm outtree
for file in *.params
do
[ -s "$file" ] && rm "$file"
done
}
#-------------------------------- END PHYLIP FUNCTIONS----------------------------#
function print_help
{
#':b:c:m:I:o:R:t:hHIDsuv'
cat<<EOF
$progname v.$VERSION [OPTIONS]
-h prints this help message
REQUIRED
-i <input_phylip_file> (if sequential, use also flag -s)
-R <integer> RUNMODE
1 run dnadist (expects DNA alignment in phylip format)
2 run protdist (expects PROT alignment in phylip format)
OPTIONAL
-b <integer> No. of bootstrap pseudoreplicates [default: $boot]
-g <real number> alpha for gamma distribution [default: $gamma]
-o <integer> outgroup sequence no. [default: $outgroup]
-s sequential format; -s (flag; no val)! [default: interleaved]
-t <digit> Transition/Transversion ratio [default: $TiTv]
-m <model name> NOTE: TYPE AS SHOWN!
DNA: F84, Kimura, Jukes-Cantor, LogDet [default: $def_DNA_model]
PROT: JTT, PMB, PAM, Kimura [default: $def_prot_model]
-u <flag> use UPGMA for clustering [default: NJ]
-v <flag> print program version and exit
-D <flag> Activate debugging to keep cmd files [default: $DEBUG]
-H <flag> print development history and exit
-I <flag> print installation notes and exit
AIM: run PHYLIP\'s distance methods [NJ|UPGMA] for DNA and proteins with bootstrapping
This code was written to teach basic Bash scripting to my students at the
Bachelor\'s Program in Genome Sciences, Center for Genome Sciences, UNAM, Mexico
https://www.lcg.unam.mx/
OUTPUT: [NJ|UPGMA] phylogenies and, if requested, bootstrap consensus trees
and [NJ|UPGMA] phylogenies with bootstrap support values mappend on bipartitions
EXTERNAL DEPENDENCIES:
* PHYLIP (https://evolution.genetics.washington.edu/phylip.html) programs:
seqboot dnadist protdist neighbor consense
* Newick utilities programs (http://cegg.unige.ch/newick_utils) programs:
optional: nw_support and nw_display; need to install separately
- Notes: PHYLIP Linux 64bit binaries available in the bin/ dir
Copy them to your \$HOME/bin directory or any other in your PATH
LICENSING & SOURCE
Author: Pablo Vinuesa | https://www.ccg.unam.mx/~vinuesa/ | twitter: @pvinmex
Released under the GNU General Public License version 3 (GPLv3)
http://www.gnu.org/copyleft/gpl.html
source: https://github.com/vinuesa/intro2linux
EOF
exit 1
}
#-------------------------------------------------------------------------------------------------#
#---------------------------------------- GET OPTIONS --------------------------------------------#
#-------------------------------------------------------------------------------------------------#
# GETOPTS
while getopts ':b:g:i:m:R:o:t:hDHIsuv' OPTIONS
do
case $OPTIONS in
b) boot=$OPTARG
;;
g) gamma=$OPTARG
[ "$gamma" != 0 ] && CV=$(echo "1/sqrt($gamma)" | bc -l) && printf -v CV "%.4f\n" "$CV"
;;
h) print_help
;;
i) input_phylip=$OPTARG
;;
s) sequential=1
;;
m) model=$OPTARG
;;
o) outgroup=$OPTARG
;;
R) runmode=$OPTARG
;;
t) TiTv=$OPTARG
;;
u) upgma=1
;;
v) echo "$progname v$VERSION" && exit
;;
D) DEBUG=1
;;
H) print_dev_history
;;
I) print_install_notes
;;
:) printf "argument missing from -%s option\n" "$OPTARG"
print_help
exit 2
;;
\?) echo "need the following args: "
print_help
exit 3
;;
esac >&2 # print the ERROR MESSAGES to STDERR
done
shift $((OPTIND - 1))
#--------------------------#
# >>> Check User Input <<< #
#--------------------------#
if [ -z "$input_phylip" ]
then
echo
echo "# ERROR: no input phylip file defined!"
print_help
exit 1
fi
if [ -z "$runmode" ]
then
echo
echo "# ERROR: no runmode defined!"
print_help
exit 1
fi
# automatically set TiTv=0 when running with protein matrices or the LogDet DNA model
[ "$runmode" -eq 2 ] && TiTv=0
[ "$model" == LogDet ] && TiTv=0
# check that bootstrap value is an integer
re='^[0-9]+$'
if [[ ! "$boot" =~ $re ]]
then
echo
echo "# ERROR: boot:$boot is not a positive integer >= 0; provide a value between 100 and 1000" >&2
echo
print_help
echo
exit 3
fi
# check that Ti/Tv is a real number, integer or decimal
re_TiTv='^[0-9.]+$'
if [[ ! "$TiTv" =~ $re_TiTv ]] && [ "$runmode" -eq 1 ]
then
echo
echo "# ERROR: TiTv:$TiTv is not an integer >= 0; provide a value between 0-10" >&2
echo
print_help
echo
exit 3
elif [[ "$TiTv" =~ $re_TiTv ]] && [ "$runmode" -eq 1 ] && { [ "$model" == Jukes-Cantor ] || [ "$model" == LogDet ]; }
then
echo
echo "# ERROR: $model is only valid when analyzing DNA alignments under the F84 or K2P models" >&2
echo
print_help
exit 1
fi
if [ "$gamma" != 0 ] # note, need to treat as string, since gamma will be generalle a float like 0.5
then
# this is to avoid having dots within filename (converts gamma==0.2 to gammaf=02)
gammaf=${gamma/\./} #gammaf=${gamma%.*}${gamma##*.}
else
gammaf="$gamma"
fi
# check model vs. runmode compatibility and suitable bootstrap values are provided
# using two alternative sintaxes for educational purposes
if [ "$runmode" -eq 1 ] && { [ "$model" = JTT ] || [ "$model" = PMB ] || [ "$model" = PAM ] || [ "$model" = "$def_prot_model" ]; }
then
echo
echo "# ERROR: $model is only valid when analyzing protein alignments under -R 2" >&2
echo
print_help
exit 1
elif [ "$runmode" -eq 2 ] && [[ "$model" =~ ^(F84|Jukes-Cantor|LogDet|"$def_DNA_model")$ ]]
then
echo
echo "# ERROR: $model is only valid when analyzing DNA alignments under -R 1" >&2
echo
print_help
exit 1
elif [ "$boot" -lt 0 ]
then
echo
echo "# ERROR: bootstrap value=$boot is < 0 and not permitted" >&2
echo
print_help
exit 1
elif [ "$boot" -gt 1000 ]
then
echo
echo "# WARNING: bootstrap value=$boot is > 1000. This may take a long time to run."
echo -n "# Are you sure you want to continue anyway? Type Y|N "
read -r answer
if [ "$answer" = N ] || [ "$answer" = n ]
then
echo
echo "# Ok, will exit then!"
echo
exit 2
else
echo
echo "# Ok, will proceed running with $boot bootstrap pseudoreplicates ..."
echo
fi
elif [ "$boot" -gt 0 ] && [ "$boot" -lt 100 ]
then
echo
echo "# WARNING: bootstrap value=$boot is a bit low. Use a value >= 100."
echo -n "# Are you sure you want to continue anyway? Type Y|N "
read -r answer
if [ "$answer" = N ] || [ "$answer" = n ]
then
echo
echo "# Ok, will exit then!"
echo
exit 2
else
echo
echo "# Ok, will proceed ..."
echo
fi
fi
# set the default DNA or PROT models, if not defined by the user
if [ "$runmode" -eq 1 ] && [ -z "$model" ]
then
model="$def_DNA_model"
#echo "# DNA model set to default: $def_DNA_model..."
fi
if [ "$runmode" -eq 2 ] && [ -z "$model" ]
then
model="$def_prot_model"
#echo "# Protein matrix set to default: $def_prot_model ..."
fi
# check that the user provided a valid DNA substitution model
if [ "$runmode" -eq 1 ] && [[ ! "$model" =~ ^(F84|Kimura|Jukes-Cantor|LogDet|"$def_DNA_model")$ ]]
then
echo
echo "# ERROR: $model is not a recognized substitution model for DNA sequences used by PHYLIP" >&2
echo
print_help
echo
exit 3
fi
# check that the user provided a valid name for empirical substitution matrix
# note the much shorter and cleaner notation of this test using extended regexes, than the previous one
if [ "$runmode" -eq 2 ] && [[ ! "$model" =~ ^(JTT|PMB|PAM|Kimura)$ ]]
then
echo
echo "# ERROR: $model is not a recognized substitution matrix for protein sequences used by PHYLIP" >&2
echo
print_help
echo
exit 3
fi
#>>> Set the script's run random odd number required by diverse PHYLIP programs <<<
ROI=$(rdm_odd_int)
# print the run settings
echo
echo "### $progname v.$VERSION run on $start_time with the following parameters:"
echo "# work_directory=$wkdir"
echo "# input_phylip=$input_phylip"
echo "# model=$model | gamma=$gamma | CV=$CV | gammaf=$gammaf | ti/tv ratio=$TiTv |
outgroup=$outgroup | UPGMA=$upgma | bootstrap no.=$boot | ROI=$ROI"
echo "# runmode=$runmode"
echo "# DEBUG=$DEBUG"
echo "# command: $progname ${args[*]}"
echo
#-------------------------------------------------------------------------------------------------#
#------------------------------------------ MAIN CODE --------------------------------------------#
#-------------------------------------------------------------------------------------------------#
# 1. make sure the external dependencies are found in PATH
# and that nw_display and nw_support find the required GLIBC_2.14 in /lib64/libc.so.6
check_dependencies
nw_utils_ok=$(check_nw_utils_ok) # nw_utils_ok -eq 1 when OK, -eq 0 when not
# 2. Start processing the input file
# make sure there are no old outfile or outtree files lying around from previous runs
[ -s infile ] && rm infile
[ -s outfile ] && rm outfile
[ -s outtree ] && rm outtree
# nor old params files
remove_phylip_param_files
# 2.1) make sure we have an input file or die
if [ -s "$input_phylip" ]
then
# make basic format validation check
check_phylip_ok "$input_phylip"
cp "$input_phylip" infile
else
echo
echo "# FATAL ERROR: input phylip file $input_phylip does not exist or is empty" >&2
echo "# Make sure $input_phylip is in $wkdir. Exiting ..." >&2
echo
exit 1
fi
# ----------------------------------------------------------- #
# >>>>>>>>>>>>>>>> Compute distance matrices <<<<<<<<<<<<<<<< #
# ----------------------------------------------------------- #
# 3. In any case (with or without bootstrap) compute the NJ tree for the original phylip file
# Run dnadist or protdist, as required
echo
echo ">>> Computing distance matrix for $input_phylip ..."
if [ "$runmode" -eq 1 ]
then
echo "# running write_dnadist_params $model 0 $TiTv $sequential $gamma $CV"
write_dnadist_params "$model" "0" "$TiTv" "$sequential" "$gamma" "$CV"
echo "# running dnadist < dnadist.params"
dnadist < dnadist.params &> /dev/null
check_output outfile
# https://fvue.nl/wiki/Bash:_Error_%60Unbound_variable%27_when_appending_to_empty_array
# Set last item specifically
# nstead of appending one element, set the last item specifically, without any "unbound variable" error
# t[${#t[*]}]=foo
dnadist_outfile="${input_phylip%.*}_${model}${gammaf}gamma_distMat.out"
cp outfile "$dnadist_outfile" && \
outfiles+=("$dnadist_outfile")
# check that the distance matrix does not contain negative values due to wrong model parameters
check_matrix "$dnadist_outfile"
mv outfile infile
elif [ "$runmode" -eq 2 ]
then
echo "# running write_protdist_params $model 0 $sequential $gamma $CV"
write_protdist_params "$model" "0" "$sequential" "$gamma" "$CV"
echo "# running protdist < protdist.params"
protdist < protdist.params &> /dev/null
check_output outfile
protdist_outfile="${input_phylip%.*}_${model}${gammaf}gamma_distMat.out"
cp outfile "$protdist_outfile" && \
outfiles+=("$protdist_outfile")
# check that the distance matrix does not contain negative values due to wrong model parameters
check_matrix "$protdist_outfile"
mv outfile infile
fi
# ------------------------------------------------------------------------------------ #
# >>>>>>>>>>>>>>>> Computing NJ|UPGMA trees from original alignments <<<<<<<<<<<<<<<<< #
# ------------------------------------------------------------------------------------ #
echo
echo ">>> Computing distance tree for $input_phylip ..."
# 4. now that we have the dist matrix, do the clustering with NJ or UPGMA
echo "# running write_neighbor_params 0 $upgma"
write_neighbor_params "0" "$upgma" "$outgroup"
echo "# running neighbor < neighbor.params"
neighbor < neighbor.params &> /dev/null
check_output outfile
check_output outtree
# 5.1 rename outtrees and tree outfiles; remap bootstrap values to bipartitions and display tree to screen
if [ "$upgma" -gt 0 ]
then
# https://fvue.nl/wiki/Bash:_Error_%60Unbound_variable%27_when_appending_to_empty_array
# Set last item specifically
# instead of appending one element, set the last item specifically, without any "unbound variable" error
# t[${#t[*]}]=foo
upgma_tree=
upgma_outfile=
if [ -s outtree ]
then
upgma_tree="${input_phylip%.*}_${model}${gammaf}gamma_UPGMA.ph"
mv outtree "$upgma_tree"
echo "# wrote tree $upgma_tree to disk"
outfiles[${#outfiles[*]}]="$upgma_tree"
fi
if [ -s outfile ]
then
upgma_outfile="${input_phylip%.*}_${model}${gammaf}gamma_UPGMA.outfile"
mv outfile "$upgma_outfile"
outfiles[${#outfiles[*]}]="$upgma_outfile"
fi
# check that there are no negative branch lengths in the nj_tree
# and display with nw_display, only if no bootstrapping is requested
if [ "$boot" -eq 0 ] && [[ $(type -P nw_display) ]] && [[ "$nw_utils_ok" -eq 1 ]]
then
display_treeOK "$upgma_tree"
elif [ "$boot" -eq 0 ] && { [[ ! $(type -P nw_display) ]] || [[ "$nw_utils_ok" -ne 1 ]]; }
then
echo "# extract_tree_from_outfile $upgma_outfile"
echo
extract_tree_from_outfile "$upgma_outfile"
echo
fi
else
nj_tree=
nj_outfile=
if [ -s outtree ]
then
nj_tree="${input_phylip%.*}_${model}${gammaf}gamma_NJ.ph"
mv outtree "$nj_tree"
echo "# wrote tree $nj_tree to disk"
outfiles[${#outfiles[*]}]="$nj_tree"
fi
if [ -s outfile ]
then
nj_outfile="${input_phylip%.*}_${model}${gammaf}gamma_NJ.outfile"
mv outfile "$nj_outfile"
outfiles[${#outfiles[*]}]="$nj_outfile"
fi
# check that there are no negative branch lengths in the nj_tree
# and display with nw_display, only if no bootstrapping is requested
if [ "$boot" -eq 0 ] && [[ $(type -P nw_display) ]] && [[ "$nw_utils_ok" -eq 1 ]]
then
display_treeOK "$nj_tree"
elif [ "$boot" -eq 0 ] && { [[ ! $(type -P nw_display) ]] || [[ "$nw_utils_ok" -ne 1 ]]; }
then
echo "# extract_tree_from_outfile $nj_outfile"
echo
extract_tree_from_outfile "$nj_outfile"
echo
fi
fi
echo "# > finished computing distance matrix and tree for $input_phylip!"
if [ "$boot" -eq 0 ]
then
# 6. Print final output summary message
echo
echo '===================== OUTPUT SUMMARY ====================='
no_outfiles=${#outfiles[@]}
echo "# $no_outfiles output files were generated:"
printf "%s\n" "${outfiles[@]}"
echo
echo -n "# FINISHED run at: "; date
echo " ==> Exiting now ..."
echo
else
echo "=================================================================="
echo
fi
# ----------------------------------------------------------- #
# >>>>>>>>>>>>>>>>>>>> Bootstrap Analysis <<<<<<<<<<<<<<<<<<< #
# ----------------------------------------------------------- #
# Run seqboot if requested
# run seqboot if -b > 0
if [ "$boot" -gt 0 ]
then
# 1. restore the original infile for the bootstrapping and standard NJ/UPGMA analysis below
cp "$input_phylip" infile
check_output infile
echo
echo ">>> Bootstrap Analysis based on $boot pseudoreplicates for $input_phylip ..."
echo "# running write_seqboot_params $boot $sequential"
write_seqboot_params "$boot" "$sequential"
echo "# running seqboot < seqboot.params &> /dev/null"
seqboot < seqboot.params &> /dev/null
check_output outfile
mv outfile infile