-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathblastfilterer.py
62 lines (53 loc) · 1.96 KB
/
blastfilterer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python
# Credit: Chris Gulvik with modification by Joe Healey
from argparse import ArgumentParser
def parseArgs():
parser = ArgumentParser(description='Filters BLAST output format 6 '
'(tab-delimited) for best hit based on bitscore. '
'Handles additional data following bitscore (column 12).',
add_help=False)
req = parser.add_argument_group('Required')
req.add_argument('-i', '--infile', required=True, metavar='FILE',
help='input file in NCBI\'s BLAST -outfmt 6 format')
opt = parser.add_argument_group('Optional')
opt.add_argument('-h', '--help', action='help',
help='show this help message and exit')
opt.add_argument('-c', '--column', type=int, metavar='{1,2}', choices=[1, 2],
default=1, help='report best hit per query label (1st column; \'1\') '
'or target (2nd column; \'2\') [1]')
opt.add_argument('-o', '--outfile', metavar='FILE',
default=None, help='output file [<infile>.best-filt.tab]')
opt.add_argument('-s', '--bitscore', type=float, metavar='FLOAT',
default=0.0, help='minimum alignment Bit score [0.0]')
return parser.parse_args()
def main():
opts = parseArgs()
if opts.outfile is None:
outfile = opts.infile + '.best-filt.tab'
else:
outfile = opts.outfile
# Identify unique query labels and will not assume sorted file
with open(opts.infile, 'r') as ifh:
query_labels = set()
for ln in ifh:
query_labels.add(ln.split('\t')[opts.column-1])
# Filter best hits
best = {k: ['0']*12 for k in query_labels}
with open(opts.infile, 'r') as ifh:
for ln in ifh:
data = ln.rstrip('\n').split('\t')
if float(data[11]) > opts.bitscore and \
float(data[11]) > float(best[data[opts.column-1]][11]):
best[data[opts.column-1]] = data
# Write output
if opts.outfile is None:
for k,v in sorted(best.items()):
if v != ['0']*12:
print('\t'.join(v))
else:
with open(outfile, 'w') as o:
for k,v in sorted(best.items()):
if v != ['0']*12:
o.write('\t'.join(v) + '\n')
if __name__ == '__main__':
main()