-
Notifications
You must be signed in to change notification settings - Fork 0
/
ublast4.py
80 lines (61 loc) · 3.14 KB
/
ublast4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from Bio import SeqIO
import sys
import os
from StringIO import StringIO # Python 2
import subprocess
from random import randrange
from optparse import OptionParser
def chunkstring(string, length):
return (string[0+i:length+i] for i in range(0, len(string), length))
parser = OptionParser()
parser.add_option("-i", "--infile", dest="filename", help="input sequence file", metavar="FILE")
parser.add_option("-o", "--outfile", dest="outfile", default="ublast.out", help="output file name")
parser.add_option("-T", "--tax_lev", dest="tax_lev", default="a", help="report taxonomy level - default is all ranks")
parser.add_option("-f", "--format", dest="infmt", default="fasta", help="input file format, e.g. fasta, fastq, genbank")
parser.add_option("-m", "--maxhits", dest="maxhits", default="100", help="max number of hits to return for each query")
parser.add_option("-d", "--identity", dest="id", default="0.9", help="percent identity threshold")
parser.add_option("-b", "--database", dest="db", default="~/d/db/genpept/genpept.udb", help="database, default = ~/d/db/genpept/genpept.udb")
parser.add_option("-e", "--evalue", dest="evalue", default="1.0e-10", help="e value threshold")
parser.add_option("-c", "--cons", dest="cons", default="0.15", help="specify LCA scoring radius 0 to 1, default = 0.15")
parser.add_option("-s", "--stype", dest="stype", default="p", help="specify the sequence type. Default = protein, 'p'")
parser.add_option("-O", "--offset", dest="offset1", default="1", help="specify the split offset, default = 1, '1'")
(options, args) = parser.parse_args()
r2=str(randrange(100000))
db=options.db
inputfile = options.filename #sys.argv[1]
outputfile=options.outfile #sys.argv[2]
fmt = options.infmt
maxhits= options.maxhits
id= options.id
evalue=options.evalue
cons = float(options.cons)
stype=options.stype
tax_lev=options.tax_lev
offset1=options.offset1
#017
'''
subprocess.Popen("rm -rf ~/scripts/tmp2", shell=True).wait()
subprocess.Popen("mkdir ~/scripts/tmp2", shell=True).wait()
'''
inputname='/OSM/HOME-MEL/all29c/scripts/tmp2/inp%s.fa' %(r2)
f1=open(inputfile,'r')
f2=open(inputname,'w')
#split large reads/contigs if >50kb
print "splitting large seq records if >50kb..."
for i in SeqIO.parse(f1,fmt):
if len(i.seq)>50000:
seqlist=list(chunkstring(i.seq,50000))
c=-1
for t in seqlist:
c=c+1
print i.id, c+1
f2.write(">"+i.id+"-part-"+str(c+1)+"\n"+str(t)+"\n")
else:
f2.write(">"+i.id+"\n"+str(i.seq)+"\n")
f2.close()
p1= subprocess.Popen("usearch -ublast %s -db %s -strand both -maxhits %s -userout %s -userfields query+target+id+ql+qlo+qhi+tl+tlo+thi+alnlen+evalue+bits -id %s -evalue %s" %(inputname,db,maxhits,outputfile,id,evalue), shell=True).wait() #-ka_ungapped_lambda 0.318 -ka_gapped_lambda 0.291 -ka_gapped_k 0.075 -ka_ungapped_k 0.134
outputfile2=outputfile+".taxa"
print"adding lineage.."
subprocess.Popen("python ~/scripts/linfetch.py %s %s %s %s" %(outputfile,outputfile2, stype, offset1), shell=True).wait()
subprocess.Popen("python ~/scripts/lca2.py %s %s %s %s" %(outputfile2,outputfile+".lca",cons,tax_lev), shell=True).wait()
subprocess.Popen("rm /OSM/HOME-MEL/all29c/scripts/tmp2/inp%s.fa" %(r2), shell=True).wait()