-
Notifications
You must be signed in to change notification settings - Fork 1
/
meta_otu_table3.py
149 lines (101 loc) · 3.35 KB
/
meta_otu_table3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
import sys
import os
import re
import glob
#written for 060, outputs all taxon levels, use "-t rel_ab_w_read_stats" in metaphlan2.py
digits = re.compile(r'(\d+)')
def tokenize(filename):
return tuple(int(token) if match else token
for token, match in
((fragment, digits.search(fragment))
for fragment in digits.split(filename)))
#makes an otu table from input species lists
folder = sys.argv[1] #working folder
outfile=sys.argv[2] #output file prefix
delim=sys.argv[3] #taxonomy output delimiter
fthresh=int(sys.argv[4]) #threshold of number of samples that have each otu
otuthresh=int(sys.argv[5])#total count threshold for each otu
filelist=glob.glob(folder)
filelist.sort(key=tokenize)
if sys.argv[6] == 'count':
dtype=-1
else:
dtype=1
print filelist
data={}
allspecies={}
tax_levels=["d","k","p","c","o","f","g","s","t"]
filenames=[]
for i in filelist:
file1 = open(i,'r')
filename = ".".join(str(p) for p in i.split("/")[-1].split(".")[:-1])
filenames.append(filename)
data[filename]={}
#k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_buccalis|t__GCF_000177075 0.00127 2.39794735706e-05 3033961 73
for line in file1:
for level in tax_levels:
if level not in allspecies.keys():
allspecies[level]=[]
if level not in data[filename].keys():
data[filename][level]={}
if line[0]<>"#":
sp1 = line.split("\t")[0]
sp=sp1.split(delim)[-1]
lev = sp.split("__")[0]
freq= float(line.split("\t")[dtype].rstrip("\n").rstrip("\r"))
if dtype==-1:
freq=int(freq)
if sp1=="unclassified":
if sp1 not in allspecies[level]:
allspecies[level].append(sp1)
if sp1 not in data[filename][level]:
data[filename][level][sp1]=""
data[filename][level][sp1]=freq
if lev==level:
if sp1 not in allspecies[level]:
allspecies[level].append(sp1)
if sp1 not in data[filename][level]:
data[filename][level][sp1]=""
data[filename][level][sp1]=freq
file1.close()
filenames.sort(key=tokenize)
for v in allspecies.keys():
allspecies[v].sort()
#print allspecies
#remove if below threshold
allspecies2={}
for level in tax_levels:
allspecies2[level]=[]
for i in allspecies[level]:
c=0
otutotal=0
for j in filenames:
if i in data[j][level].keys():
c=c+1
otutotal=otutotal+data[j][level][i]
if c >= fthresh and otutotal>=otuthresh:
allspecies2[level].append(i)
g=open(outfile+"_all_"+".tab",'w')
for level in tax_levels:
print level,"###########################################################"
g2=open(outfile+"_"+level+".tab",'w')
g.write(level+"\t\t"+"\t".join(str(x) for x in filenames)+"\n")
g2.write("#OTU_ID"+"\t"+"\t".join(str(x) for x in filenames)+"\n")
for i in allspecies2[level]:
print (i.split("__")[-1])
#g.write(i+"\t") #change to write the whole taxonomy
g.write(i+"\t"+i.split("__")[-1])
#g2.write(i) #change to write the whole taxonomy
g2.write(i.split("__")[-1])
for j in filenames:
if i in data[j][level].keys():
g.write("\t"+str(data[j][level][i]))
g2.write("\t"+str(data[j][level][i]))
else:
g.write("\t"+"0")
g2.write("\t"+"0")
g.write("\n")
g2.write("\n")
g.write("\n")
g2.write("\n")