forked from idoerg/cafa-format-check
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cafa_go_format_checker.py
executable file
·266 lines (247 loc) · 10 KB
/
cafa_go_format_checker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/usr/bin/env python
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re
import sys
# Pre-compiled patterns for the individual fields of a CAFA record.
# Raw strings keep "\." a regex escape instead of an invalid string escape,
# and the leading-digit class is "[01]": writing it as "[0,1]" would also
# admit a literal comma (e.g. ",.50").
pr_field = re.compile(r"^PR=[01]\.[0-9][0-9];$")
rc_field = re.compile(r"^RC=[01]\.[0-9][0-9]$")
go_field = re.compile(r"^GO:[0-9]{5,7}$")
# Target IDs: an M, T or EFI prefix followed by 5 to 20 digits.
# NOTE(review): an earlier comment mentioned adding "HP" as well, but no HP
# prefix is accepted here -- confirm whether it is still required.
target_field = re.compile(r"^(M|T|EFI)[0-9]{5,20}$")
# Confidence score written with exactly two decimals; the upper bound of 1.0
# is not enforced by this pattern (e.g. "1.50" matches).
confidence_field = re.compile(r"^[01]\.[0-9][0-9]$")
# Accepted sequences of record sections in a CAFA prediction file, in order.
# KEYWORDS and ACCURACY are optional, hence the three variants below.
legal_states1 = [
    "author",
    "model",
    "keywords",
    "accuracy",
    "go_prediction",
    "end",
]
legal_states2 = [
    "author",
    "model",
    "keywords",
    "go_prediction",
    "end",
]
legal_states3 = [
    "author",
    "model",
    "go_prediction",
    "end",
]

# Vocabulary accepted on a KEYWORDS record; any other keyword is rejected.
legal_keywords = [
    "sequence alignment",
    "sequence-profile alignment",
    "profile-profile alignment",
    "phylogeny",
    "sequence properties",
    "physicochemical properties",
    "predicted properties",
    "protein interactions",
    "gene expression",
    "mass spectrometry",
    "genetic interactions",
    "protein structure",
    "literature",
    "genomic context",
    "synteny",
    "structure alignment",
    "comparative model",
    "predicted protein structure",
    "de novo prediction",
    "machine learning",
    "genome environment",
    "operon",
    "ortholog",
    "paralog",
    "homolog",
    "hidden Markov model",
    "clinical data",
    "genetic data",
    "natural language processing",
    "other functional information",
]
"""
A collection of functions that check the format of the different record types in a CAFA prediction file.
Each function accepts the current record (inrec) and returns a boolean stating whether the record is
correct, together with an applicable error message (None when the record is correct).
The "correct" and "errmsg" values should then be passed to the "handle_error" function.
"""
def author_check(inrec):
    """Validate an AUTHOR record.

    The record is split on whitespace and must consist of exactly two
    tokens, the first of which is the literal word AUTHOR.

    Args:
        inrec: the raw record line.

    Returns:
        A ``(correct, errmsg)`` tuple; ``errmsg`` is None when the record
        is valid.
    """
    tokens = inrec.split()
    if len(tokens) != 2:
        return False, "AUTHOR: invalid number of fields. Should be 2"
    if tokens[0] != "AUTHOR":
        return False, "AUTHOR: First field should be AUTHOR"
    return True, None
def model_check(inrec):
    """Validate a MODEL record.

    The record is split on whitespace and must consist of the literal
    word MODEL followed by a single digit.

    Args:
        inrec: the raw record line.

    Returns:
        A ``(correct, errmsg)`` tuple; ``errmsg`` is None when the record
        is valid.
    """
    tokens = inrec.split()
    if len(tokens) != 2:
        return False, "MODEL: invalid number of fields. Should be 2"
    label, digit = tokens
    if label != "MODEL":
        return False, "MODEL: First field should be MODEL"
    if not (len(digit) == 1 and digit.isdigit()):
        return False, "MODEL: second field should be single digit."
    return True, None
def keywords_check(inrec):
    """Validate a KEYWORDS record.

    The record must start with the literal KEYWORDS; the remainder is a
    comma-separated list in which every entry has to appear in
    ``legal_keywords``. One trailing full stop per entry is ignored.

    Args:
        inrec: the raw record line.

    Returns:
        A ``(correct, errmsg)`` tuple; ``errmsg`` is None when the record
        is valid, otherwise it names the first offending keyword.
    """
    if inrec[:8] != "KEYWORDS":
        return False, "KEYWORDS: first field should be KEYWORDS"
    for keyword in inrec[8:].split(","):
        keyword = keyword.strip()
        # endswith() instead of keyword[-1]: an empty entry (bare KEYWORDS
        # line, trailing comma, "a,,b") must be reported as illegal rather
        # than crash with IndexError.
        if keyword.endswith("."):
            keyword = keyword[:-1]
        if keyword not in legal_keywords:
            return False, "KEYWORDS: illegal keyword %s" % keyword
    return True, None
def accuracy_check(inrec):
    """Validate an ACCURACY record.

    The record is split on whitespace and must consist of four tokens:
    the literal ACCURACY, a single digit, a token matching ``pr_field``
    and a token matching ``rc_field``.

    Args:
        inrec: the raw record line.

    Returns:
        A ``(correct, errmsg)`` tuple; ``errmsg`` is None when the record
        is valid, otherwise it describes the first failed check.
    """
    tokens = inrec.split()
    if len(tokens) != 4:
        return False, "ACCURACY: error in number of fields. Should be 4"
    label, digit, pr_token, rc_token = tokens
    if label != "ACCURACY":
        return False, "ACCURACY: first field should be 'ACCURACY'"
    if not (digit.isdigit() and len(digit) == 1):
        return False, "ACCURACY: second field should be a single digit"
    if pr_field.match(pr_token) is None:
        return False, "ACCURACY: error in PR field"
    if rc_field.match(rc_token) is None:
        return False, "ACCURACY: error in RC field"
    return True, None
def go_prediction_check(inrec):
    """Validate a GO prediction record.

    The record is split on whitespace and must consist of three tokens:
    a target ID matching ``target_field``, a GO ID matching ``go_field``
    and a confidence matching ``confidence_field`` that is not above 1.0.

    Args:
        inrec: the raw record line.

    Returns:
        A ``(correct, errmsg)`` tuple; ``errmsg`` is None when the record
        is valid, otherwise it describes the first failed check.
    """
    fields = inrec.split()
    if len(fields) != 3:
        return False, "GO prediction: wrong number of fields. Should be 3"
    target_id, go_id, confidence = fields
    if not target_field.match(target_id):
        return False, "GO prediction: error in first (Target ID) field"
    if not go_field.match(go_id):
        return False, "GO prediction: error in second (GO ID) field"
    if not confidence_field.match(confidence):
        return False, "GO prediction: error in third (confidence) field"
    # The pattern check alone does not guarantee a parseable number (a
    # permissive pattern can let e.g. ",.50" through), so a failed
    # conversion is reported as a malformed field instead of raising.
    try:
        score = float(confidence)
    except ValueError:
        return False, "GO prediction: error in third (confidence) field"
    if score > 1.0:
        return False, "GO prediction: error in third (confidence) field. Cannot be > 1.0"
    return True, None
def end_check(inrec):
    """Validate an END record.

    The record is split on whitespace and must consist of the single
    token END.

    Args:
        inrec: the raw record line.

    Returns:
        A ``(correct, errmsg)`` tuple; ``errmsg`` is None when the record
        is valid.
    """
    tokens = inrec.split()
    if len(tokens) != 1:
        return False, "END: wrong number of fields. Should be 1"
    if tokens[0] != "END":
        return False, "END: record should include the word END only"
    return True, None
"""
Builds the error message so that it incorporates the file name and the number of the line on which the error was raised.
Returns whether the line is correct, together with the error message (or a placeholder message when nothing is wrong).
"""
def handle_error(correct, errmsg, inrec, line_num, fileName):
    """Turn a checker result into the final ``(status, message)`` pair.

    Args:
        correct: truthy when the record passed its check.
        errmsg: the checker's error text; only used when ``correct`` is
            falsy.
        inrec: the record that was checked (not used in the message).
        line_num: line number of the record, reported in the message.
        fileName: name of the file being checked, reported in the message.

    Returns:
        ``(True, "Nothing wrong here")`` for a correct record, otherwise
        ``(False, message)`` where the message is ``errmsg`` prefixed with
        the file name and line number.
    """
    if correct:
        return True, "Nothing wrong here"
    location = "Error in %s, line %s, " % (fileName, line_num)
    return False, location + errmsg
def cafa_checker(infile, fileName):
    """
    Check a whole CAFA GO prediction file, record by record.

    For every line the function: 1. identifies the record type from the
    first whitespace-separated field (anything that is not AUTHOR, MODEL,
    KEYWORDS, ACCURACY or END is treated as a GO prediction); 2. calls the
    matching checker function; 3. passes the result to "handle_error",
    which builds the error report. Checking stops at the first bad record.
    After the last line, the order in which the record types first
    appeared is compared against the legal section orders.

    Args:
        infile: iterable yielding the lines of the file as strings.
        fileName: name of the file, used only in the returned messages.

    Returns:
        A ``(correct, message)`` tuple. ``correct`` is False with an error
        message on the first problem found, True with a success message
        otherwise. (Per the original notes, the result is consumed by the
        file_name_check function in cafa3_format_checker.)
    """
    record_states = {
        "AUTHOR": "author",
        "MODEL": "model",
        "KEYWORDS": "keywords",
        "ACCURACY": "accuracy",
        "END": "end",
    }
    visited_states = []
    n_accuracy = 0
    n_models = 0
    first_prediction = True
    first_accuracy = True
    first_keywords = True

    for line_num, inline in enumerate(infile, 1):
        inrec = inline.split()
        if not inrec:
            # A blank line has no first field to classify; report it as a
            # format error instead of failing with IndexError on inrec[0].
            return handle_error(
                False, "empty line: every line should hold a record",
                inline, line_num, fileName)
        # Unknown first fields default to the prediction state.
        state = record_states.get(inrec[0], "go_prediction")

        if state == "author":
            correct, errmsg = author_check(inline)
            correct, errmsg = handle_error(correct, errmsg, inline, line_num, fileName)
            if not correct:
                return correct, errmsg
            visited_states.append(state)
        elif state == "model":
            n_models += 1
            # ACCURACY records are counted per model.
            n_accuracy = 0
            if n_models > 3:
                return False, "Too many models. Only up to 3 allowed"
            correct, errmsg = model_check(inline)
            correct, errmsg = handle_error(correct, errmsg, inline, line_num, fileName)
            if not correct:
                return correct, errmsg
            # Only the first MODEL record counts towards the section order.
            if n_models == 1:
                visited_states.append(state)
        elif state == "keywords":
            if first_keywords:
                visited_states.append(state)
                first_keywords = False
            correct, errmsg = keywords_check(inline)
            correct, errmsg = handle_error(correct, errmsg, inline, line_num, fileName)
            if not correct:
                return correct, errmsg
        elif state == "accuracy":
            if first_accuracy:
                visited_states.append(state)
                first_accuracy = False
            n_accuracy += 1
            if n_accuracy > 3:
                correct, errmsg = False, "ACCURACY: too many ACCURACY records"
            else:
                correct, errmsg = accuracy_check(inline)
            # Both outcomes go through handle_error with the full argument
            # list, so ACCURACY errors carry the same file/line prefix as
            # every other record type.
            correct, errmsg = handle_error(correct, errmsg, inline, line_num, fileName)
            if not correct:
                return correct, errmsg
        elif state == "go_prediction":
            correct, errmsg = go_prediction_check(inline)
            correct, errmsg = handle_error(correct, errmsg, inline, line_num, fileName)
            if not correct:
                return correct, errmsg
            if first_prediction:
                visited_states.append(state)
                first_prediction = False
        elif state == "end":
            correct, errmsg = end_check(inline)
            correct, errmsg = handle_error(correct, errmsg, inline, line_num, fileName)
            if not correct:
                return correct, errmsg
            visited_states.append(state)

    # All records are individually valid; now verify the section order.
    if visited_states not in (legal_states1, legal_states2, legal_states3):
        errmsg = "Error in " + fileName + "\n"
        errmsg += "Sections found in the file: [" + ", ".join(visited_states) + "]\n"
        errmsg += "file not formatted according to CAFA 3 specs\n"
        errmsg += "Check whether all these record types are in your file in the correct order\n"
        errmsg += "AUTHOR, MODEL, KEYWORDS, ACCURACY (optional), predictions, END"
        return False, errmsg
    return True, "%s, passed the CAFA 3 GO prediction format checker" % fileName