This repository has been archived by the owner on Feb 16, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 15
/
convertGenbank2table.py
executable file
·465 lines (401 loc) · 22.5 KB
/
convertGenbank2table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Libraries needed
from __future__ import print_function
import fileinput, optparse
import os, sys, csv, getpass, socket, shutil, re
import Bio
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein
from FileLocator import *
from GenbankHandler import *
RECOMMENDED_BIOPYTHON_VERSION = 1.61
'''
Produce an ITEP-compatible tab-delimited file ("raw file") from a Genbank file.
Standards enforced:
0: All organism IDs must match the regex \d+\.\d+
1: All raw files must be in the raw/ directory and named [organismid].txt
2: All genbank files must be in the genbank/ directory and named [organismid].gbk
3: All genbank files must have a corresponding raw file.
4: All raw files must have a corresponding genbank file.
5: If there are multiple genbank files (for multiple contigs) they should be concatinated and placed in the genbank folder as a single file with the name [organismid].gbk
6: The raw files must have a specific format (this is to be dealt with automatically if using standardized input functions):
- Column order as specified in the help files.
- Gene IDs are in the format fig\|\d+\.\d+\.peg\.\d+ where the first \d+\.\d+ is the organism ID of the corresponding genes
- Strand must be + or -
- "Start" is the nucleotide position of the first transcribed base (i.e. for - strand genes, the start position will be bigger than the stop).
Optionally we add the ITEP IDs to the genbank file, but if you want to do this you must
be prepared for the contig IDs to be truncated to 16 characters (due to Biopython limitations we
cannot write the genbank file if they are longer than this).
'''
def lookupStrainID(accession):
#search NCBI for all IDs (the unique numerical key, not just accession)
genbank = Entrez.read(Entrez.esearch(db="nucleotide", term=accession))
IDlist = genbank['IdList']
strainIDs=[]
#there can me multiple records returned
for thisID in IDlist:
summaries = Entrez.read(Entrez.esummary(db="nucleotide", id=thisID))
for s in summaries:
strainIDs.append(s['TaxId'])
strainIDs = list(set(strainIDs))
assert len(strainIDs)==1, "Note: more than one ID returned from query of NCBI % s" % accession
return strainIDs[0]
def info_from_genbank(gb_seqrec):
info = {}
# Not all Genbank files (i.e. those from RAST fall into this category) actually
# assign the contig ID in the ID field but they call it a "name" instead...
# Biopython automatically gives those an ID of "unknown".
if gb_seqrec.id == "unknown":
info["id"] = gb_seqrec.name
else:
info["id"] = gb_seqrec.id
info["number_of_features"] = len(gb_seqrec.features)
numcds = len([f for f in gb_seqrec.features if (f.type =='CDS')])
info["number_of_cds"] = numcds
if gb_seqrec.description:
info["gb_description"] = gb_seqrec.description
else:
info["gb_description"] = "unknown"
# The "try" part will only work for Entrez-derived GBK files but we want to be more
# generic. If we can't find it from tehre we try to get it from the dbxref and if it fails there too
# there's nothing we can do.
sys.stderr.write("Attempting to query Entrez for the taxID...\n")
try:
info["gi"]= gb_seqrec.annotations['gi']
if gb_seqrec.name:
info["gb_name"] = gb_seqrec.name
info["taxon"] = lookupStrainID(gb_seqrec.id)
except:
sys.stderr.write("Querying Entrez failed. Getting taxID from the genbank file...\n")
for record in gb_seqrec.features:
if record.type == "source":
for xref in record.qualifiers["db_xref"]:
if "taxon" in xref:
taxid = re.search("(\d+)", xref).group(1)
info["taxon"] = taxid
break
if "taxon" in info:
break
if "taxon" not in info:
sys.stderr.write("ERROR: Could not identify taxonID from querying Entrez or from the Genbank file itself...\n")
exit(2)
if gb_seqrec.annotations:
info.update([("gb_annotation: "+k,v) for k, v in list(gb_seqrec.annotations.items())])
return info
def info_from_feature(feature):
info = {}
# An extra space in the amino acid sequences exists which
# throws off the validator and may cause downstream problems.
info["aa_sequence"] = feature.qualifiers['translation'][0].strip()
if "protein_id" in info:
info["aliases"] = feature.qualifiers['protein_id'][0]
else:
info["aliases"] = ""
if "product" in feature.qualifiers:
info["function"] = feature.qualifiers['product'][0]
else:
info["function"] = "NO_FUNCTION_ASSIGNED"
info["figfam"] = ""
info["evidence_codes"] = ""
if feature.type =='CDS':
info["type"] = 'peg'
#add this to preserve other types
#info["type"] = feature.type
#Must add one to biopython's 0 indexed to get the original genbank one indexed counting
#However the stop is slice notation so it is already 1 higher than it "should" be.
# Thus we only add 1 to the start.
# I verified that this gives the correct answer relative to what is in the PubSEED
# for some + and - strand genes in E. coli.
try:
info["start"] = int(feature.location.start) + 1
info["stop"] = int(feature.location.end)
except TypeError:
# Old versions of Biopython use an access function instead of casting ExactLocation as an int.
# This syntax is marked as obsolete and will probably go away in a future version of Biopython.
info["start"] = feature.location.start.position + 1
info["stop"] = feature.location.end.position
except:
raise
if feature.strand == +1:
info["strand"] = str("+")
if feature.strand == -1:
info["strand"] = str("-")
# invert the start and stop locations...
info["start"], info["stop"] = info["stop"], info["start"]
return info
def info_from_record(record):
info = {}
info["nucleotide_sequence"] = str(record.seq)
#info["Nucleotide ID"] = record.id
#info["Nucleotide Description"] = record.description
#if record.dbxrefs:
# info["Database cross-references"] = ";".join(record.dbxrefs)
return info
def genbank_extract(ptr, version_number):
'''Extract data from a genbank file...returns some organism-specific data,
a dictionary from gene ID to a list of aliases to that gene ID (including the
original IDs from the genbank file), and the data needed to build the raw data
table required by ITEP.'''
gb_seqrec_multi = SeqIO.parse(ptr, "genbank")
#lists to store extracted seqs
genes = []
geneidToAlias = {}
counter = 0
match_SEED_xref = re.compile("SEED\:(fig\|\d+\.\d+\.peg\.\d+)")
match_KEGG_xref = re.compile("KEGG\:.*?\+(.*)")
# Have ITEP IDs been added previously?
match_ITEP_xref = re.compile("ITEP\:(fig\|\d+\.\d+\.peg\.\d+)")
# If we have a SEED or ITEP ID we want to get the version number out of that.
get_version_number = re.compile("fig\|\d+\.(\d+)\.peg\.\d+")
# These are used to help avoid colliding namespaces.
seedConflictCheck = False
itepConflictCheck = False
orginfo = dict()
cnt = 0
for gb_seqrec in gb_seqrec_multi:
orginfo = info_from_genbank(gb_seqrec)
orginfo["net_version_number"] = version_number
cnt += 1
sys.stderr.write("Extracting data on features for genbank record number %d...\n" %(cnt))
for feature in gb_seqrec.features:
if feature.type =="CDS":
#check there is only one translation and get info
#TODO - Should we attempt to translate manually if no translations are preent in the gbk file?
if 'translation' not in feature.qualifiers:
sys.stderr.write("WARNING: CDS found with no translation\n")
sys.stderr.write("Qualifiers:\n")
for key in feature.qualifiers:
sys.stderr.write("%s\t%s\n" %(key, feature.qualifiers[key]))
continue
assert len(feature.qualifiers['translation'])==1
counter += 1
if ( counter % 100 == 0 ):
sys.stderr.write("Now working on feature number %d ... \n" %(counter))
geneinfo = {}
#get aa info
geneinfo.update(info_from_feature(feature))
#get na info
record = feature.extract(gb_seqrec)
geneinfo.update(info_from_record(record))
# Handle supported xrefs: KEGG (locus tags by another name), SEED, and ITEP references
aliases = []
geneid = None
if "db_xref" in feature.qualifiers:
for xref in feature.qualifiers["db_xref"]:
match = match_SEED_xref.match(xref)
if match is not None:
geneid = match.group(1)
seedConflictCheck = True
versionNumberMatch = get_version_number.match(geneid)
realVersionNumber = versionNumberMatch.group(1)
if realVersionNumber != version_number:
sys.stderr.write("WARNING: Version number %s from SEED ID will overwrite provided version number %s\n" %(realVersionNumber, version_number))
version_number = realVersionNumber
orginfo["net_version_number"] = realVersionNumber
match = match_ITEP_xref.match(xref)
if match is not None:
# If there is both an ITEP ID and a SEED ID they must match
if geneid is not None and geneid != match.group(1):
raise IOError("ERROR: ITEP ID did not match with SEED ID in provided Genbank file - this should never happen.")
geneid = match.group(1)
itepConflictCheck = True
versionNumberMatch = get_version_number.match(geneid)
realVersionNumber = versionNumberMatch.group(1)
if realVersionNumber != version_number:
sys.stderr.write("WARNING: Version number %s from ITEP ID will overwrite provided version number %s\n" %(realVersionNumber, version_number))
version_number = realVersionNumber
orginfo["net_version_number"] = realVersionNumber
match = match_ITEP_xref.match(xref)
# Note that one SEED gene can have more than one KEGG match (i.e. go with more than one locus tag).
# This is OK - I'll just capture all of them here.
match = match_KEGG_xref.match(xref)
if match is not None:
aliases.append(match.group(1))
# If we got our genbank file from somewhere else (including from RAST) we need to just make an ID in the right format.
if geneid is None:
# This flag is used to make sure that if we pull out ONE SEED ID then ALL of them have to be pulled out in the same manner.
# If this is not the case we will most likely get duplicated IDs which is BAD.
if seedConflictCheck:
raise IOError("ERROR: In order to avoid namespace collisions, if at least one gene has a PubSEED ID attached then ALL of them must.")
elif itepConflictCheck:
raise IOError("ERROR: In order to avoid namespace collisions, if at least one gene has an ITEP ID already attached then ALL of them must.")
else:
geneid = "fig|" + str(orginfo["taxon"]) + "." + str(version_number) + ".peg." + str(counter)
pass
pass
geneinfo["feature_id"] = geneid
geneinfo["contig_id"] = orginfo["id"]
geneinfo["source_description"] = orginfo["gb_description"]
if "protein_id" in feature.qualifiers:
geneinfo["location"] = feature.qualifiers["protein_id"][0]
else:
geneinfo["location"] = ""
genename = geneinfo["aliases"]
genedesc = geneinfo["function"] + " " + orginfo["gb_description"]
geneinfo["gene_description"] = genedesc
# Add locus tag and existing feature_id to list of aliases
if "protein_id" in feature.qualifiers:
aliases.append(feature.qualifiers["protein_id"][0])
if "locus_tag" in feature.qualifiers:
aliases.append(feature.qualifiers["locus_tag"][0])
if "gene" in feature.qualifiers:
aliases.append(feature.qualifiers["gene"][0])
geneidToAlias[geneid] = aliases
genes.append(geneinfo)
return orginfo, genes, geneidToAlias
#Entrez requires an email, if not set namually, this guesses one
email = None
guessemail = getpass.getuser() + '@' + socket.getfqdn()
if email == None:
email = guessemail
Entrez.email = email
if __name__ == '__main__':
usage="%prog [options] -g genbank_file"
description=''' The purpose of this script is to take in a Genbank file (.gbk)
with multiple contig genbank files concatinated and
automatically organize three pieces of information required for input into ITEP:
1: A genbank file with the coorrect name reflecting the taxID and a version number
2: A tab-delimited file with the required format (see documentaion in the header of this file)
3: An augmented aliases file containing locus tag and geneId information from the genbank file if available.
You should NOT put your original genbank file in the $ROOT/genbank/ folder - instead put it somewhere else and call this
script. This script will automatically reformat it and put the reformatted file into the /genbank/ folder.
The genbank file is placed in $ROOT/genbank/, the tab-delimited file in $ROOT/raw/ and the aliases
file is appended to any existing aliase file in the location $ROOT/aliases/aliases
'''
parser = optparse.OptionParser(usage=usage, description=description)
parser.add_option("-g", "--genbank_file", help="Input genbank file concatinated across contigs (REQUIRED)", action="store", type="str", dest="genbank_file", default=None)
parser.add_option("-o", "--org_file", help="(OPTIONAL) a file to which to dump organism data.", action="store", type="str", dest="org_file", default=None)
parser.add_option("-r", "--replace",
help="If specified, replaces old data related to the derived organism ID with the new data (D: Throws an error if an organism already is present with the derived organism ID)",
action="store_true", dest="replace", default=False)
parser.add_option("-v", "--version_number", help="An integer used to distinguish between multiple genbank files with the same taxID (D:88888). This will become the second number in the organism ID",
action="store", type="int", dest="version_number", default=88888)
(options, args) = parser.parse_args()
if options.genbank_file is None:
sys.stderr.write("ERROR: Genbank_file (-g) is a required argument\n")
exit(2)
rootdir = locateRootDirectory()
gbk_dir = os.path.join(rootdir, "genbank")
raw_dir = os.path.join(rootdir, "raw")
# This should never happen but just in case it did, we will
if not os.path.exists(gbk_dir):
sys.stderr.write("WARNING: Genbank output directory %s did not exist (it is part of the repo and should not be deleted). Attempting to create it...\n")
os.makedirs(gbk_dir)
if not os.path.exists(raw_dir):
sys.stderr.write("WARNING: Raw file output directory %s did not exist (it is part of the repo and should not be deleted). Attempting to create it...\n")
os.makedirs(raw_dir)
# Extract data from the Genbank file
orginfo, genes, aliases = genbank_extract(options.genbank_file, options.version_number)
options.version_number = orginfo["net_version_number"]
organism_id = str(orginfo["taxon"]) + "." + str(options.version_number)
geneout_filename = os.path.join(rootdir, "raw", "%s.txt" %(organism_id))
genbank_filename = os.path.join(rootdir, "genbank", "%s.gbk" %(organism_id))
alias_filename = os.path.join(rootdir, "aliases", "aliases")
# Try to prevent conflics between multiple organisms with the same taxID
if os.path.exists(geneout_filename):
if options.replace:
# Note - to make this really robusst I should probably add a timestamp or something...
bkgeneout_filename = os.path.join(rootdir, "%s.txt.bk" %(organism_id))
sys.stderr.write("""WARNING: Backing up original gene output file %s to location %s in case something went wrong\n""" %(geneout_filename, bkgeneout_filename))
shutil.copyfile(geneout_filename, bkgeneout_filename)
else:
sys.stderr.write("""ERROR: Gene output file %s already exists!
This could indicate a conflict in taxIDs between multiple organisms. Use -r to override this error and
replace the existing file with a new one and remove the existing file, or use a different version number (-v)
if the genomes are really different.\n""" %(geneout_filename))
exit(2)
if os.path.exists(genbank_filename):
if options.replace:
# Note - to make this really robusst I should probably add a timestamp or something...
bkgenbank_filename = os.path.join(rootdir, "%s.txt.bk" %(organism_id))
sys.stderr.write("""WARNING: Backing up original gene output file %s to location %s in case something went wrong\n""" %(genbank_filename, bkgenbank_filename))
shutil.copyfile(geneout_filename, bkgenbank_filename)
else:
sys.stderr.write("""ERROR: Genbank output file %s already exists!
This could indicate a conflict in taxIDs between multiple organisms. Use -r to override this error and
replace the existing file with a new one and remove the existing file, or use a different version number (-v
if the genomes are really different.\n""" %(genbank_filename))
exit(2)
if os.path.exists(alias_filename):
# Lets check to see if we can find our organism in the existing alias file
bk_file = os.path.join(rootdir, "aliases", "aliases.bk")
shutil.copyfile(alias_filename, bk_file)
alias_ptr = open(alias_filename, "w")
# The entire organism ID must match and we must avoid subsets.
tosearch = "|" + organism_id + "."
for line in open(bk_file, "r"):
if tosearch in line:
if not options.replace:
sys.stderr.write("""ERROR: Entries with organism ID %s already exist in the aliases file.
This could indicate a conflict in taxIDs between multiple organisms. Use -r to override this error and
replace the existing entries in the aliases file with new entries or use a different version number (v) if the
genomes are really different.""" %(organism_id))
alias_ptr.close()
os.rename(bk_file, alias_filename)
exit(2)
else:
continue
else:
alias_ptr.write(line)
alias_ptr.close()
# Generate raw file.
names = ["contig_id", #from gi of gb file
"feature_id", #taxonID from NCBI lookup of accession, geneID from the db_xref GI in feature
"type", #CDS = peg
"location", #ignored by ITEP, source accession here
"start", #+1 index and rev if on neg strand
"stop", #+1 index and rev if on neg strand
"strand", #converted to + or -
"function", #from feature product
"aliases", #ignored by ITEP, from gene accession
"figfam", #ignored by ITEP, empty for all
"evidence_codes", #ignored by ITEP, empty for all
"nucleotide_sequence",#as recorded in feature
"aa_sequence"] #as recorded in feature, not translated manually
geneout_file = open(geneout_filename, 'w')
geneout = csv.DictWriter(geneout_file, fieldnames = names, delimiter="\t")
#geneout.writeheader()
for gene in genes:
geneout.writerow(dict([(n, gene[n]) for n in names]))
pass
geneout_file.close()
sys.stderr.write("Text file saved as %s\n" % geneout_filename)
# Now we need to write a new Genbank file.
# We have to make a temporary one in order to work around biopython issues.
tbl = [ line.strip("\r\n").split("\t") for line in open(geneout_filename, "r") ]
multi_gbk_object = SeqIO.parse(options.genbank_file, "genbank")
# Add the ITEP IDs - produces temporary IDs that we need to turn back into the originals
gb_seqrec_id, newToOriginalName = addItepGeneIdsToGenbank(multi_gbk_object, tbl)
# Save the resulting file with a temporary name
import random
tmp_fname = str(random.randint(0, 10)) + ".gbk"
tmp_fid = open(tmp_fname, "w")
SeqIO.write(gb_seqrec_id, tmp_fid, "genbank")
tmp_fid.close()
# Replace the temporary IDs with new IDs
tmp_fid = open(tmp_fname, "r")
fid = open(genbank_filename, "w")
replaceTemporaryIdsWithOriginalIds(tmp_fid, newToOriginalName, fid)
tmp_fid.close()
fid.close()
os.remove(tmp_fname)
sys.stderr.write("Genbank file saved as %s\n" % genbank_filename)
# IMPORTANT: Append, don't use "w" here (we don't want to blow away all the different organism entries...)
alias_file = open(alias_filename, "a+")
for geneid in aliases:
for alias in aliases[geneid]:
alias_file.write("%s\t%s\n" %(geneid, alias))
alias_file.close()
#Write out the organism data
if options.org_file is not None:
names = list(orginfo.keys())
names.sort()
orgout_file = open(options.orgout_file, "w")
orgout = csv.DictWriter(orgout_file, fieldnames = names)
orgout.writeheader()
for org in orginfos:
orgout.writerow(org)
orgout_file.close()