-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathfind_CAPS.py
executable file
·108 lines (87 loc) · 4.14 KB
/
find_CAPS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/python2.6
##find snps that condition CAPS
##usage find_CAPS.py [-h] -i <reference file> -g <gff file> -o <output file>
#Copyright 2012 John McCallum & Leshi Chen
#New Zealand Institute for Plant and Food Research
#This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys
from Bio import SeqIO
from BCBio import GFF
from Bio.Restriction import *
import argparse
###This list is limited to economical enzymes performing well in PCR buffer
rest_batch = RestrictionBatch(
[AluI, ApaI, BamHI, BbrPI, BfrI, ClaI, DdeI, DpnII, DraI, EcoRI,
HaeIII, HindII, HinfI, HpaI, PvuII, RsaI, SacI, Sau3AI, SmaI, TaqI])
parser = argparse.ArgumentParser(description='Identify SNPs that condition restriction polymorphisms')
parser.add_argument('-i', type=argparse.FileType('r'), help="input sequence file, required", dest='in_file', required=True)
parser.add_argument('-g', type=argparse.FileType('r'), help="input gff file with SNP and indels, required", dest='gff_file', required=True)
parser.add_argument('-o', type=argparse.FileType('w'), help="output file , optional", dest='caps_out_file', required=False)
try:
my_args = parser.parse_args()
except SystemExit:
print("argument is missing or invalid, exiting...\n")
sys.exit(0)
in_seq_handle = my_args.in_file
in_gff_handle = my_args.gff_file
if my_args.caps_out_file != None:
out_file_handle = my_args.caps_out_file
else:
out_file_handle = sys.stdout
##use iterator
for myrec in SeqIO.parse(in_seq_handle, "fasta"):
##create single-entry dictionary to accept gff annotations from parser
seq_dict = {myrec.id:myrec}
##note that this filters out only SNP features
limit_info = dict(gff_id = [myrec.id] ,gff_type = ['SNP'])
in_gff_handle.seek(0)
##parse annotations into
annotations = [r for r
in GFF.parse(in_gff_handle,
base_dict=seq_dict,
limit_info=limit_info)]
##if there are any for this sequence, proceed
if annotations:
rec=annotations[0]
for feat in rec.features:
fstart=feat.location.start.position
fend=feat.location.end.position
if 20 < fstart < len(rec) - 20:
#just work with +/- 20 bp, ignoring SNPS within this
#distance from ends
fseq=rec.seq[fstart-20:fstart+20]
ref_seq = rec.seq[fstart-20:fstart+20]
variant_seq = ref_seq.tomutable()
#mutate the variant
variant_seq[20]= feat.qualifiers['Variant_seq'][0]
variant_seq = variant_seq.toseq()
#digest the sequences
ref_cuts = rest_batch.search(ref_seq)
var_cuts = rest_batch.search(variant_seq)
#print
for enz in ref_cuts:
kr = set(ref_cuts[enz])
km = set(var_cuts[enz])
outputstr=[rec.id, fstart +1,fend+1,feat.id,enz]
if len(kr) > len(km):
outputstr.append("reference")
#print('\t'.join(map(str,outputstr)))
out_file_handle.write('\t'.join(str(x) for x in outputstr)+'\n')
elif len(kr) < len(km):
outputstr.append("variant")
#print('\t'.join(map(str,outputstr)))
out_file_handle.write('\t'.join(str(x) for x in outputstr)+'\n')
in_gff_handle.close()
in_seq_handle.close()
out_file_handle.close()