-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTheo.py~
111 lines (73 loc) · 3.01 KB
/
Theo.py~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from typing import *
from eliot_main import get_genetic_code, write_csv, read_csv, read_fasta
def read_flat_file(filename):
    """Load a whole file into memory and return it as a single string.

    This function is written by Theo Gauvrit.

    Args:
        filename: path of the file to open (text mode, read-only).

    Returns:
        The complete content of the file as one string, newline
        characters included.
    """
    # The context manager closes the handle even if read() raises,
    # which a bare open()/close() pair does not guarantee.
    with open(filename, "r") as handle:
        return handle.read()  # 'LOCUS NM_000518 ' ...
def reversed_complement(dna_seq):
    """Build the reverse complement of a DNA sequence.

    This function is written by Theo GAUVRIT.

    Args:
        dna_seq: DNA sequence made of A, C, G and T (any letter case).

    Returns:
        The upper-case reverse complement of the sequence.

    Raises:
        KeyError: if the sequence holds a character other than
            A, C, G or T once upper-cased.
    """
    pairing = {"A": "T", "C": "G", "G": "C", "T": "A"}
    # Walk the upper-cased sequence from its last base to its first and
    # swap every base for its partner.
    return "".join(pairing[base] for base in reversed(dna_seq.upper()))
def find_orf(seq: str, code_table_id: int):
    """Locate start and stop codons of a sequence in the six reading frames.

    NOTE(review): ORFs are not assembled here; the function only returns
    the raw codon hits that an ORF search would be built from.

    Args:
        seq: Sequence to analyse.
        code_table_id: NCBI identifier of the translation table used on
            this sequence (passed to ``get_genetic_code``).

    Returns:
        A tuple ``(stop_codons, starts, stops)``:
            stop_codons: list of the codons that the translation table
                maps to '*'.
            starts: list of dicts, one per start codon found.
            stops: list of dicts, one per stop codon found.
        Each dict has two keys:
            pos: 0-based index of the codon's first base in the scanned
                string (``seq`` for positive frames, its reverse
                complement for negative frames).
            frame: 1, 2, 3 on the given strand or -1, -2, -3 on the
                reverse complement.
    """
    # Assumes transl_table is a codon -> amino-acid mapping and that
    # start_table supports ``in`` on codon strings -- TODO confirm against
    # get_genetic_code.
    transl_table, start_table = get_genetic_code(code_table_id)
    stop_codons = [
        codon for codon, residue in transl_table.items() if residue == '*'
    ]

    # reversed_complement() upper-cases its result, so upper-case the
    # forward strand too and scan both strands in the same letter case.
    seq = seq.upper()
    rever = reversed_complement(seq)

    starts = []
    stops = []
    for offset in range(3):
        # Step codon by codon (3 bases) so a position is only reported in
        # the frame it belongs to; stop at len(seq) - 2 so only complete
        # codons are examined.
        for pos in range(offset, len(seq) - 2, 3):
            # Frames are numbered from 1 so that the sign alone tells the
            # strand (with 0-based numbering, frame 0 and -0 are equal).
            for strand, frame in ((seq, offset + 1), (rever, -(offset + 1))):
                codon = strand[pos:pos + 3]
                if codon in start_table:
                    starts.append({"pos": pos, "frame": frame})
                if codon in stop_codons:
                    stops.append({"pos": pos, "frame": frame})
    return stop_codons, starts, stops
if __name__ == "__main__":
    # Manual check: run the codon search on the influenza FASTA file with
    # translation table 11 and print each part of the result on its own line.
    sequence = read_fasta("influenza.fasta")
    stop_codons, start_hits, stop_hits = find_orf(sequence, 11)
    for result in (stop_codons, start_hits, stop_hits):
        print(result)