forked from jmp1985/metrix-database
-
Notifications
You must be signed in to change notification settings - Fork 0
/
protein_parser.py
144 lines (120 loc) · 3.32 KB
/
protein_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
from __future__ import division
import sqlite3
import os
#from rdkit import Chem
#import textract
#import PyPDF2
class ProteinParser(object):
    '''
    A class to parse protein information

    Reads the first chain of a FASTA-style sequence file, works out the
    number of atoms and the approximate molecular weight of that chain and
    stores both values in the Protein table of the database behind the
    handle.
    '''

    # Number of atoms (hydrogens included) that each residue contributes
    # once it is part of a peptide chain, keyed by one-letter code.
    _ATOMS_PER_RESIDUE = {
        'A': 10,
        'R': 23,
        'N': 14,
        'D': 13,
        'C': 11,
        'Q': 17,
        'E': 16,
        'G': 7,
        'H': 17,
        'I': 19,
        'L': 19,
        'K': 21,
        'M': 17,
        'F': 20,
        'P': 14,
        'S': 11,
        'T': 14,
        'W': 24,
        'Y': 21,
        'V': 16,
    }

    # Residue masses rounded to whole Daltons, keyed by one-letter code.
    # Tryptophan is 186 Da; it was previously entered as 86.
    _MASS_PER_RESIDUE = {
        'A': 71,
        'R': 156,
        'N': 114,
        'D': 115,
        'C': 103,
        'Q': 128,
        'E': 129,
        'G': 57,
        'H': 137,
        'I': 113,
        'L': 113,
        'K': 128,
        'M': 131,
        'F': 147,
        'P': 97,
        'S': 87,
        'T': 101,
        'W': 186,
        'Y': 163,
        'V': 99,
    }

    # The two hydrogens and one oxygen at the chain termini (one water)
    # are not part of any residue, so they are added once per chain.
    _TERMINAL_ATOMS = 3
    _TERMINAL_MASS = 18

    def __init__(self, handle):
        '''
        Initialise the class with the handle

        :param handle: database connection; must provide cursor() and
                       commit()
        '''
        self.handle = handle

    def _first_chain_totals(self, filename):
        '''
        Sum atom count and mass over the first chain in a sequence file

        The first line of the file is skipped as the chain header. Reading
        stops at the next header line (one starting with '>'), so only the
        first chain is counted. Characters that are not one of the twenty
        upper-case one-letter codes are ignored.

        :param filename: path of the sequence file
        :returns: tuple (number of atoms, molecular weight), both including
                  the terminal water
        '''
        atom_num = self._TERMINAL_ATOMS
        mw_chain = self._TERMINAL_MASS
        with open(filename, 'r') as seq:
            # Skip the header of the first chain
            next(seq)
            for line in seq:
                if line.startswith('>'):
                    # Header of the next chain: the first chain is complete
                    break
                for letter in line:
                    # Both tables share the same keys, so one check covers
                    # both look-ups; newlines and unknown codes fall through
                    if letter in self._ATOMS_PER_RESIDUE:
                        atom_num += self._ATOMS_PER_RESIDUE[letter]
                        mw_chain += self._MASS_PER_RESIDUE[letter]
        return atom_num, mw_chain

    def add_protein(self, pdb_id, filename):
        '''
        Add protein details to the database

        Makes sure there is a Protein row linked to the PDB_id row of
        pdb_id, adds the columns No_atom_chain and MW_chain to Protein if
        they are missing, writes the values computed from filename into
        that row and commits.

        :param pdb_id: PDB identifier as stored in PDB_id.pdb_id
        :param filename: path of the sequence file for this identifier
        :raises TypeError: if pdb_id has no row in the PDB_id table
        '''
        # Get the sqlite cursor
        cur = self.handle.cursor()

        atom_num, mw_chain = self._first_chain_totals(filename)

        print('Reading: %s for pdb id: %s' % (filename, pdb_id))

        protein_data = {
            'No_atom_chain': atom_num,
            'MW_chain': mw_chain,
        }

        # Inserts acquired information into relevant tables
        # Inserts pdb_id; values are bound with placeholders so that the
        # identifier is never interpreted as SQL
        cur.execute('''
            INSERT OR IGNORE INTO Protein
            (pdb_id_id) SELECT id FROM PDB_id
            WHERE PDB_id.pdb_id = ?
            ''', (pdb_id,))
        cur.execute('''
            SELECT id FROM PDB_id WHERE pdb_id = ?
            ''', (pdb_id,))
        # fetchone() returns None for an unknown pdb_id, which makes the
        # subscript below raise TypeError
        pdb_pk = cur.fetchone()[0]

        # Adds necessary columns. Column names cannot be bound as
        # parameters; they come from the fixed keys above, not from input.
        for column in protein_data:
            try:
                cur.execute('ALTER TABLE Protein ADD "%s" TEXT' % column)
            except sqlite3.OperationalError:
                # Raised when the column already exists from an earlier call
                pass

        # Values are written as text, matching the TEXT column type
        for column, value in protein_data.items():
            cur.execute('''
                UPDATE Protein SET "%s" = ?
                WHERE pdb_id_id = ?
                ''' % column, (str(value), pdb_pk))
        self.handle.commit()