-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_apostruct.py
130 lines (123 loc) · 4.55 KB
/
get_apostruct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from prody import *
import argparse
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import AllChem
from pymol import cmd
import numpy as np
from scipy.spatial.distance import cdist
import os
from multiprocessing import Pool, cpu_count
def get_apo(id):
dir='/scratch/rishal/v2019-other-PL/'+id+'/'
protein_file=dir+id+'_protein.pdb'
ligand_file=dir+id+'_ligand.sdf'
pathPDBFolder('/scratch/rishal/apo_Structs/')
apo_file=open('apo_structs.txt','a')
blast_fail=open('blast_fail.txt','a')
#load protein file, get final chain seq, save to fasta and get entire protein length
protein=parsePDB(protein_file)
chains=sorted(list(set(protein.getChids())))
seq=protein[chains[-1]].getSequence()
fasta=open(dir+'fasta.txt','w')
fasta.write('>'+id+'\n')
fasta.write(seq)
fasta.close()
protein_seq=''
for chain in chains:
if chain==' ':
continue
else:
protein_seq+=protein[chain].getSequence()
protein_len=len(protein_seq)
#load ligand file and get ligand coords
mol = Chem.MolFromMolFile(ligand_file,sanitize=False)
c=mol.GetConformer()
ligand_coords=c.GetPositions()
#blast sequence and output results
try:
os.system('/scratch/rishal/ncbi-blast-2.11.0+/bin/blastp -query '+dir+'fasta.txt -db /scratch/rishal/pdbaa -out '+dir+'blast.txt -outfmt 6')
blast_results=open(dir+'blast.txt','r').readlines()
except:
blast_fail.write(id+'\n')
return
#iterate through hits, full alignment with pymol and check if ligands are close
checked_pdbs=[]
for line in blast_results:
print(line)
pdb_id=line.split()[1].split('_')[0].lower()
align_identity=float(line.split()[2])
if pdb_id in checked_pdbs:
continue
if align_identity<80:
continue
if pdb_id==id:
continue
checked_pdbs.append(pdb_id)
pdb=fetchPDB(pdb_id,tp='http')
if pdb is None:
continue
cmd.load(pdb)
cmd.load(protein_file)
cmd.align( pdb_id+'////CA', id+'////CA',object='aln')
cmd.align( pdb_id+'////CA', id+'////CA',cycles=0, transform=0,object='aln')
raw_aln = cmd.get_raw_alignment('aln')
idx2resn = {}
idx2chain = {}
cmd.iterate('aln', 'idx2resn[model, index] = resn', space={'idx2resn': idx2resn})
cmd.iterate('aln', 'idx2chain[model, index] = chain', space={'idx2chain': idx2chain})
count_identity=0
chains_hit=[]
for idx1, idx2 in raw_aln:
if idx2resn[idx1]==idx2resn[idx2]:
count_identity+=1
chains_hit.append(idx2chain[idx2])
chains_hit=sorted(list(set(chains_hit)))
print(chains_hit)
chain_seq=''
for chain in chains_hit:
chain_seq+= 'chain '+chain+' or '
chain_seq=' '.join(chain_seq.split()[:-1])
cmd.save(dir+'apo_temp.pdb',pdb_id)
cmd.delete(pdb_id)
cmd.delete(id)
apo_prot=parsePDB(dir+'apo_temp.pdb')
apo_seq=''
for chain in chains_hit:
apo_seq+=apo_prot[chain].getSequence()
hit_len=len(apo_seq)
# sequence identity and coverage thresholded at 0.8
if count_identity/hit_len < 0.8 or count_identity/protein_len < 0.8:
os.remove(dir+'apo_temp.pdb')
continue
ligand_apo = apo_prot.select('not protein and not water and not ion')
if ligand_apo is None:
apo_file.write(id+'\n')
apo_prot=apo_prot.select(chain_seq + ' or ion')
apo_prot=apo_prot.select('protein or ion and not water')
#writePDB(dir+id+'_apo.pdb',apo_prot)
#os.remove(dir+'apo_temp.pdb')
break
#check if hit has ligand in same binding pocket
ligand_apo_coords=ligand_apo.getCoords()
dist=cdist(ligand_coords,ligand_apo_coords)
if np.min(dist) <=4:
os.remove(dir+'apo_temp.pdb')
continue
else:
apo_file.write(id+'\n')
apo_prot=apo_prot.select(chain_seq + ' or ion')
apo_prot=apo_prot.select('protein or ion and not water')
#writePDB(dir+id+'_apo.pdb',apo_prot)
#os.remove(dir+'apo_temp.pdb')
break
'''parser = argparse.ArgumentParser()
parser.add_argument('-s', '--string', type=str, help="comma seperated pdb ids for apo query", required=True)
opt = parser.parse_args()
ids=opt.string.split(',')
ncpus=cpu_count()
pool = Pool(ncpus)
pool.map(get_apo,ids)
'''
#for id in ids:
# get_apo(id)
get_apo('2aq7')