-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_prep_zinc_qed.py
89 lines (71 loc) · 2.84 KB
/
data_prep_zinc_qed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import rdkit.Chem as Chem
import glob
from tqdm import tqdm
import pickle
from utils.dataloader import QM9Dataset
import numpy as np
import os
from utils.helpers import download_files
import pandas as pd
periodic_table = {
1: 'H',
6: 'C',
7: 'N',
8: 'O',
9: 'F',
15:'P',
16:'S',
17:'Cl',
35:'Br',
53:'I'
}
molecules_Adjacency_list = []
ions = 0
if not os.path.exists("data/250k_rndm_zinc_drugs_clean_3.csv"):
print("Data not found locally")
download_files("https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv")
values = []
smiles = []
with open('data/250k_rndm_zinc_drugs_clean_3.csv') as f:
next(f)
for line in f:
info = line.split('",')
if len(info)>1:
values.append(info[1].strip().split(','))
else:
smiles.append(info[0].strip('"').strip())
data =np.asarray(values,dtype=np.float)
data = pd.DataFrame({'smiles':smiles,'LogP':data[:,0],'QED':data[:,1],'SAS':data[:,2]})
test_samples=data.sample(frac=0.2)
remaining_samples = data.loc[~data.index.isin(test_samples.index)]
train_samples = remaining_samples[remaining_samples.QED>0.75]
for name, dataset in zip(['train','test'],[train_samples, test_samples]):
for i, sample in tqdm(dataset.iterrows(),total=len(dataset)):
smiles = sample.smiles
molecule = Chem.MolFromSmiles(smiles)
if '+' in smiles or '-' in smiles:
ions += 1
#continue
#convert aromatic bonds to single/double
Chem.Kekulize(molecule)
# Hydrogen is normally implicit, but we need them to exist explicitly in the molecule
molecule = Chem.AddHs(molecule)
molecule_list = []
for atom in molecule.GetAtoms():
molecule_list.append(periodic_table[atom.GetAtomicNum()])
Adj = Chem.rdmolops.GetAdjacencyMatrix(molecule)
Adj2 = np.zeros(Adj.shape, dtype=np.int32)
for bond in molecule.GetBonds():
start = bond.GetBeginAtomIdx()
end = bond.GetEndAtomIdx()
bond_type = bond.GetBondType()
Adj2[start,end] = int(bond_type)
Adj2[end,start] = int(bond_type)
# convert list of atoms to numpy array for easier computations later
molecule_list = np.asarray(molecule_list)
charges = [atom.GetFormalCharge() for atom in molecule.GetAtoms()]
num_neighbours = Adj2.sum(axis=1)
molecules_Adjacency_list.append([molecule_list, Adj, Adj2, {'QED':sample.QED,'SAS':sample.SAS,'LogP':sample.LogP}, smiles, charges, num_neighbours])
if not os.path.exists("data/zinc_properties") : os.makedirs("data/zinc_properties")
pickle.dump(molecules_Adjacency_list,open(f'data/zinc_properties/adjacency_matrix_{name}.pkl','wb'))
print(f" {ions} molecules containing ions")