Skip to content

Commit

Permalink
Merge pull request #32 from ECRL/dev
Browse files Browse the repository at this point in the history
Addition to conversion tools, update to database creation function
  • Loading branch information
tjkessler authored May 30, 2019
2 parents 22d549f + 6c7f4f0 commit 9deab44
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 20 deletions.
19 changes: 16 additions & 3 deletions docs/usage/tools.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@ ECNet databases are comma-separated value (CSV) formatted files that provide inf

Our [databases](https://github.com/ECRL/ECNet/tree/master/databases) directory on GitHub contains databases for cetane number, cloud point, kinetic viscosity, pour point and yield sooting index, as well as a database template.

You can create an ECNet-formatted database with molecule names or SMILES and (optionally) target values. The following programs must be installed for you to do so:
- [Open Babel](http://openbabel.org/wiki/Main_Page) software
- [Java JRE](https://www.oracle.com/technetwork/java/javase/downloads/jre8-downloads-2133155.html) version 6 and above
You can create an ECNet-formatted database with molecule names or SMILES and (optionally) target values. [Java JRE](https://www.oracle.com/technetwork/java/javase/downloads/jre8-downloads-2133155.html) version 6 or above is required to create a database.

Supplied names or SMILES must exist in a text file, one entry per line:
```
Expand Down Expand Up @@ -70,6 +68,8 @@ smiles = get_smiles('Molecule Name')

### SMILES string to MDL Molfile

Creating an MDL file requires [Open Babel](http://openbabel.org/wiki/Main_Page) to be installed.

A text file containing SMILES strings, one per line, is required:

```
Expand Down Expand Up @@ -100,6 +100,19 @@ from ecnet.tools.conversions import mdl_to_descriptors
mdl_to_descriptors('molfile.md', 'descriptors.csv')
```

### SMILES string to QSPR descriptors

A text file containing SMILES strings, one per line, is required, as well as an installation of [Java JRE](https://www.oracle.com/technetwork/java/javase/downloads/jre8-downloads-2133155.html).

*Note: the file extension for the text file containing SMILES strings must be ".smi".*

```python
from ecnet.tools.conversions import smiles_to_descriptors

# A CSV file with descriptors is generated
smiles_to_descriptors('molecules.smi', 'descriptors.csv')
```

## ECNet .prj file usage

Once an ECNet project has been created, the resulting .prj file can be used to predict properties for new molecules. A text file containing names or SMILES strings of new molecules, one per line, is required in addition to the .prj file.
Expand Down
77 changes: 74 additions & 3 deletions ecnet/tools/conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,77 @@ def get_smiles(name):
return [m.isomeric_smiles for m in get_compounds(name, 'name')]


def smiles_to_descriptors(smiles_file, descriptors_csv, fingerprints=False):
    '''Generates QSPR descriptors from supplied SMILES file using
    PaDEL-Descriptor

    Args:
        smiles_file (str): path to source SMI file; must end in `.smi`
            (case-insensitive)
        descriptors_csv (str): path to resulting CSV file w/ descriptors
        fingerprints (bool): if True, generates molecular fingerprints instead
            of QSPR descriptors

    Returns:
        list: list of dicts, where each dict is a molecule populated with
            descriptor names (keys) and values

    Raises:
        ReferenceError: if no `java` executable is found on the PATH
        ValueError: if `smiles_file` does not have a SMI extension
        Exception: whatever the third consecutive failed PaDEL-Descriptor
            invocation raised (e.g. a timeout; each attempt is limited to
            15 seconds)
    '''

    if which('java') is None:
        raise ReferenceError(
            'Java JRE 6+ not found (required for PaDEL-Descriptor)'
        )

    is_smi = compile(r'.*\.smi$', IGNORECASE)
    if is_smi.match(smiles_file) is None:
        raise ValueError('SMILES file must have a SMI extension: {}'.format(
            smiles_file
        ))

    # Both modes share every argument except the calculation flags, so the
    # command line is assembled once instead of duplicating the whole call
    if fingerprints:
        mode_flags = ['-fingerprints']
    else:
        mode_flags = ['-2d', '-3d']
    command = ['java', '-jar', _PADEL_PATH] + mode_flags + [
        '-retainorder',
        '-convert3d',
        '-retain3d',
        '-dir',
        smiles_file,
        '-file',
        descriptors_csv
    ]

    # The context manager guarantees the null-device handle is closed, even
    # when the final attempt re-raises
    with open(devnull, 'w') as dn:
        for attempt in range(3):
            try:
                call(command, stdout=dn, stderr=dn, timeout=15)
                break
            except Exception:
                # Retry twice; surface the error from the third failure with
                # its original traceback
                if attempt == 2:
                    raise

    with open(descriptors_csv, 'r', encoding='utf-8') as desc_file:
        return list(DictReader(desc_file))


def smiles_to_mdl(smiles_file, mdl_file):
'''Invoke Open Babel to generate an MDL file containing all supplied
molecules; requires Open Babel to be installed externally
Expand Down Expand Up @@ -71,7 +142,7 @@ def smiles_to_mdl(smiles_file, mdl_file):
'-O',
mdl_file,
'--gen3D'
], stdout=dn, stderr=dn, timeout=3600)
], stdout=dn, stderr=dn, timeout=15)
break
except Exception as e:
if attempt == 2:
Expand Down Expand Up @@ -115,7 +186,7 @@ def mdl_to_descriptors(mdl_file, descriptors_csv, fingerprints=False):
mdl_file,
'-file',
descriptors_csv
], stdout=dn, stderr=dn, timeout=600)
], stdout=dn, stderr=dn, timeout=15)
break
else:
call([
Expand All @@ -130,7 +201,7 @@ def mdl_to_descriptors(mdl_file, descriptors_csv, fingerprints=False):
mdl_file,
'-file',
descriptors_csv
], stdout=dn, stderr=dn, timeout=600)
], stdout=dn, stderr=dn, timeout=15)
break
except Exception as e:
if attempt == 2:
Expand Down
24 changes: 10 additions & 14 deletions ecnet/tools/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,12 @@
from os import remove

# ECNet imports
from ecnet.tools.conversions import get_smiles, smiles_to_mdl,\
mdl_to_descriptors
from ecnet.tools.conversions import get_smiles, smiles_to_descriptors


def create_db(input_txt, output_name, id_prefix='', targets=None, form='name',
smiles_file='mols.smi', mdl_file='mols.mdl',
desc_file='descriptors.csv', clean_up=True, fingerprints=False,
extra_strings={}):
smiles_file='mols.smi', desc_file='descriptors.csv',
clean_up=True, fingerprints=False, extra_strings={}):
'''Create an ECNet-formatted database from either molecule names or SMILES
strings
Expand All @@ -32,9 +30,8 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name',
id_prefix (str): prefix for DATAID column entries, if supplied
targets (str): path to file containing target values (optional)
form (str): `name` or `smiles` for selecting input format
smiles_file (str): if input format is not SMILES, this is the name of
a temporary .smi file containing SMILES strings
mdl_file (str): name of MDL file generated by Open Babel
smiles_file (str): name of temporary .smi file containing SMILES
strings
desc_file (str): name of descriptors file generated by PaDEL-Descriptor
clean_up (bool): if True, cleans up all files generated during
processing except for the input text files and output database
Expand Down Expand Up @@ -66,7 +63,9 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name',
len(input_names), len(input_data)
))
del extra_strings['Compound Name']
smiles_file = input_txt
with open(smiles_file, 'w') as smi_file:
for d in input_data:
smi_file.write(d + '\n')

else:
raise ValueError('Unknown `format` argument: {}'.format(form))
Expand All @@ -83,8 +82,7 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name',
else:
target_data = [0 for _ in range(len(input_data))]

smiles_to_mdl(smiles_file, mdl_file)
desc = mdl_to_descriptors(mdl_file, desc_file, fingerprints)
desc = smiles_to_descriptors(smiles_file, desc_file, fingerprints)
desc_keys = list(desc[0].keys())
try:
desc_keys.remove('Name')
Expand Down Expand Up @@ -135,9 +133,7 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name',
wr.writerow(row)

if clean_up:
if form != 'smiles':
remove(smiles_file)
remove(mdl_file)
remove(smiles_file)
remove(desc_file)


Expand Down

0 comments on commit 9deab44

Please sign in to comment.