diff --git a/docs/usage/tools.md b/docs/usage/tools.md index 07df55b..f36baf2 100644 --- a/docs/usage/tools.md +++ b/docs/usage/tools.md @@ -6,9 +6,7 @@ ECNet databases are comma-separated value (CSV) formatted files that provide inf Our [databases](https://github.com/ECRL/ECNet/tree/master/databases) directory on GitHub contains databases for cetane number, cloud point, kinetic viscosity, pour point and yield sooting index, as well as a database template. -You can create an ECNet-formatted database with molecule names or SMILES and (optionally) target values. The following programs must be installed for you to do so: -- [Open Babel](http://openbabel.org/wiki/Main_Page) software -- [Java JRE](https://www.oracle.com/technetwork/java/javase/downloads/jre8-downloads-2133155.html) version 6 and above +You can create an ECNet-formatted database with molecule names or SMILES and (optionally) target values. [Java JRE](https://www.oracle.com/technetwork/java/javase/downloads/jre8-downloads-2133155.html) version 6 and above is required to create a database. Supplied names or SMILES must exist in a text file, one entry per line: ``` @@ -70,6 +68,8 @@ smiles = get_smiles('Molecule Name') ### SMILES string to MDL Molfile +Creating an MDL file requires [Open Babel](http://openbabel.org/wiki/Main_Page) to be installed. + A text file containing SMILES strings, one per line, is required: ``` @@ -100,6 +100,19 @@ from ecnet.tools.conversions import mdl_to_descriptors mdl_to_descriptors('molfile.md', 'descriptors.csv') ``` +### SMILES string to QSPR descriptors + +A text file containing SMILES strings, one per line, is required, as well as an installation of [Java JRE](https://www.oracle.com/technetwork/java/javase/downloads/jre8-downloads-2133155.html). 
def smiles_to_descriptors(smiles_file, descriptors_csv, fingerprints=False):
    '''Generates QSPR descriptors from supplied SMILES file using
    PaDEL-Descriptor

    Args:
        smiles_file (str): path to source SMI file
        descriptors_csv (str): path to resulting CSV file w/ descriptors
        fingerprints (bool): if True, generates molecular fingerprints instead
            of QSPR descriptors

    Returns:
        list: list of dicts, where each dict is a molecule populated with
            descriptor names (keys) and values

    Raises:
        ReferenceError: if no `java` executable is found on the PATH
        ValueError: if `smiles_file` does not end with a ".smi" extension
            (case-insensitive)
        Exception: whatever the third (final) PaDEL-Descriptor attempt
            raised, e.g. a timeout from the subprocess call
    '''

    if which('java') is None:
        raise ReferenceError(
            'Java JRE 6+ not found (required for PaDEL-Descriptor)'
        )

    is_smi = compile(r'.*\.smi$', IGNORECASE)
    if is_smi.match(smiles_file) is None:
        raise ValueError('SMILES file must have a SMI extension: {}'.format(
            smiles_file
        ))

    # The two modes differ only in which descriptor family is requested, so
    # build the command line once instead of duplicating the whole call
    if fingerprints:
        mode_flags = ['-fingerprints']
    else:
        mode_flags = ['-2d', '-3d']
    command = ['java', '-jar', _PADEL_PATH] + mode_flags + [
        '-retainorder',
        '-convert3d',
        '-retain3d',
        '-dir',
        smiles_file,
        '-file',
        descriptors_csv
    ]

    max_attempts = 3
    # Context manager guarantees the devnull handle is closed on success and
    # on every error path (it was previously left open)
    with open(devnull, 'w') as dn:
        for attempt in range(max_attempts):
            try:
                # `call` returns the exit status rather than raising on a
                # non-zero code, so only exceptions (such as the 15 second
                # timeout) trigger another attempt
                call(command, stdout=dn, stderr=dn, timeout=15)
                break
            except Exception:
                if attempt == max_attempts - 1:
                    # Out of retries: re-raise with the original traceback
                    raise

    with open(descriptors_csv, 'r', encoding='utf-8') as desc_file:
        return list(DictReader(desc_file))
output_name, id_prefix='', targets=None, form='name', id_prefix (str): prefix for DATAID column entries, if supplied targets (str): path to file containing target values (optional) form (str): `name` or `smiles` for selecting input format - smiles_file (str): if input format is not SMILES, this is the name of - a temporary .smi file containing SMILES strings - mdl_file (str): name of MDL file generated by Open Babel + smiles_file (str): name of temporary .smi file containing SMILES + strings desc_file (str): name of descriptors file generated by PaDEL-Descriptor clean_up (bool): if True, cleans up all files generated during processing except for the input text files and output database @@ -66,7 +63,9 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name', len(input_names), len(input_data) )) del extra_strings['Compound Name'] - smiles_file = input_txt + with open(smiles_file, 'w') as smi_file: + for d in input_data: + smi_file.write(d + '\n') else: raise ValueError('Unknown `format` argument: {}'.format(form)) @@ -83,8 +82,7 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name', else: target_data = [0 for _ in range(len(input_data))] - smiles_to_mdl(smiles_file, mdl_file) - desc = mdl_to_descriptors(mdl_file, desc_file, fingerprints) + desc = smiles_to_descriptors(smiles_file, desc_file, fingerprints) desc_keys = list(desc[0].keys()) try: desc_keys.remove('Name') @@ -135,9 +133,7 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name', wr.writerow(row) if clean_up: - if form != 'smiles': - remove(smiles_file) - remove(mdl_file) + remove(smiles_file) remove(desc_file)