diff --git a/ecnet/tools/conversions.py b/ecnet/tools/conversions.py
index c231253..0123294 100644
--- a/ecnet/tools/conversions.py
+++ b/ecnet/tools/conversions.py
@@ -40,6 +40,77 @@ def get_smiles(name):
     return [m.isomeric_smiles for m in get_compounds(name, 'name')]
 
 
+def smiles_to_descriptors(smiles_file, descriptors_csv, fingerprints=False):
+    '''Generates QSPR descriptors from supplied SMILES file using
+    PaDEL-Descriptor
+
+    Args:
+        smiles_file (str): path to source SMI file
+        descriptors_csv (str): path to resulting CSV file w/ descriptors
+        fingerprints (bool): if True, generates molecular fingerprints instead
+            of QSPR descriptors
+
+    Returns:
+        list: list of dicts, where each dict is a molecule populated with
+            descriptor names (keys) and values
+
+    Raises:
+        ReferenceError: if `which('java')` does not find a Java executable
+        ValueError: if `smiles_file` does not end with a .smi extension
+        Exception: re-raised unchanged if the third consecutive attempt to
+            run PaDEL-Descriptor fails
+    '''
+
+    if which('java') is None:
+        raise ReferenceError(
+            'Java JRE 6+ not found (required for PaDEL-Descriptor)'
+        )
+
+    is_smi = compile(r'.*\.smi$', IGNORECASE)
+    if is_smi.match(smiles_file) is None:
+        raise ValueError('SMILES file must have a SMI extension: {}'.format(
+            smiles_file
+        ))
+
+    # Fingerprint and descriptor runs share every argument except the flags
+    # selecting what PaDEL-Descriptor computes, so build the command once
+    if fingerprints:
+        output_flags = ['-fingerprints']
+    else:
+        output_flags = ['-2d', '-3d']
+    command = [
+        'java',
+        '-jar',
+        _PADEL_PATH
+    ] + output_flags + [
+        '-retainorder',
+        '-convert3d',
+        '-retain3d',
+        '-dir',
+        smiles_file,
+        '-file',
+        descriptors_csv
+    ]
+
+    # `with` guarantees the devnull handle is closed whether the run
+    # succeeds or every attempt fails
+    with open(devnull, 'w') as dn:
+        for attempt in range(3):
+            try:
+                call(command, stdout=dn, stderr=dn, timeout=15)
+                break
+            except Exception:
+                # Up to two retries; the third failure propagates with its
+                # original traceback
+                if attempt == 2:
+                    raise
+
+    # Each CSV row becomes one dict keyed by the CSV header names
+    with open(descriptors_csv, 'r', encoding='utf-8') as desc_file:
+        reader = DictReader(desc_file)
+        return list(reader)
+
+
 def smiles_to_mdl(smiles_file, mdl_file):
     '''Invoke Open Babel to generate an MDL file containing all supplied
     molecules; requires Open Babel to be installed externally
@@ -71,7 +142,7 @@ def
smiles_to_mdl(smiles_file, mdl_file): '-O', mdl_file, '--gen3D' - ], stdout=dn, stderr=dn, timeout=3600) + ], stdout=dn, stderr=dn, timeout=15) break except Exception as e: if attempt == 2: @@ -115,7 +186,7 @@ def mdl_to_descriptors(mdl_file, descriptors_csv, fingerprints=False): mdl_file, '-file', descriptors_csv - ], stdout=dn, stderr=dn, timeout=600) + ], stdout=dn, stderr=dn, timeout=15) break else: call([ @@ -130,7 +201,7 @@ def mdl_to_descriptors(mdl_file, descriptors_csv, fingerprints=False): mdl_file, '-file', descriptors_csv - ], stdout=dn, stderr=dn, timeout=600) + ], stdout=dn, stderr=dn, timeout=15) break except Exception as e: if attempt == 2: diff --git a/ecnet/tools/database.py b/ecnet/tools/database.py index 3df9271..1168a1c 100644 --- a/ecnet/tools/database.py +++ b/ecnet/tools/database.py @@ -14,14 +14,12 @@ from os import remove # ECNet imports -from ecnet.tools.conversions import get_smiles, smiles_to_mdl,\ - mdl_to_descriptors +from ecnet.tools.conversions import get_smiles, smiles_to_descriptors def create_db(input_txt, output_name, id_prefix='', targets=None, form='name', - smiles_file='mols.smi', mdl_file='mols.mdl', - desc_file='descriptors.csv', clean_up=True, fingerprints=False, - extra_strings={}): + smiles_file='mols.smi', desc_file='descriptors.csv', + clean_up=True, fingerprints=False, extra_strings={}): '''Create an ECNet-formatted database from either molecule names or SMILES strings @@ -32,9 +30,8 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name', id_prefix (str): prefix for DATAID column entries, if supplied targets (str): path to file containing target values (optional) form (str): `name` or `smiles` for selecting input format - smiles_file (str): if input format is not SMILES, this is the name of - a temporary .smi file containing SMILES strings - mdl_file (str): name of MDL file generated by Open Babel + smiles_file (str): name of temporary .smi file containing SMILES + strings desc_file (str): 
name of descriptors file generated by PaDEL-Descriptor clean_up (bool): if True, cleans up all files generated during processing except for the input text files and output database @@ -66,7 +63,9 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name', len(input_names), len(input_data) )) del extra_strings['Compound Name'] - smiles_file = input_txt + with open(smiles_file, 'w') as smi_file: + for d in input_data: + smi_file.write(d + '\n') else: raise ValueError('Unknown `format` argument: {}'.format(form)) @@ -83,8 +82,7 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name', else: target_data = [0 for _ in range(len(input_data))] - smiles_to_mdl(smiles_file, mdl_file) - desc = mdl_to_descriptors(mdl_file, desc_file, fingerprints) + desc = smiles_to_descriptors(smiles_file, desc_file, fingerprints) desc_keys = list(desc[0].keys()) try: desc_keys.remove('Name') @@ -135,9 +133,7 @@ def create_db(input_txt, output_name, id_prefix='', targets=None, form='name', wr.writerow(row) if clean_up: - if form != 'smiles': - remove(smiles_file) - remove(mdl_file) + remove(smiles_file) remove(desc_file)