diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..83d765a --- /dev/null +++ b/.gitignore @@ -0,0 +1,111 @@ +# Prerequisites +*.d + +# Compiled object files +*.slo +*.lo +*.o +*.obj + +# Precompiled headers +*.gch +*.pch + +# Compiled dynamic libraries +*.so +*.so.[0-9]* +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +.installed.cfg +MANIFEST +*.egg-info/ +*.egg +*.manifest +*.spec +pip-log.txt +pip-delete-this-directory.txt +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.pytest_cache/ + +# Documentation +doc/html/ +doc/latex/ +doc/man/ +doc/xml/ +doc/_build/ +doc/source +doc/modules + +# Environments +.env +.venv +env/ +venv/ +ENV/ + +# Editor junk +tags +[._]*.s[a-v][a-z] +[._]*.sw[a-p] +[._]s[a-v][a-z] +[._]sw[a-p] +*~ +\#*\# +.\#* +.ropeproject +.idea/ +.spyderproject +.spyproject +.vscode/ +# Mac .DS_Store +.DS_Store + +# jupyter notebook checkpoints +.ipynb_checkpoints + +# version file generated by rob +padelpy/_version.py + diff --git a/README.md b/README.md index 42f486d..b2b27d2 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,28 @@ fingerprints = from_mdl('mols.mdl', fingerprints=True, descriptors=False) _ = from_mdl('mols.mdl', output_csv='descriptors.csv') ``` +### SDF to Descriptors/Fingerprints + +The "from_sdf" function accepts a filepath as an argument, and returns a list. +Each list element is a dictionary with descriptors/fingerprints corresponding to each supplied +molecule (indexed as they appear in the SDF file). 
+ +```python +from padelpy import from_sdf + +# calculate molecular descriptors for molecules in `mols.sdf` +descriptors = from_sdf('mols.sdf') + +# in addition to descriptors, calculate PubChem fingerprints +desc_fp = from_sdf('mols.sdf', fingerprints=True) + +# only calculate fingerprints +fingerprints = from_sdf('mols.sdf', fingerprints=True, descriptors=False) + +# save descriptors to a CSV file +_ = from_sdf('mols.sdf', output_csv='descriptors.csv') +``` + ### Command Line Wrapper Alternatively, you can have more control over PaDEL-Descriptor with the command-line wrapper function. Any combination of arguments supported by PaDEL-Descriptor can be accepted by the "padeldescriptor" function. @@ -84,9 +106,12 @@ from padelpy import padeldescriptor # to supply a configuration file padeldescriptor(config='\\path\\to\\config') -# to supply an input and output file +# to supply an input (MDL) and output file padeldescriptor(mol_dir='molecules.mdl', d_file='descriptors.csv') +# to supply an input (SDF) and output file +padeldescriptor(mol_dir='molecules.sdf', d_file='descriptors.csv') + # a SMILES file can be supplied padeldescriptor(mol_dir='molecules.smi', d_file='descriptors.csv') diff --git a/padelpy/__init__.py b/padelpy/__init__.py index 79627f6..011cc1a 100644 --- a/padelpy/__init__.py +++ b/padelpy/__init__.py @@ -1,3 +1,3 @@ -from padelpy.wrapper import padeldescriptor -from padelpy.functions import from_mdl, from_smiles +from .wrapper import padeldescriptor +from .functions import from_mdl, from_smiles, from_sdf __version__ = '0.1.10' diff --git a/padelpy/functions.py b/padelpy/functions.py index d4ea3ee..f4675c8 100644 --- a/padelpy/functions.py +++ b/padelpy/functions.py @@ -8,6 +8,7 @@ # Contains various functions commonly used with PaDEL-Descriptor # +import warnings # stdlib. 
imports from collections import OrderedDict from csv import DictReader @@ -15,15 +16,20 @@ from os import remove from re import compile, IGNORECASE from time import sleep -import warnings # PaDELPy imports from padelpy import padeldescriptor +__all__ = [ + "from_mdl", + "from_smiles", + "from_sdf", +] + def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, fingerprints: bool = False, timeout: int = 60) -> OrderedDict: - ''' from_smiles: converts SMILES string to QSPR descriptors/fingerprints + """ from_smiles: converts SMILES string to QSPR descriptors/fingerprints Args: smiles (str, list): SMILES string for a given molecule, or a list of @@ -38,17 +44,17 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, list of OrderedDicts, else single OrderedDict; each OrderedDict contains labels and values for each descriptor generated for each supplied molecule - ''' + """ - timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3] + timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3] - with open('{}.smi'.format(timestamp), 'w') as smi_file: + with open("{}.smi".format(timestamp), "w") as smi_file: if type(smiles) == str: smi_file.write(smiles) elif type(smiles) == list: - smi_file.write('\n'.join(smiles)) + smi_file.write("\n".join(smiles)) else: - raise RuntimeError('Unknown input format for `smiles`: {}'.format( + raise RuntimeError("Unknown input format for `smiles`: {}".format( type(smiles) )) smi_file.close() @@ -56,12 +62,12 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, save_csv = True if output_csv is None: save_csv = False - output_csv = '{}.csv'.format(timestamp) + output_csv = "{}.csv".format(timestamp) for attempt in range(3): try: padeldescriptor( - mol_dir='{}.smi'.format(timestamp), + mol_dir="{}.smi".format(timestamp), d_file=output_csv, convert3d=True, retain3d=True, @@ -74,7 +80,7 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, break except 
RuntimeError as exception: if attempt == 2: - remove('{}.smi'.format(timestamp)) + remove("{}.smi".format(timestamp)) if not save_csv: sleep(0.5) try: @@ -85,33 +91,33 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, else: continue - with open(output_csv, 'r', encoding='utf-8') as desc_file: + with open(output_csv, "r", encoding="utf-8") as desc_file: reader = DictReader(desc_file) rows = [row for row in reader] desc_file.close() - remove('{}.smi'.format(timestamp)) + remove("{}.smi".format(timestamp)) if not save_csv: remove(output_csv) if type(smiles) == list and len(rows) != len(smiles): - raise RuntimeError('PaDEL-Descriptor failed on one or more mols.' + - ' Ensure the input structures are correct.') + raise RuntimeError("PaDEL-Descriptor failed on one or more mols." + + " Ensure the input structures are correct.") elif type(smiles) == str and len(rows) == 0: raise RuntimeError( - 'PaDEL-Descriptor failed on {}.'.format(smiles) + - ' Ensure input structure is correct.' + "PaDEL-Descriptor failed on {}.".format(smiles) + + " Ensure input structure is correct." ) for idx, r in enumerate(rows): if len(r) == 0: raise RuntimeError( - 'PaDEL-Descriptor failed on {}.'.format(smiles[idx]) + - ' Ensure input structure is correct.' + "PaDEL-Descriptor failed on {}.".format(smiles[idx]) + + " Ensure input structure is correct." 
) for idx in range(len(rows)): - del rows[idx]['Name'] + del rows[idx]["Name"] if type(smiles) == str: return rows[0] @@ -120,7 +126,7 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True, fingerprints: bool = False, timeout: int = 60) -> list: - ''' from_mdl: converts MDL file into QSPR descriptors/fingerprints; + """ from_mdl: converts MDL file into QSPR descriptors/fingerprints; multiple molecules may be represented in the MDL file Args: @@ -133,25 +139,69 @@ def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True, Returns: list: list of dicts, where each dict corresponds sequentially to a compound in the supplied MDL file - ''' + """ - is_mdl = compile(r'.*\.mdl$', IGNORECASE) + is_mdl = compile(r".*\.mdl$", IGNORECASE) if is_mdl.match(mdl_file) is None: - raise ValueError('MDL file must have a `.mdl` extension: {}'.format( + raise ValueError("MDL file must have a `.mdl` extension: {}".format( mdl_file )) + rows = _from_mdl_lower(mol_file=mdl_file, + output_csv=output_csv, + descriptors=descriptors, + fingerprints=fingerprints, + timeout=timeout) + return rows + + +def from_sdf(sdf_file: str, + output_csv: str = None, + descriptors: bool = True, + fingerprints: bool = False, + timeout: int = 60) -> list: + """ Converts sdf file into QSPR descriptors/fingerprints. 
+ Multiple molecules may be represented in the sdf file + + Args: + sdf_file (str): path to sdf file + output_csv (str): if supplied, saves descriptors/fingerprints here + descriptors (bool): if `True`, calculates descriptors + fingerprints (bool): if `True`, calculates fingerprints + timeout (int): maximum time, in seconds, for conversion + + Returns: + list: list of dicts, where each dict corresponds sequentially to a compound in the + supplied sdf file + """ + + is_sdf = compile(r".*\.sdf$", IGNORECASE) + if is_sdf.match(sdf_file) is None: + raise ValueError("sdf file must have a `.sdf` extension: {}".format( + sdf_file + )) + + rows = _from_mdl_lower(mol_file=sdf_file, + output_csv=output_csv, + descriptors=descriptors, + fingerprints=fingerprints, + timeout=timeout) + return rows + + +def _from_mdl_lower(mol_file: str, output_csv: str = None, descriptors: bool = True, + fingerprints: bool = False, timeout: int = 60) -> list: save_csv = True if output_csv is None: save_csv = False - output_csv = '{}.csv'.format( - datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3] + output_csv = "{}.csv".format( + datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3] ) for attempt in range(3): try: padeldescriptor( - mol_dir=mdl_file, + mol_dir=mol_file, d_file=output_csv, convert3d=True, retain3d=True, @@ -174,15 +224,16 @@ def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True, else: continue - with open(output_csv, 'r', encoding='utf-8') as desc_file: + with open(output_csv, "r", encoding="utf-8") as desc_file: reader = DictReader(desc_file) rows = [row for row in reader] desc_file.close() if not save_csv: remove(output_csv) if len(rows) == 0: - raise RuntimeError('PaDEL-Descriptor returned no calculated values.' + - ' Ensure the input structure is correct.') + raise RuntimeError("PaDEL-Descriptor returned no calculated values." 
+ " Ensure the input structure is correct.") for row in rows: - del row['Name'] + del row["Name"] + return rows diff --git a/padelpy/version.py b/padelpy/version.py new file mode 100644 index 0000000..5ed587d --- /dev/null +++ b/padelpy/version.py @@ -0,0 +1,5 @@ + +VERSION = (0, 1, 11, "") + +__version__ = ".".join(map(str, VERSION[:-1])) +__release__ = ".".join(map(str, VERSION)) diff --git a/padelpy/wrapper.py b/padelpy/wrapper.py index 8dbb263..65e18ba 100644 --- a/padelpy/wrapper.py +++ b/padelpy/wrapper.py @@ -21,6 +21,10 @@ 'PaDEL-Descriptor.jar' ) +__all__ = [ + "padeldescriptor", +] + def _popen_timeout(command: str, timeout: int) -> tuple: ''' Calls PaDEL-Descriptor, with optional subprocess timeout @@ -154,4 +158,4 @@ def padeldescriptor(maxruntime: int = -1, waitingjobs: int = -1, err.decode('utf-8') )) return - \ No newline at end of file + diff --git a/setup.py b/setup.py index d906720..a9f7cd1 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,31 @@ +import os from setuptools import setup + +def get_readme(): + """Load README.md for display on PyPI.""" + with open('README.md') as fhandle: + return fhandle.read() + + +def get_version_info(): + """Read __version__ from version.py, using exec, not import.""" + fn_version = os.path.join("padelpy", "version.py") + myglobals = {} + with open(fn_version, "r") as f: + # pylint: disable=exec-used + exec(f.read(), myglobals) + return myglobals["__version__"] + + +VERSION = get_version_info() + setup( name='padelpy', - version='0.1.10', + version=VERSION, description='A Python wrapper for PaDEL-Descriptor', + long_description=get_readme(), + long_description_content_type='text/markdown', url='https://github.com/ecrl/padelpy', author='Travis Kessler', author_email='Travis_Kessler@student.uml.edu', diff --git a/tests/aspirin_3d.sdf b/tests/aspirin_3d.sdf new file mode 100644 index 0000000..9442fd1 --- /dev/null +++ b/tests/aspirin_3d.sdf @@ -0,0 +1,51 @@ +2244 + -OEChem-12012120113D + + 21 21 0 0 0 0 0 0 0999
V2000 + 1.2333 0.5540 0.7792 O 0 0 0 0 0 0 0 0 0 0 0 0 + -0.6952 -2.7148 -0.7502 O 0 0 0 0 0 0 0 0 0 0 0 0 + 0.7958 -2.1843 0.8685 O 0 0 0 0 0 0 0 0 0 0 0 0 + 1.7813 0.8105 -1.4821 O 0 0 0 0 0 0 0 0 0 0 0 0 + -0.0857 0.6088 0.4403 C 0 0 0 0 0 0 0 0 0 0 0 0 + -0.7927 -0.5515 0.1244 C 0 0 0 0 0 0 0 0 0 0 0 0 + -0.7288 1.8464 0.4133 C 0 0 0 0 0 0 0 0 0 0 0 0 + -2.1426 -0.4741 -0.2184 C 0 0 0 0 0 0 0 0 0 0 0 0 + -2.0787 1.9238 0.0706 C 0 0 0 0 0 0 0 0 0 0 0 0 + -2.7855 0.7636 -0.2453 C 0 0 0 0 0 0 0 0 0 0 0 0 + -0.1409 -1.8536 0.1477 C 0 0 0 0 0 0 0 0 0 0 0 0 + 2.1094 0.6715 -0.3113 C 0 0 0 0 0 0 0 0 0 0 0 0 + 3.5305 0.5996 0.1635 C 0 0 0 0 0 0 0 0 0 0 0 0 + -0.1851 2.7545 0.6593 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.7247 -1.3605 -0.4564 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.5797 2.8872 0.0506 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.8374 0.8238 -0.5090 H 0 0 0 0 0 0 0 0 0 0 0 0 + 3.7290 1.4184 0.8593 H 0 0 0 0 0 0 0 0 0 0 0 0 + 4.2045 0.6969 -0.6924 H 0 0 0 0 0 0 0 0 0 0 0 0 + 3.7105 -0.3659 0.6426 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.2555 -3.5916 -0.7337 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1 5 1 0 0 0 0 + 1 12 1 0 0 0 0 + 2 11 1 0 0 0 0 + 2 21 1 0 0 0 0 + 3 11 2 0 0 0 0 + 4 12 2 0 0 0 0 + 5 6 1 0 0 0 0 + 5 7 2 0 0 0 0 + 6 8 2 0 0 0 0 + 6 11 1 0 0 0 0 + 7 9 1 0 0 0 0 + 7 14 1 0 0 0 0 + 8 10 1 0 0 0 0 + 8 15 1 0 0 0 0 + 9 10 2 0 0 0 0 + 9 16 1 0 0 0 0 + 10 17 1 0 0 0 0 + 12 13 1 0 0 0 0 + 13 18 1 0 0 0 0 + 13 19 1 0 0 0 0 + 13 20 1 0 0 0 0 +M END +> +2244 + +$$$$ diff --git a/tests/test.py b/tests/test.py index 357ec65..5dd7a74 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,33 +1,37 @@ import unittest from collections import OrderedDict -from padelpy import from_smiles, from_mdl +from padelpy import from_mdl, from_sdf, from_smiles class TestAll(unittest.TestCase): def test_from_smiles(self): - descriptors = from_smiles('CCC') self.assertEqual(len(descriptors), 1875) self.assertAlmostEqual(float(descriptors['MW']), 44.0626, 4) self.assertEqual(int(descriptors['nC']), 3) def test_multiple_smiles(self): 
- smiles = ['CCC', 'CCCC'] descriptors = from_smiles(smiles) self.assertEqual(len(descriptors), 2) self.assertEqual(len(descriptors[0]), 1875) def test_errors(self): - bad_smiles = 'SJLDFGSJ' self.assertRaises(RuntimeError, from_smiles, bad_smiles) bad_smiles = ['SJLDFGSJ', 'CCC'] self.assertRaises(RuntimeError, from_smiles, bad_smiles) + def test_from_sdf(self): + """Test SDF file input functionality.""" + descriptors = from_sdf("aspirin_3d.sdf")[0] + self.assertEqual(len(descriptors), 1875) + self.assertAlmostEqual(float(descriptors['MW']), 180.04225, 4) + self.assertAlmostEqual(float(descriptors['SsCH3']), 1.2209, 4) + self.assertEqual(int(descriptors['nC']), 9) -if __name__ == '__main__': +if __name__ == '__main__': unittest.main()