Skip to content

Commit

Permalink
Merge pull request #32 from fwmeng88/master
Browse files Browse the repository at this point in the history
Version 0.1.11

Add support of SDF file
  • Loading branch information
tjkessler authored Dec 5, 2021
2 parents 9c6b49e + 723cf5d commit 901772d
Show file tree
Hide file tree
Showing 9 changed files with 313 additions and 40 deletions.
111 changes: 111 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Prerequisites
*.d

# Compiled object files
*.slo
*.lo
*.o
*.obj

# Precompiled headers
*.gch
*.pch

# Compiled dynamic libraries
*.so
*.so.[0-9]*
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
.installed.cfg
MANIFEST
*.egg-info/
*.egg
*.manifest
*.spec
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.pytest_cache/

# Documentation
doc/html/
doc/latex/
doc/man/
doc/xml/
doc/_build/
doc/source
doc/modules

# Environments
.env
.venv
env/
venv/
ENV/

# Editor junk
tags
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-v][a-z]
[._]sw[a-p]
*~
\#*\#
.\#*
.ropeproject
.idea/
.spyderproject
.spyproject
.vscode/
# Mac .DS_Store
.DS_Store

# jupyter notebook checkpoints
.ipynb_checkpoints

# version file generated by rob
padelpy/_version.py

27 changes: 26 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,28 @@ fingerprints = from_mdl('mols.mdl', fingerprints=True, descriptors=False)
_ = from_mdl('mols.mdl', output_csv='descriptors.csv')
```

### SDF to Descriptors/Fingerprints

The "from_sdf" function accepts the path to an SDF file (the filename must end in `.sdf`) and returns a list.
Each list element is a dictionary of descriptors/fingerprints for one molecule, in the same
order as the molecules appear in the SDF file.

```python
from padelpy import from_sdf

# calculate molecular descriptors for molecules in `mols.sdf`
descriptors = from_sdf('mols.sdf')

# in addition to descriptors, calculate PubChem fingerprints
desc_fp = from_sdf('mols.sdf', fingerprints=True)

# only calculate fingerprints
fingerprints = from_sdf('mols.sdf', fingerprints=True, descriptors=False)

# save descriptors to a CSV file
_ = from_sdf('mols.sdf', output_csv='descriptors.csv')
```

### Command Line Wrapper

Alternatively, you can have more control over PaDEL-Descriptor with the command-line wrapper function. Any combination of arguments supported by PaDEL-Descriptor can be accepted by the "padeldescriptor" function.
Expand All @@ -84,9 +106,12 @@ from padelpy import padeldescriptor
# to supply a configuration file
padeldescriptor(config='\\path\\to\\config')

# to supply an input and output file
# to supply an input (MDL) and output file
padeldescriptor(mol_dir='molecules.mdl', d_file='descriptors.csv')

# to supply an input (SDF) and output file
padeldescriptor(mol_dir='molecules.sdf', d_file='descriptors.csv')

# a SMILES file can be supplied
padeldescriptor(mol_dir='molecules.smi', d_file='descriptors.csv')

Expand Down
4 changes: 2 additions & 2 deletions padelpy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from padelpy.wrapper import padeldescriptor
from padelpy.functions import from_mdl, from_smiles
from .wrapper import padeldescriptor
from .functions import from_mdl, from_smiles, from_sdf
# Package version; keep in sync with VERSION in padelpy/version.py.
__version__ = '0.1.11'
111 changes: 81 additions & 30 deletions padelpy/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,28 @@
# Contains various functions commonly used with PaDEL-Descriptor
#

import warnings
# stdlib. imports
from collections import OrderedDict
from csv import DictReader
from datetime import datetime
from os import remove
from re import compile, IGNORECASE
from time import sleep
import warnings

# PaDELPy imports
from padelpy import padeldescriptor

__all__ = [
"from_mdl",
"from_smiles",
"from_sdf",
]


def from_smiles(smiles, output_csv: str = None, descriptors: bool = True,
fingerprints: bool = False, timeout: int = 60) -> OrderedDict:
''' from_smiles: converts SMILES string to QSPR descriptors/fingerprints
""" from_smiles: converts SMILES string to QSPR descriptors/fingerprints
Args:
smiles (str, list): SMILES string for a given molecule, or a list of
Expand All @@ -38,30 +44,30 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True,
list of OrderedDicts, else single OrderedDict; each OrderedDict
contains labels and values for each descriptor generated for each
supplied molecule
'''
"""

timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3]

with open('{}.smi'.format(timestamp), 'w') as smi_file:
with open("{}.smi".format(timestamp), "w") as smi_file:
if type(smiles) == str:
smi_file.write(smiles)
elif type(smiles) == list:
smi_file.write('\n'.join(smiles))
smi_file.write("\n".join(smiles))
else:
raise RuntimeError('Unknown input format for `smiles`: {}'.format(
raise RuntimeError("Unknown input format for `smiles`: {}".format(
type(smiles)
))
smi_file.close()

save_csv = True
if output_csv is None:
save_csv = False
output_csv = '{}.csv'.format(timestamp)
output_csv = "{}.csv".format(timestamp)

for attempt in range(3):
try:
padeldescriptor(
mol_dir='{}.smi'.format(timestamp),
mol_dir="{}.smi".format(timestamp),
d_file=output_csv,
convert3d=True,
retain3d=True,
Expand All @@ -74,7 +80,7 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True,
break
except RuntimeError as exception:
if attempt == 2:
remove('{}.smi'.format(timestamp))
remove("{}.smi".format(timestamp))
if not save_csv:
sleep(0.5)
try:
Expand All @@ -85,33 +91,33 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True,
else:
continue

with open(output_csv, 'r', encoding='utf-8') as desc_file:
with open(output_csv, "r", encoding="utf-8") as desc_file:
reader = DictReader(desc_file)
rows = [row for row in reader]
desc_file.close()

remove('{}.smi'.format(timestamp))
remove("{}.smi".format(timestamp))
if not save_csv:
remove(output_csv)

if type(smiles) == list and len(rows) != len(smiles):
raise RuntimeError('PaDEL-Descriptor failed on one or more mols.' +
' Ensure the input structures are correct.')
raise RuntimeError("PaDEL-Descriptor failed on one or more mols." +
" Ensure the input structures are correct.")
elif type(smiles) == str and len(rows) == 0:
raise RuntimeError(
'PaDEL-Descriptor failed on {}.'.format(smiles) +
' Ensure input structure is correct.'
"PaDEL-Descriptor failed on {}.".format(smiles) +
" Ensure input structure is correct."
)

for idx, r in enumerate(rows):
if len(r) == 0:
raise RuntimeError(
'PaDEL-Descriptor failed on {}.'.format(smiles[idx]) +
' Ensure input structure is correct.'
"PaDEL-Descriptor failed on {}.".format(smiles[idx]) +
" Ensure input structure is correct."
)

for idx in range(len(rows)):
del rows[idx]['Name']
del rows[idx]["Name"]

if type(smiles) == str:
return rows[0]
Expand All @@ -120,7 +126,7 @@ def from_smiles(smiles, output_csv: str = None, descriptors: bool = True,

def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True,
fingerprints: bool = False, timeout: int = 60) -> list:
''' from_mdl: converts MDL file into QSPR descriptors/fingerprints;
""" from_mdl: converts MDL file into QSPR descriptors/fingerprints;
multiple molecules may be represented in the MDL file
Args:
Expand All @@ -133,25 +139,69 @@ def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True,
Returns:
list: list of dicts, where each dict corresponds sequentially to a
compound in the supplied MDL file
'''
"""

is_mdl = compile(r'.*\.mdl$', IGNORECASE)
is_mdl = compile(r".*\.mdl$", IGNORECASE)
if is_mdl.match(mdl_file) is None:
raise ValueError('MDL file must have a `.mdl` extension: {}'.format(
raise ValueError("MDL file must have a `.mdl` extension: {}".format(
mdl_file
))

rows = _from_mdl_lower(mol_file=mdl_file,
output_csv=output_csv,
descriptors=descriptors,
fingerprints=fingerprints,
timeout=timeout)
return rows


def from_sdf(sdf_file: str,
             output_csv: str = None,
             descriptors: bool = True,
             fingerprints: bool = False,
             timeout: int = 60) -> list:
    """ from_sdf: converts SDF file into QSPR descriptors/fingerprints;
    multiple molecules may be represented in the SDF file

    Args:
        sdf_file (str or os.PathLike): path to SDF file; the name must end
            in `.sdf` (case-insensitive)
        output_csv (str): if supplied, saves descriptors/fingerprints here
        descriptors (bool): if `True`, calculates descriptors
        fingerprints (bool): if `True`, calculates fingerprints
        timeout (int): maximum time, in seconds, for conversion

    Returns:
        list: list of dicts, where each dict corresponds sequentially to a
            compound in the supplied SDF file

    Raises:
        ValueError: if `sdf_file` does not have a `.sdf` extension
    """

    # Function-scope import so the block does not depend on a change to the
    # module-level import list.
    from os import fspath

    # Normalise path-like objects (e.g. `pathlib.Path`) to a plain string so
    # that the extension check below validates them instead of failing with
    # a TypeError inside the regex engine. A `str` passes through unchanged.
    sdf_path = fspath(sdf_file)

    is_sdf = compile(r".*\.sdf$", IGNORECASE)
    if is_sdf.match(sdf_path) is None:
        raise ValueError("sdf file must have a `.sdf` extension: {}".format(
            sdf_path
        ))

    # The descriptor calculation itself is shared with `from_mdl`.
    rows = _from_mdl_lower(mol_file=sdf_path,
                           output_csv=output_csv,
                           descriptors=descriptors,
                           fingerprints=fingerprints,
                           timeout=timeout)
    return rows


def _from_mdl_lower(mol_file: str, output_csv: str = None, descriptors: bool = True,
fingerprints: bool = False, timeout: int = 60) -> list:
save_csv = True
if output_csv is None:
save_csv = False
output_csv = '{}.csv'.format(
datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
output_csv = "{}.csv".format(
datetime.now().strftime("%Y%m%d%H%M%S%f")[:-3]
)

for attempt in range(3):
try:
padeldescriptor(
mol_dir=mdl_file,
mol_dir=mol_file,
d_file=output_csv,
convert3d=True,
retain3d=True,
Expand All @@ -174,15 +224,16 @@ def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True,
else:
continue

with open(output_csv, 'r', encoding='utf-8') as desc_file:
with open(output_csv, "r", encoding="utf-8") as desc_file:
reader = DictReader(desc_file)
rows = [row for row in reader]
desc_file.close()
if not save_csv:
remove(output_csv)
if len(rows) == 0:
raise RuntimeError('PaDEL-Descriptor returned no calculated values.' +
' Ensure the input structure is correct.')
raise RuntimeError("PaDEL-Descriptor returned no calculated values." +
" Ensure the input structure is correct.")
for row in rows:
del row['Name']
del row["Name"]

return rows
5 changes: 5 additions & 0 deletions padelpy/version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

# Version components: (major, minor, patch, release suffix).
# The suffix is an empty string when the release has no suffix.
VERSION = (0, 1, 11, "")

# Short "major.minor.patch" version string.
__version__ = ".".join(map(str, VERSION[:-1]))
# Full release string. Empty components are skipped so that a release with no
# suffix does not end in a stray "." (it previously evaluated to "0.1.11.").
__release__ = ".".join(str(part) for part in VERSION if str(part) != "")
Loading

0 comments on commit 901772d

Please sign in to comment.