
Merge branch 'dev'
martin-sicho committed Feb 20, 2023
2 parents 447ae63 + 79da1a7 commit 53d44e7
Showing 100 changed files with 45,665 additions and 16,383 deletions.
9 changes: 6 additions & 3 deletions .gitignore
@@ -4,10 +4,13 @@
!.gitignore
/build/
/base_test/
/tutorial/jupyter
/tutorial/data
/tutorial/CLI
/tutorial/advanced/data
/tutorial/download.json
/testing/clitest/data/*.txt
/testing/clitest/data/*.vocab
/testing/clitest/data/backup*
/testing/clitest/data/dataset.json
/testing/clitest/generators/
/docs/_build
*.pkg
*.tgz
57 changes: 40 additions & 17 deletions CHANGELOG.md
@@ -1,5 +1,5 @@
# Change Log
From v3.2.0 to v3.3.0
From v3.3.0 to v3.4.0

## Fixes

@@ -8,22 +8,45 @@ None.

## Changes

- Improve scaffold-based encoding. New `dummyMolsFromFragments` to create dummy molecules from a set of fragments, to be called as the `fragmenter` in `FragmentCorpusEncoder`. This makes the `ScaffoldSequenceCorpus`, `ScaffoldGraphCorpus`, `SmilesScaffoldDataSet` and `GraphScaffoldDataSet` classes obsolete.
- The early stopping criterion of reinforcement learning is changed back to the ratio of desired molecules.
- Renamed `GraphModel.sampleFromSmiles` to `GraphModel.sample_smiles`:
  - argument `min_samples` was renamed to `num_samples`,
  - exactly `num_samples` molecules are returned,
  - arguments `drop_duplicates` and `drop_invalid` were added,
  - argument `keep_frags` was added.
- The `sample_smiles` method was added to the sequence transformer `GPT2Model` and to the `RNN` classes.
- Changed the `GPT2Model` adaptive learning rate settings to resolve pretraining issues.
- Progress bars were added for model fitting (pretraining, fine-tuning and reinforcement learning).
- Tokens `_` and `.`, previously always present in `VocSmiles`, have been removed.
- RNN models deposited on Zenodo and pretrained on ChEMBL31 and Papyrus 05.5 were updated; the RNN model pretrained on ChEMBL27 did not need an update.
- Moved encoding of tokens for SMILES-based models to the parallel preprocessing steps to improve performance.
- All testing code that is not unit tests was moved to `testing`.
Major refactoring of `drugex.training`

- Moving generators from `drugex.training.models` to `drugex.training.generators`, and harmonizing and renaming them (see the import sketch after these lists):
  - `RNN` -> `SequenceRNN`
  - `GPT2Model` -> `SequenceTransformer`
  - `GraphModel` -> `GraphTransformer`

- Moving explorers from `drugex.training.models` to `drugex.training.explorers`, and harmonizing and renaming them:
  - `SmilesExplorerNoFrag` -> `SequenceExplorer`
  - `SmilesExplorer` -> `FragSequenceExplorer`
  - `GraphExplorer` -> `FragGraphExplorer`
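
For orientation, a minimal import sketch of the new layout, derived directly from the mapping above (constructor arguments are omitted):

```python
# Minimal sketch of the renamed import paths, derived from the mapping above.
from drugex.training.generators import (
    SequenceRNN,          # formerly drugex.training.models.RNN
    SequenceTransformer,  # formerly drugex.training.models.GPT2Model
    GraphTransformer,     # formerly drugex.training.models.GraphModel
)
from drugex.training.explorers import (
    SequenceExplorer,      # formerly SmilesExplorerNoFrag
    FragSequenceExplorer,  # formerly SmilesExplorer
    FragGraphExplorer,     # formerly GraphExplorer
)
```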

- Removal of all obsolete modules related to the two discontinued fragment-based LSTM models from [DrugEx v3](https://doi.org/10.26434/chemrxiv-2021-px6kz).

- The generators' `sample_smiles()` has been replaced by a `generate()` function (sketched below)
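
A hedged sketch of the replacement call; the constructor and the argument names below are assumptions carried over from the former `sample_smiles()` (see the renamed arguments in the v3.3.0 notes above), not a confirmed signature:

```python
# Hypothetical usage; the constructor and argument names are assumptions
# carried over from the former sample_smiles() and may differ in the API.
from drugex.training.generators import SequenceRNN

generator = SequenceRNN(voc=my_vocabulary)  # assumed constructor; vocabulary defined elsewhere
smiles = generator.generate(
    num_samples=100,       # assumed: exactly this many molecules are returned
    drop_invalid=True,     # assumed: drop molecules RDKit cannot parse
    drop_duplicates=True,  # assumed: drop repeated SMILES
)
```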

- Clarification of the terms qualifying the generated molecules; they now have the following unique and constant definitions (replacing the ambiguous `VALID` and `DESIRE` terms):
  - `Valid` : molecule can be parsed with RDKit
  - `Accurate` : molecule contains the given input fragments
  - `Desired` : molecule fulfils all given objectives


- Revise implementation of the Tanimoto distance-based Pareto ranking scheme (`SimilarityRanking`) to correspond to the method described in [DrugEx v2](https://doi.org/10.1186/s13321-021-00561-9). Add an option to use the minimum Tanimoto distance between molecules in a front instead of the mean distance (illustrated below).
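
To make the metric concrete, here is a self-contained RDKit sketch of ranking molecules in a front by mean (or minimum) Tanimoto distance; it illustrates the criterion only and is not the `SimilarityRanking` implementation:

```python
# Illustration of the ranking criterion only, not DrugEx's SimilarityRanking.
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

front = [Chem.MolFromSmiles(s) for s in ('CCO', 'CCN', 'c1ccccc1O')]
fps = [AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048) for m in front]

def distance_score(i, use_min=False):
    """Tanimoto distance of molecule i to the other members of the front."""
    dists = [1.0 - DataStructs.TanimotoSimilarity(fps[i], fp)
             for j, fp in enumerate(fps) if j != i]
    return min(dists) if use_min else sum(dists) / len(dists)

# Ascending sort: the least diverse (most crowded) molecules come first.
order = sorted(range(len(front)), key=distance_score)
```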

- Remove all references to NN-based RAscore (already discontinued)

Refactoring of CLI

- Refactoring `dataset.py` and `train.py` to an object-based design
- Writing a single `.txt.vocab` file per dataset during preprocessing instead of separate (duplicate) files for each subset in `dataset.py`

## Removed

- `--save_voc` argument in `dataset.py` (it was redundant)
- `--pretrained_model` argument in `train.py` (merged with `--agent_path`)
- `memory` parameter and all associated code from `SequenceRNN`


## New Features

- Tutorial for scaffold-based generation.
- Added tests to `testing` that allow checking the consistency of models between versions.
- GRU-based RNN added to the CLI.
- Added another possible implementation of similarity ranking (`MutualSimilaritySortRanking`), based on the code in the original repository of [DrugEx](https://github.com/XuhanLiu/DrugEx/blob/cd384f4a8ed4982776e92293f77afd4ea78644f9/utils/nsgaii.py#L92).
4 changes: 3 additions & 1 deletion README.md
@@ -24,8 +24,10 @@ pip install git+https://github.com/CDDLeiden/DrugEx.git@master

#### Optional Dependencies

**[QSPRPred](https://github.com/CDDLeiden/QSPRPred.git)** - Optional package to install if you want to use the command line interface of DrugEx, which requires the models to be serialized with this package. It is also used by some examples in the tutorial.

**[RAscore](https://github.com/reymond-group/RAscore)** - If you want to use the Retrosynthesis Accessibility Score in the desirability function.
- The installation of RAscore might degrade the Scikit-Learn packages. If this happens, Scikit-Learn should be re-upgraded.
- The installation of RAscore might downgrade the scikit-learn package. If this happens, scikit-learn should be re-upgraded.


### Use
2 changes: 1 addition & 1 deletion drugex/about.py
@@ -5,4 +5,4 @@
On: 24.06.22, 10:36
"""

VERSION = "3.3.0"
VERSION = "3.4.0.dev2"
7 changes: 2 additions & 5 deletions drugex/data/corpus/vocabulary.py
@@ -18,7 +18,7 @@
class VocSmiles(SequenceVocabulary):
"""The class for handling encoding/decoding from SMILES to an array of indices for the main SMILES-based models (`GPT2Model` and `RNN`)"""

defaultWords = ('#','%','(',')','-','0','1','2','3','4','5','6','7','8','9','=','B','C','F','I','L','N','O','P','R','S','[Ag-3]','[As+]','[As]','[B-]','[BH-]','[BH2-]','[BH3-]','[B]','[C+]','[C-]','[CH-]','[CH2]','[CH2-]','[CH]','[I+]','[IH2]','[N+]','[N-]','[NH+]','[NH-]','[NH2+]','[N]','[O+]','[O-]','[OH+]','[O]','[P+]','[PH]','[S+]','[S-]','[SH+]','[SH2]','[SH]','[Se+]','[SeH]','[Se]','[SiH2]','[SiH]','[Si]','[Te]','[b-]','[c+]','[c-]','[cH-]','[n+]','[n-]','[nH+]','[nH]','[o+]','[s+]','[se+]','[se]','[te+]',"[te]",'b','c','n','o','p','s'
defaultWords = ('#','%','(',')','-','0','1','2','3','4','5','6','7','8','9','=','B','C','F','I','L','N','O','P','R','S','[Ag-3]','[As+]','[As]','[B-]','[BH-]','[BH2-]','[BH3-]','[B]','[C+]','[C-]','[CH-]','[CH2]','[CH]','[I+]','[IH2]','[N+]','[N-]','[NH+]','[NH-]','[NH2+]','[N]','[O+]','[O-]','[OH+]','[O]','[P+]','[PH]','[S+]','[S-]','[SH+]','[SH2]','[SH]','[Se+]','[SeH]','[Se]','[SiH2]','[SiH]','[Si]','[Te]','[b-]','[c+]','[c-]','[cH-]','[n+]','[n-]','[nH+]','[nH]','[o+]','[s+]','[se+]','[se]','[te+]',"[te]",'b','c','n','o','p','s'
)

def __init__(self, encode_frags, words=defaultWords, max_len=100, min_len=10):
@@ -120,7 +120,6 @@ def encode(self, input, is_smiles=True):
seq_len = self.trg_len if is_smiles else self.src_len
output = torch.zeros(len(input), seq_len).long()
for i, seq in enumerate(input):
# print(i, len(seq))
for j, char in enumerate(seq):
output[i, j] = self.tk2ix[char] if is_smiles else self.tk2ix['|' + char]
return output
@@ -287,7 +286,6 @@ def encode(self, smiles, subs=None):
def decode(self, matrix):
frags, smiles = [], []
for m, adj in enumerate(matrix):
# print('decode: ', m)
emol = Chem.RWMol()
esub = Chem.RWMol()
try:
@@ -307,8 +305,7 @@
Chem.SanitizeMol(emol)
Chem.SanitizeMol(esub)
except Exception as e:
print(adj)
# raise e
raise e
frags.append(Chem.MolToSmiles(esub))
smiles.append(Chem.MolToSmiles(emol))
return frags, smiles
10 changes: 5 additions & 5 deletions drugex/data/datasets.py
@@ -96,8 +96,8 @@ def __call__(self, data, batch_size, vocabulary):
dataset = DataLoader(dataset, batch_size=batch_size, collate_fn=dataset.collate_fn)
return dataset

def __init__(self, path, voc=None, rewrite=False):
super().__init__(path, rewrite=rewrite)
def __init__(self, path, voc=None, rewrite=False, save_voc=True, voc_file=None):
super().__init__(path, rewrite=rewrite, save_voc=save_voc, voc_file=voc_file)
self.voc = voc if voc else VocSmiles(True)

def __call__(self, result):
@@ -142,8 +142,8 @@ class GraphFragDataSet(DataSet):
`DataSet` to manage the fragment-molecule pair encodings for the graph-based model (`GraphModel`).
"""

def __init__(self, path, voc=None, rewrite=False):
super().__init__(path, rewrite=rewrite)
def __init__(self, path, voc=None, rewrite=False, save_voc=True, voc_file=None):
super().__init__(path, rewrite=rewrite, save_voc=save_voc, voc_file=voc_file)
self.voc = voc if voc else VocGraph()

def __call__(self, result):
@@ -168,4 +168,4 @@ def getColumns(self):
def dataToLoader(data, batch_size, vocabulary):
dataset = torch.from_numpy(data).long().view(len(data), vocabulary.max_len, -1)
loader = DataLoader(dataset, batch_size=batch_size, drop_last=False, shuffle=True)
return loader
return loader
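
The new keyword arguments introduced in the constructors above can be exercised as follows; a minimal sketch with hypothetical file paths:

```python
# Minimal sketch of the new save_voc/voc_file arguments; paths are hypothetical.
from drugex.data.datasets import GraphFragDataSet

dataset = GraphFragDataSet(
    'data/train_graph.tsv',        # hypothetical output path
    rewrite=True,                  # start from a fresh file
    save_voc=True,                 # write the vocabulary next to the data
    voc_file='data/shared.vocab',  # shared vocabulary file, overriding the
                                   # default '<path>.vocab'
)
```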
2 changes: 1 addition & 1 deletion drugex/data/fragments.py
@@ -246,7 +246,7 @@ def __init__(self, fragmenter, encoder, pairs_splitter=None, n_proc=None, chunk_
"""
Args:
fragmenter: a `MolConverter` that returns a `list` of (fragment, molecule) `tuple`s for a given molecule supplied as its SMILES string. The reference implementation is `Fragmenter`.
fragmenter (MolConverter): a `MolConverter` that returns a `list` of (fragment, molecule) `tuple`s for a given molecule supplied as its SMILES string. See the reference implementation in `Fragmenter`.
encoder: a `FragmentPairEncoder` that handles how molecules and fragments are encoded in the final result
pairs_splitter: a `ChunkSplitter` that divides the generated molecule-fragment pairs from the "fragmenter" to splits (i.e. test and train)
n_proc: number of processes to use for parallel operations
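
A hedged wiring sketch based on the docstring above; apart from `FragmentCorpusEncoder` itself, the helper class names and argument values here are assumptions for illustration:

```python
# Hedged sketch: the helper classes and argument values are assumptions
# based on the docstring, not a verified recipe.
from drugex.data.corpus.vocabulary import VocSmiles
from drugex.data.fragments import (
    FragmentCorpusEncoder, Fragmenter, SequenceFragmentEncoder,
)

pair_encoder = FragmentCorpusEncoder(
    fragmenter=Fragmenter(4, 4, 'brics'),              # assumed constructor
    encoder=SequenceFragmentEncoder(VocSmiles(True)),  # assumed encoder class
    pairs_splitter=None,                               # no train/test split
    n_proc=4,                                          # parallel workers
)
```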
26 changes: 21 additions & 5 deletions drugex/data/interfaces.py
@@ -46,17 +46,28 @@ class DataSet(ResultCollector, ABC):
Data sets represent encoded input data for the various DrugEx models. Each `DataSet` is associated with a file and also acts as a `ResultCollector` to append data from parallel operations (see `ParallelProcessor`). The `DataSet` is also coupled with the `Vocabulary` used to encode the data in it. However, `Vocabulary` is usually saved in a separate file(s) and needs to be loaded explicitly with `DataSet.readVocs()`.
"""

def __init__(self, path, rewrite=False):
def __init__(self, path, rewrite=False, save_voc=True, voc_file=None):
"""
Initialize this `DataSet`. A path to the associated file must be given. Data is saved to this file upon calling `DataSet.save()`.
If the associated file already exists, the data is loaded automatically upon initialization.
Args:
path: path to the output file.
Parameters
----------
path : str
Path to the file to use for this `DataSet`.
rewrite : bool
If `True`, the associated file is deleted and a new one is created. If `False`, the data is loaded from the file if it exists.
save_voc : bool
If `True`, the vocabulary is saved to a separate file. If `False`, the vocabulary is not saved.
voc_file : str
Path to the file to use for the vocabulary. If `None`, the vocabulary is saved to a file with the same name as the data set file but with the `.vocab` extension.
"""

self.outpath = path
self.save_voc = save_voc
self.voc_file = voc_file

if not os.path.exists(os.path.dirname(self.outpath)):
os.makedirs(os.path.dirname(self.outpath))
self.voc = None
@@ -80,7 +91,10 @@ def reset(self):
logger.info(f"{self} initialized.")

def getVocPath(self):
return f"{self.outpath}.vocab"
if self.voc_file:
return self.voc_file
else:
return f'{self.outpath}.vocab'

def sendDataToFile(self, data, columns=None):
header_written = os.path.isfile(self.outpath)
@@ -120,12 +134,14 @@ def updateVoc(self, voc):
Returns:
`None`
"""

if not self.voc:
self.voc = voc
else:
self.voc += voc

self.voc.toFile(self.getVocPath())
if self.save_voc:
self.voc.toFile(self.getVocPath())

def getVoc(self):
"""
49 changes: 42 additions & 7 deletions drugex/data/utils.py
@@ -3,23 +3,58 @@
from drugex import VERSION
from drugex.logs import logger

def getVocPaths(data_path, voc_files, mol_type):
def getVocPaths(data_path, voc_files):
"""
Get paths to vocabulary files. If none are found, use internal defaults.
Parameters
----------
data_path : str
Path to data directory.
voc_files : list
List of vocabulary file names.
Returns
-------
list
List of paths to vocabulary files.
"""

voc_paths = []
for v in voc_files:
path = data_path + f"{v}_{mol_type}_voc.txt"
if not os.path.exists(path):
logger.warning(f'Reading {mol_type}_voc.txt instead of {path}')
path = data_path + f"{mol_type}_voc.txt"
for voc_file in voc_files:
path = f'{data_path}/{voc_file}'
if os.path.exists(path):
voc_paths.append(path)
else:
logger.warning(f"No vocabulary files found. Using internal defaults for DrugEx v{VERSION}.")
logger.warning(f'Could not find vocabulary file {voc_file} in {data_path}.')

if len(voc_paths) == 0 :
logger.warning(f'No vocabulary files found. Using internal defaults for DrugEx v{VERSION}.')

return voc_paths
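
An example call against the new implementation; the directory and file names are hypothetical:

```python
# Hypothetical call to the new getVocPaths(); note that paths are now joined
# as f'{data_path}/{voc_file}' rather than built from a mol_type pattern.
voc_paths = getVocPaths('data', ['smiles.txt.vocab', 'extra.vocab'])
# -> ['data/smiles.txt.vocab'] plus a warning for the missing file; an empty
#    result triggers the fallback to the internal default vocabulary.
```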

def getDataPaths(data_path, input_prefix, mol_type, unique_frags):

"""
Get paths to training and test data files.
Parameters
----------
data_path : str
Path to data directory.
input_prefix : str
Prefix of data files. If a file with the exact name exists, it is used for both training and testing.
mol_type : str
Type of molecules in data files. Either 'smiles' or 'graph'.
unique_frags : bool
Whether to use unique fragments or not.
Returns
-------
Tuple[str, str]
Paths to training and test data files.
"""

# If exact data path was given as input, that data is both used for training and testing
if os.path.exists(data_path + input_prefix):
train_path = data_path + input_prefix
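
A hedged usage sketch based on the docstring above; the prefix value is hypothetical, and `data_path` keeps its trailing separator because the function concatenates the paths directly:

```python
# Hypothetical call; getDataPaths() concatenates data_path + input_prefix
# directly, so the trailing slash on data_path matters.
train_path, test_path = getDataPaths(
    'data/', 'chembl_train', mol_type='smiles', unique_frags=False,
)
```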
