Commit

Merge branch 'dev' into 'master'
Dev

See merge request cdd/DrugEx!96
sohviluukkonen committed Mar 2, 2023
2 parents aaef0f7 + 88e2b38 commit 1069f0e
Showing 44 changed files with 1,995 additions and 3,803 deletions.
54 changes: 14 additions & 40 deletions CHANGELOG.md
@@ -1,52 +1,26 @@
# Change Log
From v3.3.0 to v3.4.0
From v3.4.0 to v3.4.1

## Fixes

None.

- Content of the output files written during model training and molecule generation (broken due to refactoring in `v3.4.0`):
  - During fine-tuning, the training loss (`train_loss`), the validation loss (`valid_loss`), and the ratios of valid (`valid_ratio`) and accurate (`accurate_ratio`, transformers only) molecules are saved in `_fit.tsv`
  - During RL, the ratios of valid (`valid_ratio`), accurate (`accurate_ratio`, transformers only), unique (`unique_ratio`) and desired (`desired_ratio`) molecules, together with the arithmetic (`avg_amean`) and geometric (`avg_gmean`) means of the modified scores, are saved in `_fit.tsv`
- In `DrugExEnvironment.getScores()`, all modified scores are now set to 0 for invalid molecules (fixes a bug resulting from refactoring in `v3.4.0`)
- Fixed the CLI so that it supports new QSPRPred models.
- Fixed the tutorial for scaffold-based generation.

## Changes

Major refactoring of `drugex.training`

- Moving generators from `drugex.training.models` to `drugex.training.generators`, and harmonizing and renaming them
- `RNN` -> `SequenceRNN`
- `GPT2Model` -> `SequenceTransformer`
- `GraphModel` -> `GraphTransformer`

- Moving explorers from `drugex.training.models` to `drugex.training.explorers`, harmonizing and renaming them
- `SmilesExplorerNoFrag` -> `SequenceExplorer`
- `SmilesExplorer` -> `FragSequenceExplorer`
- `GraphExplorer` -> `FragGraphExplorer`

- Removal of all obsolete modules related to the two discontinued fragment-based LSTM models from [DrugEx v3](https://doi.org/10.26434/chemrxiv-2021-px6kz).

- The generators' `sample_smiles()` has been replaced by a `generate()` function (see the usage sketch after this changelog)

- Clarification of the terms qualifying the generated molecules, which now have the following unique and constant definitions (replacing the ambiguous `VALID` and `DESIRE` terms)
  - `Valid` : molecule can be parsed with RDKit
  - `Accurate` : molecule contains the given input fragments
  - `Desired` : molecule fulfils all given objectives

- Minimal supported version of QSPRPred compatible with the tutorial and CLI is now `v1.3.0.dev0`.
- The `train` CLI script now uses the `'-p', '--predictor'` option to specify the QSPRPred model to use. It takes a path to the model's `_meta.json` file. More models can be specified this way.
- This changes the original meaning of the `'-ta', '--active_targets'`, `'-ti', '--inactive_targets'` and `'-tw', '--window_targets'` options. These now serve to link the models to the particular type of target. The name of the QSPRPred model is used to determine the type of target it represents. For example, if the QSPRPred model is called `A2AR_RandomForestClassifier`, then the `'-ta', '--active_targets'` option will be used to link to the `A2AR_RandomForestClassifier` as a predictor predicting activity towards a target.
- Standard crowding distance is now the default ranking method for the `train` script (equiv. to `--scheme PRCD`, previously was `--scheme PRTD`).

- Revise the implementation of the Tanimoto distance-based Pareto ranking scheme (`SimilarityRanking`) to correspond to the method described in [DrugEx v2](https://doi.org/10.1186/s13321-021-00561-9). Add an option to use the minimum Tanimoto distance between molecules in a front instead of the mean distance.

- Remove all references to NN-based RAscore (already discontinued)

Refactoring of CLI

- Refactoring `dataset.py` and `train.py` to an object-based design
- Writing a single `.txt.vocab` file per dataset preprocessing run instead of separate (duplicate) files for each subset in `dataset.py`

## Removed

- `--save_voc` argument in `dataset.py` as redundant
- `--pretrained_model` argument in `train.py` (merged with `--agent_path`)
- `memory` parameter and all associated code from `SequenceRNN`
## Removed Features

None.

## New Features

- GRU-based RNN added to the CLI
- Added another possible implementation of similarity ranking (`MutualSimilaritySortRanking`), based on the code in the original repository of [DrugEx](https://github.com/XuhanLiu/DrugEx/blob/cd384f4a8ed4982776e92293f77afd4ea78644f9/utils/nsgaii.py#L92)
None.
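
A minimal usage sketch of the renamed generator API described in the changelog above. The import path and the `generate()` keyword names follow the changelog and `drugex/generate.py`; the vocabulary class, file paths, constructor arguments and the state-loading call are illustrative assumptions, not code from this commit.

```python
# Sketch only: paths, the constructor arguments and the state-loading call are
# assumed; the import path and generate() keywords follow the changelog.
from drugex.data.corpus.vocabulary import VocGraph
from drugex.training.generators import GraphTransformer  # formerly GraphModel

voc = VocGraph.fromFile('data/voc_graph.txt.vocab')      # hypothetical vocabulary file
agent = GraphTransformer(voc_trg=voc)                    # constructor arguments assumed
agent.loadStatesFromFile('generators/agent_graph.pkg')   # hypothetical checkpoint

# generate() replaces the old sample_smiles(); per the definitions above, the
# output distinguishes Valid (parsable by RDKit), Accurate (contains the input
# fragments) and Desired (fulfils all objectives) molecules.
new_mols = agent.generate(num_samples=100, batch_size=32,
                          drop_invalid=True, drop_duplicates=True)
```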
2 changes: 1 addition & 1 deletion drugex/about.py
@@ -5,4 +5,4 @@
On: 24.06.22, 10:36
"""

VERSION = "3.4.0"
VERSION = "3.4.1"
4 changes: 3 additions & 1 deletion drugex/data/corpus/vocabulary.py
@@ -12,6 +12,7 @@
from rdkit import Chem

from drugex.data.corpus.interfaces import SequenceVocabulary, Vocabulary
from drugex.logs import logger
from drugex.molecules.converters.standardizers import CleanSMILES


@@ -305,7 +306,8 @@ def decode(self, matrix):
Chem.SanitizeMol(emol)
Chem.SanitizeMol(esub)
except Exception as e:
raise e
logger.error(f'Error while decoding: {adj}')
logger.error(e)
frags.append(Chem.MolToSmiles(esub))
smiles.append(Chem.MolToSmiles(emol))
return frags, smiles
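
The `decode()` change above replaces a hard re-raise with logging, so one problematic molecule no longer aborts decoding of a whole batch. A self-contained sketch of that log-and-continue pattern, independent of the DrugEx vocabulary classes (the helper below is illustrative, not part of this commit):

```python
import logging

from rdkit import Chem

logger = logging.getLogger('drugex')

def safe_sanitize_to_smiles(mol):
    """Sanitize an RDKit molecule and return its SMILES; on failure, log the
    error and return None instead of re-raising (mirroring decode() above)."""
    try:
        Chem.SanitizeMol(mol)
        return Chem.MolToSmiles(mol)
    except Exception as e:
        logger.error(f'Error while decoding: {e}')
        return None
```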
4 changes: 3 additions & 1 deletion drugex/data/utils.py
@@ -3,7 +3,7 @@
from drugex import VERSION
from drugex.logs import logger

def getVocPaths(data_path, voc_files):
def getVocPaths(data_path, voc_files, mol_type):
"""
Get paths to vocabulary files. If none are found, use internal defaults.
@@ -25,6 +25,8 @@ def getVocPaths(data_path, voc_files):
path = f'{data_path}/{voc_file}'
if os.path.exists(path):
voc_paths.append(path)
elif os.path.exists(path + f'_{mol_type}.txt.vocab'):
voc_paths.append(path + f'_{mol_type}.txt.vocab')
else:
logger.warning(f'Could not find vocabulary file {voc_file} in {data_path}.')

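
A self-contained sketch of the fallback that the `getVocPaths` change above introduces: if a vocabulary file is not found as given, the per-molecule-type file written by `dataset.py` (`<voc_file>_<mol_type>.txt.vocab`) is tried next. The helper name and return convention are illustrative, not the library's exact code.

```python
import os

def resolve_voc_path(data_path, voc_file, mol_type):
    """Return the first existing vocabulary path: the file as given, otherwise
    its per-mol-type '.txt.vocab' variant; None if neither exists (the caller
    then warns and falls back to the internal default vocabulary)."""
    path = f'{data_path}/{voc_file}'
    if os.path.exists(path):
        return path
    per_type = path + f'_{mol_type}.txt.vocab'
    if os.path.exists(per_type):
        return per_type
    return None
```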
6 changes: 3 additions & 3 deletions drugex/dataset.py
@@ -173,9 +173,9 @@ def setPairCollectors(self):

pair_collectors = dict()
if self.save_intermediate_files:
pair_collectors['train_collector'] = lambda x : pd.DataFrame(x, columns=['Frags', 'Smiles']).to_csv(f'{self.file_base}_train.txt', sep='\t', index=False)
pair_collectors['test_collector'] = lambda x : pd.DataFrame(x, columns=['Frags', 'Smiles']).to_csv(f'{self.file_base}_test.txt', sep='\t', index=False)
pair_collectors['unique_collector'] = lambda x : pd.DataFrame(x, columns=['Frags', 'Smiles']).to_csv(f'{self.file_base}_unique.txt', sep='\t', index=False)
pair_collectors['train_collector'] = lambda x : pd.DataFrame(x, columns=['Frags', 'SMILES']).to_csv(f'{self.file_base}_train.txt', sep='\t', index=False)
pair_collectors['test_collector'] = lambda x : pd.DataFrame(x, columns=['Frags', 'SMILES']).to_csv(f'{self.file_base}_test.txt', sep='\t', index=False)
pair_collectors['unique_collector'] = lambda x : pd.DataFrame(x, columns=['Frags', 'SMILES']).to_csv(f'{self.file_base}_unique.txt', sep='\t', index=False)

return pair_collectors

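
The collectors above write tab-separated files whose molecule column is now named `SMILES` (previously `Smiles`). A small, hedged example of reading one of those intermediate files back; the file name is a placeholder:

```python
import pandas as pd

# File name is hypothetical; the columns follow the pair collectors above.
pairs = pd.read_csv('ligand_train.txt', sep='\t')
print(pairs[['Frags', 'SMILES']].head())
```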
89 changes: 66 additions & 23 deletions drugex/designer.py → drugex/generate.py
@@ -22,7 +22,7 @@ def DesignArgParser(txt=None):
parser.add_argument('-g', '--generator', type=str, default='ligand_mf_brics_gpt_128',
help="Name of final generator model file without .pkg extension")
parser.add_argument('-i', '--input_file', type=str, default='ligand_4:4_brics_test',
help="For v3, name of file containing fragments for generation without _graph.txt / _smi.txt extension")
help="For v3, name of file containing fragments for generation without _graph.txt / _smiles.txt extension")
# TODO: Is reading voc files necessary? Is the vocabulary saved to the generator file?
parser.add_argument('-vfs', '--voc_files', type=str, nargs='*', default=['smiles'],
help="Names of voc files to use as vocabulary.")
@@ -36,8 +36,58 @@
parser.add_argument('--keep_undesired', action='store_true',
help="If on, undesirable molecules are kept in the output. Else, they are dropped.")


parser.add_argument('-gpu', '--gpu', type=str, default='1,2,3,4',
# Affinity models
parser.add_argument('-p', '--predictor', type=str, nargs='*', default=['RF'],
help="The path to the serialized metadata of a QSPRPred model (ie. 'RF_meta.json'). If different environments are required give environment of targets in order active, inactive, window.")
parser.add_argument('-at', '--activity_threshold', type=float, default=6.5,
help="Activity threshold")
parser.add_argument('-ta', '--active_targets', type=str, nargs='*', default=[],
help="Names of models that predict activity.")
parser.add_argument('-ti', '--inactive_targets', type=str, nargs='*', default=[],
help="Names of models that predict inactivity.")
parser.add_argument('-tw', '--window_targets', type=str, nargs='*', default=[],
help="Names of models for which selectivity window is calculated.")
parser.add_argument('-le', '--ligand_efficiency', action='store_true',
help="If on, use the ligand efficiency instead of the simple affinity as objective for active targets.")
parser.add_argument('-le_ths', '--le_thresholds', type=float, nargs=2, default=[0.0, 0.5],
help='Thresholds used to calculate ligand efficiency clipped scores in the desirability function.')
parser.add_argument('-lipe', '--lipophilic_efficiency', action='store_true',
help="If on, use the ligand lipophilic efficiency instead of the simple affinity as objective for active targets.")
parser.add_argument('-lipe_ths', '--lipe_thresholds', type=float, nargs=2, default=[4.0, 6.0],
help='Thresholds used to calculate lipophilic efficiency clipped scores in the desirability function.')

# Pre-implemented properties
parser.add_argument('-qed', '--qed', action='store_true',
help="If on, QED is used in desirability function")
parser.add_argument('-unq', '--uniqueness', action='store_true',
help="If on, molecule uniqueness is used in desirability function")
parser.add_argument('-sas', '--sa_score', action='store_true',
help="If on, Synthetic Accessibility score is used in desirability function")
parser.add_argument('-ras', '--ra_score', action='store_true',
help="If on, Retrosynthesis Accessibility score is used in desirability function")
parser.add_argument('-mw', '--molecular_weight', action='store_true',
help='If on, compounds with molecular weights outside a range set by mw_thresholds are penalized in the desirability function')
parser.add_argument('-mw_ths', '--mw_thresholds', type=int, nargs='*', default=[200, 600],
help='Thresholds used to calculate molecular weight clipped scores in the desirability function.')
parser.add_argument('-logP', '--logP', action='store_true',
help='If on, compounds with logP values outside a range set by logP_thresholds are penalized in the desirability function')
parser.add_argument('-logP_ths', '--logP_thresholds', type=float, nargs='*', default=[-5, 5],
help='Thresholds used to calculate logP clipped scores in the desirability function')
parser.add_argument('-tpsa', '--tpsa', action='store_true',
help='If on, topological polar surface area is used in desirability function')
parser.add_argument('-tpsa_ths', '--tpsa_thresholds', type=float, nargs=2, default=[0, 140],
help='Thresholds used to calculate TPSA clipped scores in the desirability function')
parser.add_argument('-sim_mol', '--similarity_mol', type=str, default=None,
help='SMILES string of a reference molecule to which the similarity is used as an objective. Similarity metric and threshold set by --sim_metric and --sim_th.')
parser.add_argument('-sim_type', '--similarity_type', type=str, default='fraggle',
help="'fraggle' for Fraggle similarity, 'graph' for Tversky similarity between graphs or fingerprints name ('AP', 'PHCO', 'BPF', 'BTF', 'PATH', 'ECFP4', 'ECFP6', 'FCFP4', 'FCFP6') for Tversky similarity between fingeprints")
parser.add_argument('-sim_th', '--similarity_threshold', type=float, default=0.5,
help="Threshold for molecular similarity to reference molecule")
parser.add_argument('-sim_tw', '--similarity_tversky_weights', nargs=2, type=float, default=[0.7, 0.3],
help="Weights (alpha and beta) for Tversky similarity. If both equal to 1.0, Tanimoto similarity.")


parser.add_argument('-gpu', '--use_gpus', type=str, default='1,2,3,4',
help="List of GPUs")
parser.add_argument('-bs', '--batch_size', type=int, default=1048,
help="Batch size")
@@ -68,10 +118,7 @@ def DesignerFragsDataPreparation(
voc_files : list,
data_path : str,
input_file : str,
mol_type : str,
alg : str,
batch_size=128,
n_samples=-1):
mol_type : str):

"""
Reads and preprocesses the vocabulary and input data for a graph-based generator
@@ -95,8 +142,8 @@
input_path = data_path + input_file
assert os.path.exists(input_path)
except:
input_path = data_path + '_'.join([input_file, 'test', mol_type if mol_type == 'graph' else 'smi']) + '.txt'
assert os.path.exists(input_path)
input_path = data_path + '_'.join([input_file, 'test', mol_type if mol_type == 'graph' else 'smiles']) + '.txt'
assert os.path.exists(input_path), f'Input file {input_path} does not exist'
logSettings.log.info(f'Loading input fragments from {input_path}')

if mol_type == 'graph' :
@@ -109,33 +156,29 @@
if voc_paths:
# TODO: SOFTCODE number of fragments !!!!
data_set.readVocs(voc_paths, VocSmiles, max_len=100, encode_frags=True)
voc = data_set.getVoc()

loader = data_set.asDataLoader(batch_size=batch_size, n_samples=n_samples)
return voc, loader
return data_set

def Design(args):

log = logSettings.log

args.gpu = [int(x) for x in args.gpu.split(',')]
args.use_gpus = [int(x) for x in args.use_gpus.split(',')]

data_path = args.base_dir + '/data/'

if not os.path.exists(args.base_dir + '/new_molecules'):
os.makedirs(args.base_dir + '/new_molecules')

if args.algorithm != 'rnn':
voc, loader = DesignerFragsDataPreparation(args.voc_files,
data_set = DesignerFragsDataPreparation(args.voc_files,
data_path,
args.input_file,
args.mol_type,
args.algorithm,
args.batch_size,
args.num
args.mol_type
)
voc = data_set.getVoc()
else:
voc_paths = DataPreparation(args.base_dir, args.voc_files, None, None, None).getVocPaths()
voc_paths = DataPreparation(args.base_dir, args.voc_files, None, None, None, args.mol_type).getVocPaths()
voc = VocSmiles.fromFile(voc_paths[0], False, max_len=100)

# Load generator model
@@ -148,8 +191,7 @@ def Design(args):
# Set up environment-predictor
env = CreateEnvironment(
args.base_dir,
args.env_alg,
args.env_task,
args.predictor,
args.scheme,
active_targets=args.active_targets,
inactive_targets=args.inactive_targets,
@@ -162,6 +204,7 @@
mw_ths=args.mw_thresholds,
logP=args.logP,
logP_ths=args.logP_thresholds,
logger=log
)

out = args.base_dir + '/new_molecules/' + args.generator + '.tsv'
@@ -174,10 +217,10 @@

gen_kwargs = dict(num_samples=args.num, batch_size=args.batch_size, n_proc=8,
drop_invalid=not args.keep_invalid, no_multifrag_smiles=True, drop_duplicates=not args.keep_duplicates, drop_undesired=not args.keep_undesired,
evaluator=env, compute_desirability=True, raw_scores=True)
evaluator=env, raw_scores=True)

if args.algorithm != 'rnn':
gen_kwargs['input_loader'] = loader
gen_kwargs['input_dataset'] = data_set
gen_kwargs['keep_frags'] = True

df_mols = agent.generate(**gen_kwargs)
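
A condensed sketch of how the pieces of `Design()` above fit together for fragment-based generators. `agent`, `env`, `data_set` and `out` are assumed to be prepared exactly as in the surrounding diff; the keyword names mirror `gen_kwargs` in this commit, while the values are placeholders.

```python
# Assumes agent, env, data_set and out are set up as in Design() above.
gen_kwargs = dict(
    num_samples=1000, batch_size=512, n_proc=8,   # placeholder values
    drop_invalid=True, no_multifrag_smiles=True,
    drop_duplicates=True, drop_undesired=True,
    evaluator=env, raw_scores=True,
)
# Fragment-based generators now receive the dataset itself instead of a DataLoader.
gen_kwargs['input_dataset'] = data_set            # replaces the old 'input_loader'
gen_kwargs['keep_frags'] = True

df_mols = agent.generate(**gen_kwargs)            # scored molecules as a DataFrame
df_mols.to_csv(out, sep='\t', index=False)        # writing step assumed, not shown in the diff
```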
2 changes: 1 addition & 1 deletion drugex/logs/__init__.py
@@ -14,4 +14,4 @@
logger.setLevel(logging.INFO)

def setLogger(log):
setattr(sys.modules[__name__], 'logger', log)
setattr(sys.modules[__name__], 'logger_drugex', log)
