From c3804c1211511345e98bc8df6de14ceb13402328 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:57:02 +0200 Subject: [PATCH 01/87] replace fingeprints with scikit-fingerprints package --- README.md | 2 +- baybe/_optional/chem.py | 15 +- baybe/_optional/info.py | 4 +- baybe/parameters/enum.py | 46 +++-- baybe/parameters/substance.py | 20 +-- baybe/searchspace/core.py | 24 +-- .../gaussian_process/presets/edbo.py | 10 +- baybe/utils/chemistry.py | 163 ++---------------- docs/userguide/constraints.md | 4 +- docs/userguide/parameters.md | 10 +- examples/Backtesting/full_initial_data.py | 10 +- examples/Backtesting/full_lookup.py | 4 +- examples/Backtesting/impute_mode.py | 10 +- examples/Basics/campaign.py | 10 +- examples/Basics/recommenders.py | 8 +- .../custom_constraints.py | 2 +- .../dependency_constraints.py | 4 +- .../exclusion_constraints.py | 4 +- .../mixture_constraints.py | 12 +- .../prodsum_constraints.py | 4 +- examples/Custom_Hooks/campaign_stopping.py | 8 +- .../Custom_Surrogates/surrogate_params.py | 2 +- examples/Serialization/create_from_config.py | 2 +- examples/Serialization/validate_config.py | 4 +- mypy.ini | 6 +- pyproject.toml | 2 +- tests/conftest.py | 4 +- tests/hypothesis_strategies/parameters.py | 2 +- tests/simulate_telemetry.py | 8 +- tests/test_substance_parameter.py | 2 +- 30 files changed, 153 insertions(+), 253 deletions(-) diff --git a/README.md b/README.md index 6b83d4d10..6138d0082 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ parameters = [ "Solvent C": "O", "Solvent D": "CS(=O)C", }, - encoding="MORDRED", # chemical encoding via mordred package + encoding="DefaultFingerprint", # chemical encoding via scikit-fingerprints ), ] ``` diff --git a/baybe/_optional/chem.py b/baybe/_optional/chem.py index 1d9d661a6..cb2c35555 100644 --- a/baybe/_optional/chem.py +++ b/baybe/_optional/chem.py @@ -3,9 +3,10 @@ from baybe.exceptions import OptionalImportError try: 
- from mordred import Calculator, descriptors - from rdkit import Chem, RDLogger - from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect + from rdkit import Chem + from skfp import fingerprints as skfp_fingerprints + from skfp.bases import BaseFingerprintTransformer + except ModuleNotFoundError as ex: raise OptionalImportError( "Chemistry functionality is unavailable because the necessary optional " @@ -14,10 +15,4 @@ "e.g. via `pip install baybe[chem]`." ) from ex -__all__ = [ - "descriptors", - "Calculator", - "Chem", - "GetMorganFingerprintAsBitVect", - "RDLogger", -] +__all__ = ["Chem", "skfp_fingerprints", "BaseFingerprintTransformer"] diff --git a/baybe/_optional/info.py b/baybe/_optional/info.py index e725b4799..a91403bdc 100644 --- a/baybe/_optional/info.py +++ b/baybe/_optional/info.py @@ -25,13 +25,13 @@ def exclude_sys_path(path: str, /): # noqa: DOC402, DOC404 # Individual packages with exclude_sys_path(os.getcwd()): FLAKE8_INSTALLED = find_spec("flake8") is not None - MORDRED_INSTALLED = find_spec("mordred") is not None ONNX_INSTALLED = find_spec("onnxruntime") is not None POLARS_INSTALLED = find_spec("polars") is not None PRE_COMMIT_INSTALLED = find_spec("pre_commit") is not None PYDOCLINT_INSTALLED = find_spec("pydoclint") is not None RDKIT_INSTALLED = find_spec("rdkit") is not None RUFF_INSTALLED = find_spec("ruff") is not None + SKFP_INSTALLED = find_spec("skfp") is not None # scikit-fingerprints STREAMLIT_INSTALLED = find_spec("streamlit") is not None XYZPY_INSTALLED = find_spec("xyzpy") is not None @@ -44,7 +44,7 @@ def exclude_sys_path(path: str, /): # noqa: DOC402, DOC404 TYPOS_INSTALLED = True # Package combinations -CHEM_INSTALLED = MORDRED_INSTALLED and RDKIT_INSTALLED +CHEM_INSTALLED = RDKIT_INSTALLED and SKFP_INSTALLED LINT_INSTALLED = all( ( FLAKE8_INSTALLED, diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 7985a928a..83b52571e 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ 
-2,6 +2,8 @@ from enum import Enum +from baybe._optional.info import CHEM_INSTALLED + class ParameterEncoding(Enum): """Generic base class for all parameter encodings.""" @@ -17,17 +19,39 @@ class CategoricalEncoding(ParameterEncoding): """Integer encoding.""" -class SubstanceEncoding(ParameterEncoding): - """Available encodings for substance parameters.""" - - MORDRED = "MORDRED" - """Encoding based on Mordred chemical descriptors.""" - - RDKIT = "RDKIT" - """Encoding based on RDKit chemical descriptors.""" - - MORGAN_FP = "MORGAN_FP" - """Encoding based on Morgan molecule fingerprints.""" +# TODO Ideally, this should be turned into a class that can: +# - return default when CHEM not installed +# - check if enum is fingerprint +PARAM_SUFFIX_FINGERPRINT = "Fingerprint" + +if CHEM_INSTALLED: + import inspect + + from baybe._optional.chem import BaseFingerprintTransformer, skfp_fingerprints + + AVAILABLE_SKFP_FP = dict( + inspect.getmembers( + skfp_fingerprints, + lambda x: inspect.isclass(x) and issubclass(x, BaseFingerprintTransformer), + ) + ) + AVAILABLE_SKFP_FP["Default"] = AVAILABLE_SKFP_FP["MordredFingerprint"] +else: + AVAILABLE_SKFP_FP = {"Default": None} + +AVAILABLE_SKFP_FP = { + ( + name + if name.endswith(PARAM_SUFFIX_FINGERPRINT) + else name + PARAM_SUFFIX_FINGERPRINT + ): fp + for name, fp in AVAILABLE_SKFP_FP.items() +} + +SubstanceEncoding = ParameterEncoding( + "SubstanceEncoding", dict(zip(AVAILABLE_SKFP_FP.keys(), AVAILABLE_SKFP_FP.keys())) +) +"""Available encodings for substance parameters.""" class CustomEncoding(ParameterEncoding): diff --git a/baybe/parameters/substance.py b/baybe/parameters/substance.py index 40a741238..1c8c3a142 100644 --- a/baybe/parameters/substance.py +++ b/baybe/parameters/substance.py @@ -8,7 +8,7 @@ from attrs.validators import and_, deep_mapping, instance_of, min_len from baybe.parameters.base import DiscreteParameter -from baybe.parameters.enum import SubstanceEncoding +from baybe.parameters.enum import 
AVAILABLE_SKFP_FP, SubstanceEncoding from baybe.parameters.validation import validate_decorrelation from baybe.utils.basic import group_duplicate_values from baybe.utils.dataframe import df_drop_single_value_columns, df_uncorrelated_features @@ -58,7 +58,7 @@ class SubstanceParameter(DiscreteParameter): """ encoding: SubstanceEncoding = field( - default=SubstanceEncoding.MORDRED, converter=SubstanceEncoding + default=SubstanceEncoding.DefaultFingerprint, converter=SubstanceEncoding ) # See base class. @@ -118,22 +118,18 @@ def comp_df(self) -> pd.DataFrame: # noqa: D102 pref = self.name + "_" # Get the raw descriptors - if self.encoding is SubstanceEncoding.MORDRED: - comp_df = chemistry.smiles_to_mordred_features(vals, prefix=pref) - elif self.encoding is SubstanceEncoding.RDKIT: - comp_df = chemistry.smiles_to_rdkit_features(vals, prefix=pref) - elif self.encoding is SubstanceEncoding.MORGAN_FP: - comp_df = chemistry.smiles_to_fp_features(vals, prefix=pref) - else: - raise ValueError( - f"Unknown parameter encoding {self.encoding} for parameter {self.name}." - ) + comp_df = chemistry.smiles_to_fingerprint_features( + vals, + fingerprint_encoder=AVAILABLE_SKFP_FP[self.encoding.name](), + prefix=pref, + ) # Drop NaN and constant columns comp_df = comp_df.loc[:, ~comp_df.isna().any(axis=0)] comp_df = df_drop_single_value_columns(comp_df) # If there are bool columns, convert them to int (possible for Mordred) + # TODO should this be removed as with skfp all Mordred columns are float32? 
bool_cols = comp_df.select_dtypes(bool).columns comp_df[bool_cols] = comp_df[bool_cols].astype(int) diff --git a/baybe/searchspace/core.py b/baybe/searchspace/core.py index 23af0188f..1caec0ac5 100644 --- a/baybe/searchspace/core.py +++ b/baybe/searchspace/core.py @@ -14,8 +14,9 @@ validate_constraints, ) from baybe.constraints.base import Constraint -from baybe.parameters import SubstanceEncoding, TaskParameter +from baybe.parameters import TaskParameter from baybe.parameters.base import Parameter +from baybe.parameters.enum import PARAM_SUFFIX_FINGERPRINT from baybe.searchspace.continuous import SubspaceContinuous from baybe.searchspace.discrete import ( MemorySize, @@ -226,18 +227,19 @@ def type(self) -> SearchSpaceType: return SearchSpaceType.HYBRID raise RuntimeError("This line should be impossible to reach.") + # TODO replaces previously used contains_mordred and contains_rdkit + # which are both used likewise in edbo.py - + # not sure if this can be extrapolated to all fingerprints by using single property @property - def contains_mordred(self) -> bool: - """Indicates if any of the discrete parameters uses ``MORDRED`` encoding.""" + def contains_fingerprint(self) -> bool: + """Indicates if any of the discrete parameters uses ``Fingerprint`` encoding.""" return any( - p.encoding is SubstanceEncoding.MORDRED for p in self.discrete.parameters - ) - - @property - def contains_rdkit(self) -> bool: - """Indicates if any of the discrete parameters uses ``RDKIT`` encoding.""" - return any( - p.encoding is SubstanceEncoding.RDKIT for p in self.discrete.parameters + ( + False + if p.encoding is None + else p.encoding.name.endswith(PARAM_SUFFIX_FINGERPRINT) + ) + for p in self.discrete.parameters ) @property diff --git a/baybe/surrogates/gaussian_process/presets/edbo.py b/baybe/surrogates/gaussian_process/presets/edbo.py index 489f8f784..f66b7d0ef 100644 --- a/baybe/surrogates/gaussian_process/presets/edbo.py +++ b/baybe/surrogates/gaussian_process/presets/edbo.py @@ 
-36,9 +36,9 @@ def __call__( # noqa: D102 [p for p in searchspace.parameters if isinstance(p, TaskParameter)] ) - mordred = (searchspace.contains_mordred or searchspace.contains_rdkit) and ( - effective_dims >= 50 - ) + # TODO rename this to fingerprint after decided + # that it can be used for all fingerprints + mordred = searchspace.contains_fingerprint and (effective_dims >= 50) # low D priors if effective_dims < 5: @@ -95,9 +95,7 @@ def _edbo_noise_factory( [p for p in searchspace.parameters if isinstance(p, TaskParameter)] ) - uses_descriptors = ( - searchspace.contains_mordred or searchspace.contains_rdkit - ) and (effective_dims >= 50) + uses_descriptors = searchspace.contains_fingerprint and effective_dims >= 50 # low D priors if effective_dims < 5: diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 8d55f1358..facac8c70 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -4,24 +4,15 @@ import ssl import tempfile import urllib.request -from functools import lru_cache from pathlib import Path -import numpy as np import pandas as pd from joblib import Memory from baybe._optional.chem import ( - Calculator, + BaseFingerprintTransformer, Chem, - GetMorganFingerprintAsBitVect, - RDLogger, - descriptors, ) -from baybe.utils.numerical import DTypeFloatNumpy - -_mordred_calculator = Calculator(descriptors) - # Caching _cachedir = os.environ.get( @@ -70,162 +61,30 @@ def name_to_smiles(name: str) -> str: return "" -@lru_cache(maxsize=None) -@_disk_cache -def _smiles_to_mordred_features(smiles: str) -> np.ndarray: - """Memory- and disk-cached computation of Mordred descriptors. - - Args: - smiles: SMILES string. - - Returns: - Mordred descriptors for the given smiles string. 
- """ - try: - return np.asarray( - _mordred_calculator(Chem.MolFromSmiles(smiles)).fill_missing() - ) - except Exception: - return np.full(len(_mordred_calculator.descriptors), np.nan) - - -def smiles_to_mordred_features( +def smiles_to_fingerprint_features( smiles_list: list[str], + fingerprint_encoder: BaseFingerprintTransformer, prefix: str = "", - dropna: bool = True, -) -> pd.DataFrame: - """Compute Mordred chemical descriptors for a list of SMILES strings. - - Args: - smiles_list: List of SMILES strings. - prefix: Name prefix for each descriptor - (e.g., nBase --> _nBase). - dropna: If ``True``, drops columns that contain NaNs. - - Returns: - Dataframe containing overlapping Mordred descriptors for each SMILES - string. - """ - features = [_smiles_to_mordred_features(smiles) for smiles in smiles_list] - descriptor_names = list(_mordred_calculator.descriptors) - columns = [prefix + "MORDRED_" + str(name) for name in descriptor_names] - dataframe = pd.DataFrame(data=features, columns=columns, dtype=DTypeFloatNumpy) - - if dropna: - dataframe = dataframe.dropna(axis=1) - - return dataframe - - -def smiles_to_molecules(smiles_list: list[str]) -> list[Chem.Mol]: - """Convert a given list of SMILES strings into corresponding Molecule objects. - - Args: - smiles_list: List of SMILES strings. - - Returns: - List of corresponding molecules. - - Raises: - ValueError: If the SMILES does not seem to be chemically valid. - """ - mols = [] - for smiles in smiles_list: - try: - mol = Chem.MolFromSmiles(smiles) - if mol is None: - raise ValueError() - mols.append(mol) - except Exception as ex: - raise ValueError( - f"The SMILES {smiles} does not seem to be chemically valid." - ) from ex - return mols - - -def smiles_to_rdkit_features( - smiles_list: list[str], prefix: str = "", dropna: bool = True ) -> pd.DataFrame: - """Compute RDKit chemical descriptors for a list of SMILES strings. + """Compute molecule fingerprints for a list of SMILES strings. 
Args: smiles_list: List of SMILES strings. + fingerprint_encoder: Object used to transform smiles to fingerprints prefix: Name prefix for each descriptor (e.g., nBase --> _nBase). - dropna: If ``True``, drops columns that contain NaNs. Returns: - Dataframe containing overlapping RDKit descriptors for each SMILES string. + Dataframe containing fingerprints for each SMILES string. """ - mols = smiles_to_molecules(smiles_list) - - res = [] - for mol in mols: - desc = { - prefix + "RDKIT_" + dname: DTypeFloatNumpy(func(mol)) - for dname, func in Chem.Descriptors.descList - } - res.append(desc) - - df = pd.DataFrame(res) - if dropna: - df = df.dropna(axis=1) - - return df - - -def smiles_to_fp_features( - smiles_list: list[str], - prefix: str = "", - dtype: type[int] | type[float] = int, - radius: int = 4, - n_bits: int = 1024, -) -> pd.DataFrame: - """Compute standard Morgan molecule fingerprints for a list of SMILES strings. - - Args: - smiles_list: List of SMILES strings. - prefix: Name prefix for each descriptor (e.g., nBase --> _nBase). - dtype: Specifies whether fingerprints will have int or float data type. - radius: Radius for the Morgan fingerprint. - n_bits:Number of bits for the Morgan fingerprint. - - Returns: - Dataframe containing Morgan fingerprints for each SMILES string. 
- """ - mols = smiles_to_molecules(smiles_list) - - res = [] - for mol in mols: - RDLogger.logger().setLevel(RDLogger.CRITICAL) - - fingerp = GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits).ToBitString() - fingerp = map(int, fingerp) - fpvec = np.array(list(fingerp)) - res.append( - {prefix + "FP_" + f"{k + 1}": dtype(bit) for k, bit in enumerate(fpvec)} - ) - - df = pd.DataFrame(res) + features = fingerprint_encoder.transform(smiles_list) + col_names = [ + prefix + "SKFP_" + f for f in fingerprint_encoder.get_feature_names_out() + ] + df = pd.DataFrame(features, columns=col_names) return df -def is_valid_smiles(smiles: str) -> bool: - """Test if a SMILES string is valid according to RDKit. - - Args: - smiles: SMILES string to be tested. - - Returns: - ``True`` if the provided SMILES is valid, ``False`` else. - """ - try: - mol = Chem.MolFromSmiles(smiles) - return mol is not None - except Exception: - return False - - def get_canonical_smiles(smiles: str) -> str: """Return the "canonical" representation of the given SMILES.""" try: diff --git a/docs/userguide/constraints.md b/docs/userguide/constraints.md index 5a636503c..d15d2482f 100644 --- a/docs/userguide/constraints.md +++ b/docs/userguide/constraints.md @@ -228,12 +228,12 @@ dict_solvents = {"Water": "O", "THF": "C1CCOC1", "Octanol": "CCCCCCCCO"} solvent_encoding1 = SubstanceParameter( name="Solvent_RDKIT_enc", data=dict_solvents, - encoding="RDKIT", + encoding="RDKitFingerprint", ) solvent_encoding2 = SubstanceParameter( name="Solvent_MORDRED_enc", data=dict_solvents, - encoding="MORDRED", + encoding="MordredFingerprint", ) DiscreteLinkedParametersConstraint( parameters=["Solvent_RDKIT_enc", "Solvent_MORDRED_enc"] diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index 377ac14ca..dfb871ed0 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -122,16 +122,14 @@ SubstanceParameter( "1-Octanol": "CCCCCCCCO", "Toluene": "CC1=CC=CC=C1", }, - 
encoding="MORDRED", # optional + encoding="MordredFingerprint", # optional decorrelate=0.7, # optional ) ``` -The ``encoding`` option defines what kind of descriptors are calculated: -* ``MORDRED``: 2D descriptors from the [Mordred package](https://mordred-descriptor.github.io/documentation/master/). - Since the original package is now unmaintained, baybe requires the community replacement [mordredcommunity](https://github.com/JacksonBurns/mordred-community) -* ``RDKIT``: 2D descriptors from the [RDKit package](https://www.rdkit.org/) -* ``MORGAN_FP``: Morgan fingerprints calculated with RDKit (1024 bits, radius 4) +The ``encoding`` option defines what kind of descriptors are calculated. +All descriptors are calculated using [scikit-fingerprints package](https://github.com/scikit-fingerprints/scikit-fingerprints/). +Any fingerprint class name from `scikit-fingerprints` can be used as an input parameter for chemical encoding. These calculations will typically result in 500 to 1500 numbers per molecule. To avoid detrimental effects on the surrogate model fit, we reduce the number of diff --git a/examples/Backtesting/full_initial_data.py b/examples/Backtesting/full_initial_data.py index dfb0257b5..b53321fa7 100644 --- a/examples/Backtesting/full_initial_data.py +++ b/examples/Backtesting/full_initial_data.py @@ -91,9 +91,13 @@ # Here, we create the parameter objects, the searchspace and the objective. 
-base = SubstanceParameter(name="Base", data=dict_base, encoding="MORDRED") -solvent = SubstanceParameter(name="Solvent", data=dict_solvent, encoding="MORDRED") -ligand = SubstanceParameter(name="Ligand", data=dict_ligand, encoding="MORDRED") +base = SubstanceParameter(name="Base", data=dict_base, encoding="DefaultFingerprint") +solvent = SubstanceParameter( + name="Solvent", data=dict_solvent, encoding="DefaultFingerprint" +) +ligand = SubstanceParameter( + name="Ligand", data=dict_ligand, encoding="DefaultFingerprint" +) temperature = NumericalDiscreteParameter( name="Temp_C", values=[90, 105, 120], tolerance=2 ) diff --git a/examples/Backtesting/full_lookup.py b/examples/Backtesting/full_lookup.py index 330c066a5..095a8fd4f 100644 --- a/examples/Backtesting/full_lookup.py +++ b/examples/Backtesting/full_lookup.py @@ -93,10 +93,10 @@ ### Constructing campaigns for the simulation loop # In this example, we create several campaigns. -# First let us create three campaigns that each use a different chemical encoding to +# First let us create two campaigns that each use a different chemical encoding to # treat substances. -substance_encodings = ["MORDRED", "RDKIT", "MORGAN_FP"] +substance_encodings = ["MordredFingerprint", "RDKitFingerprint"] scenarios = { encoding: Campaign( searchspace=SearchSpace.from_product( diff --git a/examples/Backtesting/impute_mode.py b/examples/Backtesting/impute_mode.py index 8e46d5a4d..acc99e870 100644 --- a/examples/Backtesting/impute_mode.py +++ b/examples/Backtesting/impute_mode.py @@ -82,9 +82,13 @@ # Here, we create the parameter objects, the searchspace and the objective. 
-solvent = SubstanceParameter(name="Solvent", data=dict_solvent, encoding="MORDRED") -base = SubstanceParameter(name="Base", data=dict_base, encoding="MORDRED") -ligand = SubstanceParameter(name="Ligand", data=dict_ligand, encoding="MORDRED") +solvent = SubstanceParameter( + name="Solvent", data=dict_solvent, encoding="DefaultFingerprint" +) +base = SubstanceParameter(name="Base", data=dict_base, encoding="DefaultFingerprint") +ligand = SubstanceParameter( + name="Ligand", data=dict_ligand, encoding="DefaultFingerprint" +) temperature = NumericalDiscreteParameter( name="Temp_C", values=[90, 105, 120], tolerance=2 ) diff --git a/examples/Basics/campaign.py b/examples/Basics/campaign.py index 50e2b912c..a93af2e6f 100644 --- a/examples/Basics/campaign.py +++ b/examples/Basics/campaign.py @@ -40,13 +40,15 @@ } # We define the chemical substances parameters using the dictionaries defined previously. -# Here, we use `"MORDRED"` encoding, but others are available. +# Here, we use `"DefaultFingerprint"` encoding, but others are available. # We proceed to define numerical discrete parameters `temperature` and `concentration` # and create the search space. 
-solvent = SubstanceParameter("Solvent", data=dict_solvent, encoding="MORDRED") -base = SubstanceParameter("Base", data=dict_base, encoding="MORDRED") -ligand = SubstanceParameter("Ligand", data=dict_ligand, encoding="MORDRED") +solvent = SubstanceParameter( + "Solvent", data=dict_solvent, encoding="DefaultFingerprint" +) +base = SubstanceParameter("Base", data=dict_base, encoding="DefaultFingerprint") +ligand = SubstanceParameter("Ligand", data=dict_ligand, encoding="DefaultFingerprint") temperature = NumericalDiscreteParameter( "Temperature", values=[90, 105, 120], tolerance=2 diff --git a/examples/Basics/recommenders.py b/examples/Basics/recommenders.py index 3d541a1ad..2570de79a 100644 --- a/examples/Basics/recommenders.py +++ b/examples/Basics/recommenders.py @@ -140,9 +140,11 @@ "(t-Bu)PhCPhos": r"CN(C)C1=CC=CC(N(C)C)=C1C2=CC=CC=C2P(C(C)(C)C)C3=CC=CC=C3", } -solvent = SubstanceParameter("Solvent", data=dict_solvent, encoding="MORDRED") -base = SubstanceParameter("Base", data=dict_base, encoding="MORDRED") -ligand = SubstanceParameter("Ligand", data=dict_ligand, encoding="MORDRED") +solvent = SubstanceParameter( + "Solvent", data=dict_solvent, encoding="DefaultFingerprint" +) +base = SubstanceParameter("Base", data=dict_base, encoding="DefaultFingerprint") +ligand = SubstanceParameter("Ligand", data=dict_ligand, encoding="DefaultFingerprint") temperature = NumericalDiscreteParameter( "Temperature", values=[90, 105, 120], tolerance=2 ) diff --git a/examples/Constraints_Discrete/custom_constraints.py b/examples/Constraints_Discrete/custom_constraints.py index 866688336..b279042f6 100644 --- a/examples/Constraints_Discrete/custom_constraints.py +++ b/examples/Constraints_Discrete/custom_constraints.py @@ -43,7 +43,7 @@ "c6": "c1ccccc1", "C6": "CCCCCC", } -solvent = SubstanceParameter("Solvent", data=dict_solvent, encoding="RDKIT") +solvent = SubstanceParameter("Solvent", data=dict_solvent, encoding="RDKitFingerprint") speed = CategoricalParameter( "Speed", 
values=["very slow", "slow", "normal", "fast", "very fast"], encoding="INT" ) diff --git a/examples/Constraints_Discrete/dependency_constraints.py b/examples/Constraints_Discrete/dependency_constraints.py index 737bedc33..ab06375af 100644 --- a/examples/Constraints_Discrete/dependency_constraints.py +++ b/examples/Constraints_Discrete/dependency_constraints.py @@ -34,7 +34,9 @@ "water": "O", "C1": "C", } -solvent = SubstanceParameter(name="Solv", data=dict_solvent, encoding="MORDRED") +solvent = SubstanceParameter( + name="Solv", data=dict_solvent, encoding="DefaultFingerprint" +) switch1 = CategoricalParameter(name="Switch1", values=["on", "off"]) switch2 = CategoricalParameter(name="Switch2", values=["left", "right"]) fraction1 = NumericalDiscreteParameter( diff --git a/examples/Constraints_Discrete/exclusion_constraints.py b/examples/Constraints_Discrete/exclusion_constraints.py index eb81ef2fc..2758786ea 100644 --- a/examples/Constraints_Discrete/exclusion_constraints.py +++ b/examples/Constraints_Discrete/exclusion_constraints.py @@ -40,7 +40,9 @@ "c6": "c1ccccc1", "C6": "CCCCCC", } -solvent = SubstanceParameter(name="Solv", data=dict_solvent, encoding="RDKIT") +solvent = SubstanceParameter( + name="Solv", data=dict_solvent, encoding="RDKitFingerprint" +) speed = CategoricalParameter( name="Speed", values=["very slow", "slow", "normal", "fast", "very fast"], diff --git a/examples/Constraints_Discrete/mixture_constraints.py b/examples/Constraints_Discrete/mixture_constraints.py index 7848043cc..a87bc4534 100644 --- a/examples/Constraints_Discrete/mixture_constraints.py +++ b/examples/Constraints_Discrete/mixture_constraints.py @@ -46,9 +46,15 @@ "C2": "CC", "C3": "CCC", } -solvent1 = SubstanceParameter(name="Solv1", data=dict_solvents, encoding="MORDRED") -solvent2 = SubstanceParameter(name="Solv2", data=dict_solvents, encoding="MORDRED") -solvent3 = SubstanceParameter(name="Solv3", data=dict_solvents, encoding="MORDRED") +solvent1 = SubstanceParameter( + 
name="Solv1", data=dict_solvents, encoding="DefaultFingerprint" +) +solvent2 = SubstanceParameter( + name="Solv2", data=dict_solvents, encoding="DefaultFingerprint" +) +solvent3 = SubstanceParameter( + name="Solv3", data=dict_solvents, encoding="DefaultFingerprint" +) # Parameters for representing the fraction. diff --git a/examples/Constraints_Discrete/prodsum_constraints.py b/examples/Constraints_Discrete/prodsum_constraints.py index 2d547e61b..394f5ae3e 100644 --- a/examples/Constraints_Discrete/prodsum_constraints.py +++ b/examples/Constraints_Discrete/prodsum_constraints.py @@ -38,7 +38,9 @@ "C1": "C", "C2": "CC", } -solvent = SubstanceParameter(name="Solvent", data=dict_solvent, encoding="RDKIT") +solvent = SubstanceParameter( + name="Solvent", data=dict_solvent, encoding="RDKitFingerprint" +) speed = CategoricalParameter( name="Speed", values=["slow", "normal", "fast"], encoding="INT" ) diff --git a/examples/Custom_Hooks/campaign_stopping.py b/examples/Custom_Hooks/campaign_stopping.py index 6adb9f07a..0f51ac6ee 100644 --- a/examples/Custom_Hooks/campaign_stopping.py +++ b/examples/Custom_Hooks/campaign_stopping.py @@ -92,9 +92,11 @@ } parameters = [ - SubstanceParameter(name="Solvent", data=dict_solvent, encoding="MORDRED"), - SubstanceParameter(name="Base", data=dict_base, encoding="MORDRED"), - SubstanceParameter(name="Ligand", data=dict_ligand, encoding="MORDRED"), + SubstanceParameter( + name="Solvent", data=dict_solvent, encoding="DefaultFingerprint" + ), + SubstanceParameter(name="Base", data=dict_base, encoding="DefaultFingerprint"), + SubstanceParameter(name="Ligand", data=dict_ligand, encoding="DefaultFingerprint"), NumericalDiscreteParameter(name="Temp_C", values=[90, 105, 120], tolerance=2), NumericalDiscreteParameter(name="Concentration", values=[0.057, 0.1, 0.153]), ] diff --git a/examples/Custom_Surrogates/surrogate_params.py b/examples/Custom_Surrogates/surrogate_params.py index f72cc0b68..11b9179c4 100644 --- 
a/examples/Custom_Surrogates/surrogate_params.py +++ b/examples/Custom_Surrogates/surrogate_params.py @@ -53,7 +53,7 @@ "Solvent C": "O", "Solvent D": "CS(=O)C", }, - encoding="MORDRED", + encoding="DefaultFingerprint", ), ] diff --git a/examples/Serialization/create_from_config.py b/examples/Serialization/create_from_config.py index f11b336ea..c15f03499 100644 --- a/examples/Serialization/create_from_config.py +++ b/examples/Serialization/create_from_config.py @@ -53,7 +53,7 @@ "Solvent D": "CCOCCOCCN" }, "decorrelate": true, - "encoding": "MORDRED" + "encoding": "DefaultFingerprint" } ], "constraints": [] diff --git a/examples/Serialization/validate_config.py b/examples/Serialization/validate_config.py index 52e8bd311..4919cdc8c 100644 --- a/examples/Serialization/validate_config.py +++ b/examples/Serialization/validate_config.py @@ -52,7 +52,7 @@ "Solvent D": "CCOCCOCCN" }, "decorrelate": true, - "encoding": "MORDRED" + "encoding": "DefaultFingerprint" } ], "constraints": [] @@ -123,7 +123,7 @@ "Solvent D": "CCOCCOCCN" }, "decorrelate": true, - "encoding": "MORDRED" + "encoding": "DefaultFingerprint" } ], "constraints": [] diff --git a/mypy.ini b/mypy.ini index dfe237e75..8380550d5 100644 --- a/mypy.ini +++ b/mypy.ini @@ -24,9 +24,6 @@ ignore_missing_imports = True [mypy-joblib] ignore_missing_imports = True -[mypy-mordred] -ignore_missing_imports = True - [mypy-mpl_toolkits.mplot3d] ignore_missing_imports = True @@ -57,6 +54,9 @@ ignore_missing_imports = True [mypy-rdkit.Chem.rdMolDescriptors] ignore_missing_imports = True +[mypy-skfp] +ignore_missing_imports = True + [mypy-xyzpy] ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index e681ebe8e..af63b2064 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ Issues = "https://github.com/emdgroup/baybe/issues/" [project.optional-dependencies] chem = [ "rdkit>=2022.3.4", - "mordredcommunity>=1.2.0", + "scikit-fingerprints>=1.7.0", ] onnx = [ diff --git a/tests/conftest.py 
b/tests/conftest.py index 559396907..8dd1dfa2e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -323,7 +323,7 @@ def fixture_parameters( SubstanceParameter( name=f"Substance_1_{encoding}", data=mock_substances, - encoding=encoding, + encoding=encoding.name, ) for encoding in SubstanceEncoding ], @@ -731,7 +731,7 @@ def fixture_default_config(): "name": "Solvent", "data": {"sol1":"C", "sol2":"CC", "sol3":"CCC"}, "decorrelate": true, - "encoding": "MORDRED" + "encoding": "DefaultFingerprint" },""" if CHEM_INSTALLED else """ diff --git a/tests/hypothesis_strategies/parameters.py b/tests/hypothesis_strategies/parameters.py index 10b067409..6608b6eef 100644 --- a/tests/hypothesis_strategies/parameters.py +++ b/tests/hypothesis_strategies/parameters.py @@ -142,7 +142,7 @@ def substance_parameters(draw: st.DrawFn): decorrelate = draw(decorrelations) encoding = draw(st.sampled_from(SubstanceEncoding)) return SubstanceParameter( - name=name, data=data, decorrelate=decorrelate, encoding=encoding + name=name, data=data, decorrelate=decorrelate, encoding=encoding.name ) diff --git a/tests/simulate_telemetry.py b/tests/simulate_telemetry.py index ce26db3a8..39d09bad5 100644 --- a/tests/simulate_telemetry.py +++ b/tests/simulate_telemetry.py @@ -55,9 +55,11 @@ } parameters = [ - SubstanceParameter(name="Solvent", data=dict_solvent, encoding="MORDRED"), - SubstanceParameter(name="Base", data=dict_base, encoding="MORDRED"), - SubstanceParameter(name="Ligand", data=dict_ligand, encoding="MORDRED"), + SubstanceParameter( + name="Solvent", data=dict_solvent, encoding="DefaultFingerprint" + ), + SubstanceParameter(name="Base", data=dict_base, encoding="DefaultFingerprint"), + SubstanceParameter(name="Ligand", data=dict_ligand, encoding="DefaultFingerprint"), NumericalDiscreteParameter(name="Temp_C", values=[90, 105, 120], tolerance=2), NumericalDiscreteParameter( name="Concentration", values=[0.057, 0.1, 0.153], tolerance=0.005 diff --git a/tests/test_substance_parameter.py 
b/tests/test_substance_parameter.py index 52bc69a0a..6fbdcca98 100644 --- a/tests/test_substance_parameter.py +++ b/tests/test_substance_parameter.py @@ -13,7 +13,7 @@ ) @pytest.mark.parametrize( "parameter_names", - [["Categorical_1", f"Substance_1_{enc}"] for enc in SubstanceEncoding], + [["Categorical_1", f"Substance_1_{enc.name}"] for enc in SubstanceEncoding], ids=[enc.name for enc in SubstanceEncoding], ) def test_run_iterations(campaign, batch_size, n_iterations): From d6f56d289ec33e595b58a1252d76ea8564d264ba Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:07:25 +0200 Subject: [PATCH 02/87] add myself to contributors --- CONTRIBUTORS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 1e13f036d..618c47c4f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -24,3 +24,5 @@ Human readable output for search spaces - Di Jin (Merck Life Science KGaA, Darmstadt, Germany):\ Cardinality constraints +- Karin Hrovatin (Merck KGaA, Darmstadt, Germany):\ + Update fingerprint encodings From d9865128330748ef02aeea6fae7d7add5d732de7 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 4 Sep 2024 07:46:15 +0200 Subject: [PATCH 03/87] fix mypy ingnore imports --- mypy.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypy.ini b/mypy.ini index 8380550d5..34e7c2190 100644 --- a/mypy.ini +++ b/mypy.ini @@ -54,7 +54,7 @@ ignore_missing_imports = True [mypy-rdkit.Chem.rdMolDescriptors] ignore_missing_imports = True -[mypy-skfp] +[mypy-skfp.*] ignore_missing_imports = True [mypy-xyzpy] From ec293b22c1f07505794dfb2ff1667a4c8cbc523e Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 4 Sep 2024 07:46:36 +0200 Subject: [PATCH 04/87] attempt to fix enum, not resolved --- baybe/parameters/enum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 83b52571e..38955ea94 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -49,7 +49,7 @@ class CategoricalEncoding(ParameterEncoding): } SubstanceEncoding = ParameterEncoding( - "SubstanceEncoding", dict(zip(AVAILABLE_SKFP_FP.keys(), AVAILABLE_SKFP_FP.keys())) + value="SubstanceEncoding", names={k: k for k in AVAILABLE_SKFP_FP.keys()} ) """Available encodings for substance parameters.""" From 630d8963c3b71ff5977783c3f2886683ebd1ced6 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 6 Sep 2024 06:27:19 +0200 Subject: [PATCH 05/87] Update CONTRIBUTORS.md Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 618c47c4f..f0f45558f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -25,4 +25,4 @@ - Di Jin (Merck Life Science KGaA, Darmstadt, Germany):\ Cardinality constraints - Karin Hrovatin (Merck KGaA, Darmstadt, Germany):\ - Update fingerprint encodings + Add scikit-fingerprints support From bb19b0ede7f3a9b7eef868b1d07c963e673646dd Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 6 Sep 2024 14:36:44 +0200 Subject: [PATCH 06/87] review 1 --- README.md | 2 +- baybe/_optional/info.py | 3 +- baybe/parameters/enum.py | 82 +++++++++++-------- baybe/parameters/substance.py | 32 ++++++-- baybe/searchspace/core.py | 11 +-- .../gaussian_process/presets/edbo.py | 8 +- baybe/utils/chemistry.py | 62 ++++++++++++-- docs/userguide/constraints.md | 4 +- docs/userguide/parameters.md | 6 +- examples/Backtesting/full_initial_data.py | 10 +-- examples/Backtesting/full_lookup.py | 2 +- examples/Backtesting/impute_mode.py | 10 +-- examples/Basics/campaign.py | 10 +-- examples/Basics/recommenders.py | 8 +- .../custom_constraints.py | 2 +- 
.../dependency_constraints.py | 4 +- .../exclusion_constraints.py | 4 +- .../mixture_constraints.py | 12 +-- .../prodsum_constraints.py | 4 +- examples/Custom_Hooks/campaign_stopping.py | 8 +- .../Custom_Surrogates/surrogate_params.py | 2 +- examples/Serialization/create_from_config.py | 2 +- examples/Serialization/validate_config.py | 4 +- mypy.ini | 3 - pyproject.toml | 1 - tests/conftest.py | 4 +- tests/hypothesis_strategies/parameters.py | 2 +- tests/simulate_telemetry.py | 8 +- 28 files changed, 171 insertions(+), 139 deletions(-) diff --git a/README.md b/README.md index 6138d0082..8c4d223fd 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ parameters = [ "Solvent C": "O", "Solvent D": "CS(=O)C", }, - encoding="DefaultFingerprint", # chemical encoding via scikit-fingerprints + encoding="MORDRED", # chemical encoding via scikit-fingerprints ), ] ``` diff --git a/baybe/_optional/info.py b/baybe/_optional/info.py index a91403bdc..32907fa68 100644 --- a/baybe/_optional/info.py +++ b/baybe/_optional/info.py @@ -29,7 +29,6 @@ def exclude_sys_path(path: str, /): # noqa: DOC402, DOC404 POLARS_INSTALLED = find_spec("polars") is not None PRE_COMMIT_INSTALLED = find_spec("pre_commit") is not None PYDOCLINT_INSTALLED = find_spec("pydoclint") is not None - RDKIT_INSTALLED = find_spec("rdkit") is not None RUFF_INSTALLED = find_spec("ruff") is not None SKFP_INSTALLED = find_spec("skfp") is not None # scikit-fingerprints STREAMLIT_INSTALLED = find_spec("streamlit") is not None @@ -44,7 +43,7 @@ def exclude_sys_path(path: str, /): # noqa: DOC402, DOC404 TYPOS_INSTALLED = True # Package combinations -CHEM_INSTALLED = RDKIT_INSTALLED and SKFP_INSTALLED +CHEM_INSTALLED = SKFP_INSTALLED LINT_INSTALLED = all( ( FLAKE8_INSTALLED, diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 38955ea94..6bdf41d8b 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -2,8 +2,6 @@ from enum import Enum -from baybe._optional.info import CHEM_INSTALLED 
- class ParameterEncoding(Enum): """Generic base class for all parameter encodings.""" @@ -19,39 +17,53 @@ class CategoricalEncoding(ParameterEncoding): """Integer encoding.""" -# TODO Ideally, this should be turned into a class that can: -# - return default when CHEM not installed -# - check if enum is fingerprint -PARAM_SUFFIX_FINGERPRINT = "Fingerprint" - -if CHEM_INSTALLED: - import inspect - - from baybe._optional.chem import BaseFingerprintTransformer, skfp_fingerprints - - AVAILABLE_SKFP_FP = dict( - inspect.getmembers( - skfp_fingerprints, - lambda x: inspect.isclass(x) and issubclass(x, BaseFingerprintTransformer), - ) - ) - AVAILABLE_SKFP_FP["Default"] = AVAILABLE_SKFP_FP["MordredFingerprint"] -else: - AVAILABLE_SKFP_FP = {"Default": None} - -AVAILABLE_SKFP_FP = { - ( - name - if name.endswith(PARAM_SUFFIX_FINGERPRINT) - else name + PARAM_SUFFIX_FINGERPRINT - ): fp - for name, fp in AVAILABLE_SKFP_FP.items() -} - -SubstanceEncoding = ParameterEncoding( - value="SubstanceEncoding", names={k: k for k in AVAILABLE_SKFP_FP.keys()} -) -"""Available encodings for substance parameters.""" +class SubstanceEncoding(ParameterEncoding): + """Available encodings for substance parameters.""" + + AtomPairFingerprint = "ATOMPAIR" + AutocorrFingerprint = "AUTOCORR" + AvalonFingerprint = "AVALON" + E3FPFingerprint = "E3FP" + ECFPFingerprint = "ECFP" + ERGFingerprint = "ERG" + EStateFingerprint = "ESTATE" + FunctionalGroupsFingerprint = "FUNCTIONALGROUPS" + GETAWAYFingerprint = "GETAWAY" + GhoseCrippenFingerprint = "GHOSECRIPPEN" + KlekotaRothFingerprint = "KLEKOTAROTH" + LaggnerFingerprint = "LAGGNER" + LayeredFingerprint = "LAYERED" + LingoFingerprint = "LINGO" + MACCSFingerprint = "MACCS" + MAPFingerprint = "MAP" + MHFPFingerprint = "MHFP" + MORSEFingerprint = "MORSE" + MQNsFingerprint = "MQNS" + MordredFingerprint = "MORDRED" + PatternFingerprint = "PATTERN" + PharmacophoreFingerprint = "PHARMACOPHORE" + PhysiochemicalPropertiesFingerprint = "PHYSIOCHEMICALPROPERTIES" 
+ PubChemFingerprint = "PUBCHEM" + RDFFingerprint = "RDF" + RDKit2DDescriptorsFingerprint = "RDKIT2DDESCRIPTORS" + RDKitFingerprint = "RDKIT" + SECFPFingerprint = "SECFP" + TopologicalTorsionFingerprint = "TOPOLOGICALTORSION" + USRCATFingerprint = "USRCAT" + USRFingerprint = "USR" + WHIMFingerprint = "WHIM" + + @classmethod + def _missing_(cls, value): + """Backward compatibility of enum values. + + Enable backwards compatibility of value names that + differ between SKFP and previous version. + """ + if value == "MORGAN_FP": + return cls.ECFPFingerprint + else: + return super()._missing_(value) class CustomEncoding(ParameterEncoding): diff --git a/baybe/parameters/substance.py b/baybe/parameters/substance.py index 1c8c3a142..32beb774b 100644 --- a/baybe/parameters/substance.py +++ b/baybe/parameters/substance.py @@ -8,7 +8,7 @@ from attrs.validators import and_, deep_mapping, instance_of, min_len from baybe.parameters.base import DiscreteParameter -from baybe.parameters.enum import AVAILABLE_SKFP_FP, SubstanceEncoding +from baybe.parameters.enum import SubstanceEncoding from baybe.parameters.validation import validate_decorrelation from baybe.utils.basic import group_duplicate_values from baybe.utils.dataframe import df_drop_single_value_columns, df_uncorrelated_features @@ -18,7 +18,6 @@ except NameError: from exceptiongroup import ExceptionGroup - Smiles = str """Type alias for SMILES strings.""" @@ -58,10 +57,28 @@ class SubstanceParameter(DiscreteParameter): """ encoding: SubstanceEncoding = field( - default=SubstanceEncoding.DefaultFingerprint, converter=SubstanceEncoding + default=SubstanceEncoding.MordredFingerprint, + converter=lambda x: ( + # Passed enum + x + if isinstance(x, SubstanceEncoding) + # Passed enum name + else ( + SubstanceEncoding[x] + if x in SubstanceEncoding.__members__ + # Passed enum value + else SubstanceEncoding(x) + ) + ), ) # See base class. 
+ kwargs_fingerprint: dict = field(default={}) + """Kwargs for fingerprint generator""" + + kwargs_conformer: dict = field(default={}) + """Kwargs for conformer generator""" + @data.validator def _validate_substance_data( # noqa: DOC101, DOC103 self, _: Any, data: dict[str, Smiles] @@ -120,19 +137,16 @@ def comp_df(self) -> pd.DataFrame: # noqa: D102 # Get the raw descriptors comp_df = chemistry.smiles_to_fingerprint_features( vals, - fingerprint_encoder=AVAILABLE_SKFP_FP[self.encoding.name](), + fingerprint_name=self.encoding.name, prefix=pref, + kwargs_conformer=self.kwargs_conformer, + kwargs_fingerprint=self.kwargs_fingerprint, ) # Drop NaN and constant columns comp_df = comp_df.loc[:, ~comp_df.isna().any(axis=0)] comp_df = df_drop_single_value_columns(comp_df) - # If there are bool columns, convert them to int (possible for Mordred) - # TODO should this be removed as with skfp all Mordred columns are float32? - bool_cols = comp_df.select_dtypes(bool).columns - comp_df[bool_cols] = comp_df[bool_cols].astype(int) - # Label the rows with the molecule names comp_df.index = pd.Index(self.values) diff --git a/baybe/searchspace/core.py b/baybe/searchspace/core.py index 1caec0ac5..e153e0b22 100644 --- a/baybe/searchspace/core.py +++ b/baybe/searchspace/core.py @@ -16,7 +16,7 @@ from baybe.constraints.base import Constraint from baybe.parameters import TaskParameter from baybe.parameters.base import Parameter -from baybe.parameters.enum import PARAM_SUFFIX_FINGERPRINT +from baybe.parameters.enum import SubstanceEncoding from baybe.searchspace.continuous import SubspaceContinuous from baybe.searchspace.discrete import ( MemorySize, @@ -227,18 +227,11 @@ def type(self) -> SearchSpaceType: return SearchSpaceType.HYBRID raise RuntimeError("This line should be impossible to reach.") - # TODO replaces previously used contains_mordred and contains_rdkit - # which are both used likewise in edbo.py - - # not sure if this can be extrapolated to all fingerprints by using single 
property @property def contains_fingerprint(self) -> bool: """Indicates if any of the discrete parameters uses ``Fingerprint`` encoding.""" return any( - ( - False - if p.encoding is None - else p.encoding.name.endswith(PARAM_SUFFIX_FINGERPRINT) - ) + p.encoding in SubstanceEncoding if p.encoding is not None else False for p in self.discrete.parameters ) diff --git a/baybe/surrogates/gaussian_process/presets/edbo.py b/baybe/surrogates/gaussian_process/presets/edbo.py index f66b7d0ef..ee851f9e4 100644 --- a/baybe/surrogates/gaussian_process/presets/edbo.py +++ b/baybe/surrogates/gaussian_process/presets/edbo.py @@ -36,9 +36,7 @@ def __call__( # noqa: D102 [p for p in searchspace.parameters if isinstance(p, TaskParameter)] ) - # TODO rename this to fingerprint after decided - # that it can be used for all fingerprints - mordred = searchspace.contains_fingerprint and (effective_dims >= 50) + uses_descriptors = searchspace.contains_fingerprint and (effective_dims >= 50) # low D priors if effective_dims < 5: @@ -48,14 +46,14 @@ def __call__( # noqa: D102 outputscale_initial_value = 8.0 # DFT optimized priors - elif mordred and effective_dims < 100: + elif uses_descriptors and effective_dims < 100: lengthscale_prior = GammaPrior(2.0, 0.2) lengthscale_initial_value = 5.0 outputscale_prior = GammaPrior(5.0, 0.5) outputscale_initial_value = 8.0 # Mordred optimized priors - elif mordred: + elif uses_descriptors: lengthscale_prior = GammaPrior(2.0, 0.1) lengthscale_initial_value = 10.0 outputscale_prior = GammaPrior(2.0, 0.1) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index facac8c70..07e160866 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -4,15 +4,20 @@ import ssl import tempfile import urllib.request +from functools import lru_cache from pathlib import Path +import numpy as np import pandas as pd from joblib import Memory +from skfp.preprocessing import ConformerGenerator, MolFromSmilesTransformer from baybe._optional.chem 
import ( BaseFingerprintTransformer, Chem, + skfp_fingerprints, ) +from baybe.utils.numerical import DTypeFloatNumpy # Caching _cachedir = os.environ.get( @@ -61,26 +66,67 @@ def name_to_smiles(name: str) -> str: return "" +@lru_cache(maxsize=None) +@_disk_cache +def _smiles_str_to_fingerprint_features( + fingerprint_encoder: BaseFingerprintTransformer, + smiles_str: str, +) -> np.ndarray: + """Compute molecular fingerprint for a single SMILES string. + + Args: + fingerprint_encoder: Instance of Fingerprint class used to + transform smiles string to fingerprint + smiles_str: Smiles string + + Returns: + Array containing fingerprint for SMILES string. + """ + return fingerprint_encoder.transform([smiles_str]) + + def smiles_to_fingerprint_features( smiles_list: list[str], - fingerprint_encoder: BaseFingerprintTransformer, + fingerprint_name: str, prefix: str = "", + kwargs_conformer: dict | None = None, + kwargs_fingerprint: dict | None = None, ) -> pd.DataFrame: - """Compute molecule fingerprints for a list of SMILES strings. + """Compute molecular fingerprints for a list of SMILES strings. Args: smiles_list: List of SMILES strings. - fingerprint_encoder: Object used to transform smiles to fingerprints + fingerprint_name: Name of Fingerprint class used to + transform smiles to fingerprints prefix: Name prefix for each descriptor (e.g., nBase --> _nBase). + kwargs_conformer: kwargs for ConformerGenerator + kwargs_fingerprint: kwargs for ConformerGenerator Returns: Dataframe containing fingerprints for each SMILES string. 
""" - features = fingerprint_encoder.transform(smiles_list) - col_names = [ - prefix + "SKFP_" + f for f in fingerprint_encoder.get_feature_names_out() - ] - df = pd.DataFrame(features, columns=col_names) + kwargs_fingerprint = {} if kwargs_fingerprint is None else kwargs_fingerprint + fingerprint_encoder = getattr(skfp_fingerprints, fingerprint_name)( + **kwargs_fingerprint + ) + + if fingerprint_encoder.requires_conformers: + kwargs_conformer = {} if kwargs_conformer is None else kwargs_conformer + smiles_list = ConformerGenerator(**kwargs_conformer).transform( + MolFromSmilesTransformer().transform(smiles_list) + ) + + features = np.concatenate( + [ + _smiles_str_to_fingerprint_features( + fingerprint_encoder=fingerprint_encoder, smiles_str=smiles_str + ) + for smiles_str in smiles_list + ] + ) + name = f"skfp{fingerprint_encoder.__class__.__name__.replace('Fingerprint', '')}_" + col_names = [prefix + name + f for f in fingerprint_encoder.get_feature_names_out()] + df = pd.DataFrame(features, columns=col_names, dtype=DTypeFloatNumpy) return df diff --git a/docs/userguide/constraints.md b/docs/userguide/constraints.md index d15d2482f..5a636503c 100644 --- a/docs/userguide/constraints.md +++ b/docs/userguide/constraints.md @@ -228,12 +228,12 @@ dict_solvents = {"Water": "O", "THF": "C1CCOC1", "Octanol": "CCCCCCCCO"} solvent_encoding1 = SubstanceParameter( name="Solvent_RDKIT_enc", data=dict_solvents, - encoding="RDKitFingerprint", + encoding="RDKIT", ) solvent_encoding2 = SubstanceParameter( name="Solvent_MORDRED_enc", data=dict_solvents, - encoding="MordredFingerprint", + encoding="MORDRED", ) DiscreteLinkedParametersConstraint( parameters=["Solvent_RDKIT_enc", "Solvent_MORDRED_enc"] diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index dfb871ed0..ec3f36aae 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -122,14 +122,16 @@ SubstanceParameter( "1-Octanol": "CCCCCCCCO", "Toluene": "CC1=CC=CC=C1", }, - 
encoding="MordredFingerprint", # optional + encoding="MORDRED", # optional decorrelate=0.7, # optional ) ``` The ``encoding`` option defines what kind of descriptors are calculated. All descriptors are calculated using [scikit-fingerprints package](https://github.com/scikit-fingerprints/scikit-fingerprints/). -Any fingerprint class name from `scikit-fingerprints` can be used as an input parameter for chemical encoding. +Any fingerprint class from `scikit-fingerprints` can be used as an input parameter for chemical encoding. +The fingerprint class names should be passed in all upper case and without the `Fingeprint` suffix, +e.g. use alias `MORDRED` for `MordredFingerprint` class. These calculations will typically result in 500 to 1500 numbers per molecule. To avoid detrimental effects on the surrogate model fit, we reduce the number of diff --git a/examples/Backtesting/full_initial_data.py b/examples/Backtesting/full_initial_data.py index b53321fa7..dfb0257b5 100644 --- a/examples/Backtesting/full_initial_data.py +++ b/examples/Backtesting/full_initial_data.py @@ -91,13 +91,9 @@ # Here, we create the parameter objects, the searchspace and the objective. 
-base = SubstanceParameter(name="Base", data=dict_base, encoding="DefaultFingerprint") -solvent = SubstanceParameter( - name="Solvent", data=dict_solvent, encoding="DefaultFingerprint" -) -ligand = SubstanceParameter( - name="Ligand", data=dict_ligand, encoding="DefaultFingerprint" -) +base = SubstanceParameter(name="Base", data=dict_base, encoding="MORDRED") +solvent = SubstanceParameter(name="Solvent", data=dict_solvent, encoding="MORDRED") +ligand = SubstanceParameter(name="Ligand", data=dict_ligand, encoding="MORDRED") temperature = NumericalDiscreteParameter( name="Temp_C", values=[90, 105, 120], tolerance=2 ) diff --git a/examples/Backtesting/full_lookup.py b/examples/Backtesting/full_lookup.py index 095a8fd4f..ffd2b6ba4 100644 --- a/examples/Backtesting/full_lookup.py +++ b/examples/Backtesting/full_lookup.py @@ -96,7 +96,7 @@ # First let us create two campaigns that each use a different chemical encoding to # treat substances. -substance_encodings = ["MordredFingerprint", "RDKitFingerprint"] +substance_encodings = ["MORDRED", "RDKIT", "ECFP"] scenarios = { encoding: Campaign( searchspace=SearchSpace.from_product( diff --git a/examples/Backtesting/impute_mode.py b/examples/Backtesting/impute_mode.py index acc99e870..8e46d5a4d 100644 --- a/examples/Backtesting/impute_mode.py +++ b/examples/Backtesting/impute_mode.py @@ -82,13 +82,9 @@ # Here, we create the parameter objects, the searchspace and the objective. 
-solvent = SubstanceParameter( - name="Solvent", data=dict_solvent, encoding="DefaultFingerprint" -) -base = SubstanceParameter(name="Base", data=dict_base, encoding="DefaultFingerprint") -ligand = SubstanceParameter( - name="Ligand", data=dict_ligand, encoding="DefaultFingerprint" -) +solvent = SubstanceParameter(name="Solvent", data=dict_solvent, encoding="MORDRED") +base = SubstanceParameter(name="Base", data=dict_base, encoding="MORDRED") +ligand = SubstanceParameter(name="Ligand", data=dict_ligand, encoding="MORDRED") temperature = NumericalDiscreteParameter( name="Temp_C", values=[90, 105, 120], tolerance=2 ) diff --git a/examples/Basics/campaign.py b/examples/Basics/campaign.py index a93af2e6f..50e2b912c 100644 --- a/examples/Basics/campaign.py +++ b/examples/Basics/campaign.py @@ -40,15 +40,13 @@ } # We define the chemical substances parameters using the dictionaries defined previously. -# Here, we use `"DefaultFingerprint"` encoding, but others are available. +# Here, we use `"MORDRED"` encoding, but others are available. # We proceed to define numerical discrete parameters `temperature` and `concentration` # and create the search space. 
-solvent = SubstanceParameter( - "Solvent", data=dict_solvent, encoding="DefaultFingerprint" -) -base = SubstanceParameter("Base", data=dict_base, encoding="DefaultFingerprint") -ligand = SubstanceParameter("Ligand", data=dict_ligand, encoding="DefaultFingerprint") +solvent = SubstanceParameter("Solvent", data=dict_solvent, encoding="MORDRED") +base = SubstanceParameter("Base", data=dict_base, encoding="MORDRED") +ligand = SubstanceParameter("Ligand", data=dict_ligand, encoding="MORDRED") temperature = NumericalDiscreteParameter( "Temperature", values=[90, 105, 120], tolerance=2 diff --git a/examples/Basics/recommenders.py b/examples/Basics/recommenders.py index 2570de79a..3d541a1ad 100644 --- a/examples/Basics/recommenders.py +++ b/examples/Basics/recommenders.py @@ -140,11 +140,9 @@ "(t-Bu)PhCPhos": r"CN(C)C1=CC=CC(N(C)C)=C1C2=CC=CC=C2P(C(C)(C)C)C3=CC=CC=C3", } -solvent = SubstanceParameter( - "Solvent", data=dict_solvent, encoding="DefaultFingerprint" -) -base = SubstanceParameter("Base", data=dict_base, encoding="DefaultFingerprint") -ligand = SubstanceParameter("Ligand", data=dict_ligand, encoding="DefaultFingerprint") +solvent = SubstanceParameter("Solvent", data=dict_solvent, encoding="MORDRED") +base = SubstanceParameter("Base", data=dict_base, encoding="MORDRED") +ligand = SubstanceParameter("Ligand", data=dict_ligand, encoding="MORDRED") temperature = NumericalDiscreteParameter( "Temperature", values=[90, 105, 120], tolerance=2 ) diff --git a/examples/Constraints_Discrete/custom_constraints.py b/examples/Constraints_Discrete/custom_constraints.py index b279042f6..866688336 100644 --- a/examples/Constraints_Discrete/custom_constraints.py +++ b/examples/Constraints_Discrete/custom_constraints.py @@ -43,7 +43,7 @@ "c6": "c1ccccc1", "C6": "CCCCCC", } -solvent = SubstanceParameter("Solvent", data=dict_solvent, encoding="RDKitFingerprint") +solvent = SubstanceParameter("Solvent", data=dict_solvent, encoding="RDKIT") speed = CategoricalParameter( "Speed", 
values=["very slow", "slow", "normal", "fast", "very fast"], encoding="INT" ) diff --git a/examples/Constraints_Discrete/dependency_constraints.py b/examples/Constraints_Discrete/dependency_constraints.py index ab06375af..737bedc33 100644 --- a/examples/Constraints_Discrete/dependency_constraints.py +++ b/examples/Constraints_Discrete/dependency_constraints.py @@ -34,9 +34,7 @@ "water": "O", "C1": "C", } -solvent = SubstanceParameter( - name="Solv", data=dict_solvent, encoding="DefaultFingerprint" -) +solvent = SubstanceParameter(name="Solv", data=dict_solvent, encoding="MORDRED") switch1 = CategoricalParameter(name="Switch1", values=["on", "off"]) switch2 = CategoricalParameter(name="Switch2", values=["left", "right"]) fraction1 = NumericalDiscreteParameter( diff --git a/examples/Constraints_Discrete/exclusion_constraints.py b/examples/Constraints_Discrete/exclusion_constraints.py index 2758786ea..eb81ef2fc 100644 --- a/examples/Constraints_Discrete/exclusion_constraints.py +++ b/examples/Constraints_Discrete/exclusion_constraints.py @@ -40,9 +40,7 @@ "c6": "c1ccccc1", "C6": "CCCCCC", } -solvent = SubstanceParameter( - name="Solv", data=dict_solvent, encoding="RDKitFingerprint" -) +solvent = SubstanceParameter(name="Solv", data=dict_solvent, encoding="RDKIT") speed = CategoricalParameter( name="Speed", values=["very slow", "slow", "normal", "fast", "very fast"], diff --git a/examples/Constraints_Discrete/mixture_constraints.py b/examples/Constraints_Discrete/mixture_constraints.py index a87bc4534..7848043cc 100644 --- a/examples/Constraints_Discrete/mixture_constraints.py +++ b/examples/Constraints_Discrete/mixture_constraints.py @@ -46,15 +46,9 @@ "C2": "CC", "C3": "CCC", } -solvent1 = SubstanceParameter( - name="Solv1", data=dict_solvents, encoding="DefaultFingerprint" -) -solvent2 = SubstanceParameter( - name="Solv2", data=dict_solvents, encoding="DefaultFingerprint" -) -solvent3 = SubstanceParameter( - name="Solv3", data=dict_solvents, 
encoding="DefaultFingerprint" -) +solvent1 = SubstanceParameter(name="Solv1", data=dict_solvents, encoding="MORDRED") +solvent2 = SubstanceParameter(name="Solv2", data=dict_solvents, encoding="MORDRED") +solvent3 = SubstanceParameter(name="Solv3", data=dict_solvents, encoding="MORDRED") # Parameters for representing the fraction. diff --git a/examples/Constraints_Discrete/prodsum_constraints.py b/examples/Constraints_Discrete/prodsum_constraints.py index 394f5ae3e..2d547e61b 100644 --- a/examples/Constraints_Discrete/prodsum_constraints.py +++ b/examples/Constraints_Discrete/prodsum_constraints.py @@ -38,9 +38,7 @@ "C1": "C", "C2": "CC", } -solvent = SubstanceParameter( - name="Solvent", data=dict_solvent, encoding="RDKitFingerprint" -) +solvent = SubstanceParameter(name="Solvent", data=dict_solvent, encoding="RDKIT") speed = CategoricalParameter( name="Speed", values=["slow", "normal", "fast"], encoding="INT" ) diff --git a/examples/Custom_Hooks/campaign_stopping.py b/examples/Custom_Hooks/campaign_stopping.py index 0f51ac6ee..6adb9f07a 100644 --- a/examples/Custom_Hooks/campaign_stopping.py +++ b/examples/Custom_Hooks/campaign_stopping.py @@ -92,11 +92,9 @@ } parameters = [ - SubstanceParameter( - name="Solvent", data=dict_solvent, encoding="DefaultFingerprint" - ), - SubstanceParameter(name="Base", data=dict_base, encoding="DefaultFingerprint"), - SubstanceParameter(name="Ligand", data=dict_ligand, encoding="DefaultFingerprint"), + SubstanceParameter(name="Solvent", data=dict_solvent, encoding="MORDRED"), + SubstanceParameter(name="Base", data=dict_base, encoding="MORDRED"), + SubstanceParameter(name="Ligand", data=dict_ligand, encoding="MORDRED"), NumericalDiscreteParameter(name="Temp_C", values=[90, 105, 120], tolerance=2), NumericalDiscreteParameter(name="Concentration", values=[0.057, 0.1, 0.153]), ] diff --git a/examples/Custom_Surrogates/surrogate_params.py b/examples/Custom_Surrogates/surrogate_params.py index 11b9179c4..f72cc0b68 100644 --- 
a/examples/Custom_Surrogates/surrogate_params.py +++ b/examples/Custom_Surrogates/surrogate_params.py @@ -53,7 +53,7 @@ "Solvent C": "O", "Solvent D": "CS(=O)C", }, - encoding="DefaultFingerprint", + encoding="MORDRED", ), ] diff --git a/examples/Serialization/create_from_config.py b/examples/Serialization/create_from_config.py index c15f03499..f11b336ea 100644 --- a/examples/Serialization/create_from_config.py +++ b/examples/Serialization/create_from_config.py @@ -53,7 +53,7 @@ "Solvent D": "CCOCCOCCN" }, "decorrelate": true, - "encoding": "DefaultFingerprint" + "encoding": "MORDRED" } ], "constraints": [] diff --git a/examples/Serialization/validate_config.py b/examples/Serialization/validate_config.py index 4919cdc8c..52e8bd311 100644 --- a/examples/Serialization/validate_config.py +++ b/examples/Serialization/validate_config.py @@ -52,7 +52,7 @@ "Solvent D": "CCOCCOCCN" }, "decorrelate": true, - "encoding": "DefaultFingerprint" + "encoding": "MORDRED" } ], "constraints": [] @@ -123,7 +123,7 @@ "Solvent D": "CCOCCOCCN" }, "decorrelate": true, - "encoding": "DefaultFingerprint" + "encoding": "MORDRED" } ], "constraints": [] diff --git a/mypy.ini b/mypy.ini index 34e7c2190..885349a00 100644 --- a/mypy.ini +++ b/mypy.ini @@ -54,9 +54,6 @@ ignore_missing_imports = True [mypy-rdkit.Chem.rdMolDescriptors] ignore_missing_imports = True -[mypy-skfp.*] -ignore_missing_imports = True - [mypy-xyzpy] ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index af63b2064..055b00431 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,6 @@ Issues = "https://github.com/emdgroup/baybe/issues/" [project.optional-dependencies] chem = [ - "rdkit>=2022.3.4", "scikit-fingerprints>=1.7.0", ] diff --git a/tests/conftest.py b/tests/conftest.py index 8dd1dfa2e..559396907 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -323,7 +323,7 @@ def fixture_parameters( SubstanceParameter( name=f"Substance_1_{encoding}", data=mock_substances, - 
encoding=encoding.name, + encoding=encoding, ) for encoding in SubstanceEncoding ], @@ -731,7 +731,7 @@ def fixture_default_config(): "name": "Solvent", "data": {"sol1":"C", "sol2":"CC", "sol3":"CCC"}, "decorrelate": true, - "encoding": "DefaultFingerprint" + "encoding": "MORDRED" },""" if CHEM_INSTALLED else """ diff --git a/tests/hypothesis_strategies/parameters.py b/tests/hypothesis_strategies/parameters.py index 6608b6eef..10b067409 100644 --- a/tests/hypothesis_strategies/parameters.py +++ b/tests/hypothesis_strategies/parameters.py @@ -142,7 +142,7 @@ def substance_parameters(draw: st.DrawFn): decorrelate = draw(decorrelations) encoding = draw(st.sampled_from(SubstanceEncoding)) return SubstanceParameter( - name=name, data=data, decorrelate=decorrelate, encoding=encoding.name + name=name, data=data, decorrelate=decorrelate, encoding=encoding ) diff --git a/tests/simulate_telemetry.py b/tests/simulate_telemetry.py index 39d09bad5..ce26db3a8 100644 --- a/tests/simulate_telemetry.py +++ b/tests/simulate_telemetry.py @@ -55,11 +55,9 @@ } parameters = [ - SubstanceParameter( - name="Solvent", data=dict_solvent, encoding="DefaultFingerprint" - ), - SubstanceParameter(name="Base", data=dict_base, encoding="DefaultFingerprint"), - SubstanceParameter(name="Ligand", data=dict_ligand, encoding="DefaultFingerprint"), + SubstanceParameter(name="Solvent", data=dict_solvent, encoding="MORDRED"), + SubstanceParameter(name="Base", data=dict_base, encoding="MORDRED"), + SubstanceParameter(name="Ligand", data=dict_ligand, encoding="MORDRED"), NumericalDiscreteParameter(name="Temp_C", values=[90, 105, 120], tolerance=2), NumericalDiscreteParameter( name="Concentration", values=[0.057, 0.1, 0.153], tolerance=0.005 From d5809a7c1aa1444197bc7ad15456694aaf7c5b0b Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 6 Sep 2024 14:57:01 +0200 Subject: [PATCH 07/87] fix test param naming that caused NotEnoughPointsLeftError --- 
baybe/utils/chemistry.py | 6 +++++- tests/test_substance_parameter.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 07e160866..40f64960d 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -110,7 +110,11 @@ def smiles_to_fingerprint_features( **kwargs_fingerprint ) - if fingerprint_encoder.requires_conformers: + if fingerprint_encoder.requires_conformers or ( + # TODO removed once fixed in skfp package, see + # https://github.com/scikit-fingerprints/scikit-fingerprints/issues/239 + fingerprint_name in ["USRCATFingerprint", "USRFingerprint"] + ): kwargs_conformer = {} if kwargs_conformer is None else kwargs_conformer smiles_list = ConformerGenerator(**kwargs_conformer).transform( MolFromSmilesTransformer().transform(smiles_list) diff --git a/tests/test_substance_parameter.py b/tests/test_substance_parameter.py index 6fbdcca98..52bc69a0a 100644 --- a/tests/test_substance_parameter.py +++ b/tests/test_substance_parameter.py @@ -13,7 +13,7 @@ ) @pytest.mark.parametrize( "parameter_names", - [["Categorical_1", f"Substance_1_{enc.name}"] for enc in SubstanceEncoding], + [["Categorical_1", f"Substance_1_{enc}"] for enc in SubstanceEncoding], ids=[enc.name for enc in SubstanceEncoding], ) def test_run_iterations(campaign, batch_size, n_iterations): From f29085379e659cddd888144fd8110a136bf5049a Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 6 Sep 2024 15:15:32 +0200 Subject: [PATCH 08/87] update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 30216284d..25f688793 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `to_tensor` now also handles `numpy` arrays - `MIN` mode of `NumericalTarget` is now implemented via the acquisition function instead of negating 
the computational representation +- Fingerprint encodings are now computed exclusively with `scikit_fingerprints` package. + Previously available `SubstanceParameter` values for `encoding` attribute are still valid, + with `MORGAN_FP` now being an alias for `ECFP`. + Additional values for `encoding` attribute are available, corresponding to + `scikit_fingerprints` fingerprint classes (see + [Parameters documentation](https://emdgroup.github.io/baybe/userguide/parameters.html#substanceparameter) + for details). ### Fixed - `CategoricalParameter` and `TaskParameter` no longer incorrectly coerce a single From c0d16a388bffb4ceab7a5b5eb4a067ea0a31e703 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:33:23 +0200 Subject: [PATCH 09/87] Add parameters test for FP encoding aliases --- baybe/parameters/enum.py | 11 +++++++++-- tests/hypothesis_strategies/parameters.py | 10 +++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 6bdf41d8b..f7244e4a7 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -17,6 +17,12 @@ class CategoricalEncoding(ParameterEncoding): """Integer encoding.""" +class SubstanceEncodingAliases(ParameterEncoding): + """Aliases for SubstanceEncoding values.""" + + MORGAN_FP = "ECFP" + + class SubstanceEncoding(ParameterEncoding): """Available encodings for substance parameters.""" @@ -60,8 +66,9 @@ def _missing_(cls, value): Enable backwards compatibility of value names that differ between SKFP and previous version. 
""" - if value == "MORGAN_FP": - return cls.ECFPFingerprint + if value in SubstanceEncodingAliases.__members__: + replace = SubstanceEncodingAliases[str(value)].value + return cls(replace) else: return super()._missing_(value) diff --git a/tests/hypothesis_strategies/parameters.py b/tests/hypothesis_strategies/parameters.py index 10b067409..e8f39e94a 100644 --- a/tests/hypothesis_strategies/parameters.py +++ b/tests/hypothesis_strategies/parameters.py @@ -10,6 +10,7 @@ TaskParameter, ) from baybe.parameters.custom import CustomDiscreteParameter +from baybe.parameters.enum import SubstanceEncodingAliases from baybe.parameters.numerical import ( NumericalContinuousParameter, NumericalDiscreteParameter, @@ -140,7 +141,14 @@ def substance_parameters(draw: st.DrawFn): name = draw(parameter_names) data = draw(substance_data()) decorrelate = draw(decorrelations) - encoding = draw(st.sampled_from(SubstanceEncoding)) + encoding = draw( + st.sampled_from( + list(SubstanceEncoding) # Check all fingerprints + + [ + SubstanceEncoding(alias.name) for alias in SubstanceEncodingAliases + ] # Check fingerprint aliases + ) + ) return SubstanceParameter( name=name, data=data, decorrelate=decorrelate, encoding=encoding ) From a645647dead88d0f76a585443d3324df6156c60e Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:37:21 +0200 Subject: [PATCH 10/87] update imports --- baybe/_optional/chem.py | 9 ++++++++- baybe/utils/chemistry.py | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/baybe/_optional/chem.py b/baybe/_optional/chem.py index cb2c35555..7662aa8dd 100644 --- a/baybe/_optional/chem.py +++ b/baybe/_optional/chem.py @@ -6,6 +6,7 @@ from rdkit import Chem from skfp import fingerprints as skfp_fingerprints from skfp.bases import BaseFingerprintTransformer + from skfp.preprocessing import ConformerGenerator, MolFromSmilesTransformer except ModuleNotFoundError as ex: raise OptionalImportError( @@ 
-15,4 +16,10 @@ "e.g. via `pip install baybe[chem]`." ) from ex -__all__ = ["Chem", "skfp_fingerprints", "BaseFingerprintTransformer"] +__all__ = [ + "Chem", + "skfp_fingerprints", + "BaseFingerprintTransformer", + "ConformerGenerator", + "MolFromSmilesTransformer", +] diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 40f64960d..e24ab7b6e 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -10,11 +10,12 @@ import numpy as np import pandas as pd from joblib import Memory -from skfp.preprocessing import ConformerGenerator, MolFromSmilesTransformer from baybe._optional.chem import ( BaseFingerprintTransformer, Chem, + ConformerGenerator, + MolFromSmilesTransformer, skfp_fingerprints, ) from baybe.utils.numerical import DTypeFloatNumpy From e1d4c0ffa06b02d7842d0e9c54556900d0921a76 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 6 Sep 2024 17:00:37 +0200 Subject: [PATCH 11/87] Update CONTRIBUTORS.md Co-authored-by: AdrianSosic --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f0f45558f..ff2e53311 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -25,4 +25,4 @@ - Di Jin (Merck Life Science KGaA, Darmstadt, Germany):\ Cardinality constraints - Karin Hrovatin (Merck KGaA, Darmstadt, Germany):\ - Add scikit-fingerprints support +`scikit-fingerprints` support From 8161cb91ccf017ed5426c56d456148e6221bc9cb Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 17 Sep 2024 11:05:25 +0200 Subject: [PATCH 12/87] comments and typos --- CHANGELOG.md | 4 ++-- examples/Backtesting/full_lookup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25f688793..04e92b020 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,11 +27,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - 
`to_tensor` now also handles `numpy` arrays - `MIN` mode of `NumericalTarget` is now implemented via the acquisition function instead of negating the computational representation -- Fingerprint encodings are now computed exclusively with `scikit_fingerprints` package. +- Fingerprint encodings are now computed exclusively with `scikit-fingerprints` package. Previously available `SubstanceParameter` values for `encoding` attribute are still valid, with `MORGAN_FP` now being an alias for `ECFP`. Additional values for `encoding` attribute are available, corresponding to - `scikit_fingerprints` fingerprint classes (see + `scikit-fingerprints` fingerprint classes (see [Parameters documentation](https://emdgroup.github.io/baybe/userguide/parameters.html#substanceparameter) for details). diff --git a/examples/Backtesting/full_lookup.py b/examples/Backtesting/full_lookup.py index ffd2b6ba4..954c18e3c 100644 --- a/examples/Backtesting/full_lookup.py +++ b/examples/Backtesting/full_lookup.py @@ -93,7 +93,7 @@ ### Constructing campaigns for the simulation loop # In this example, we create several campaigns. -# First let us create two campaigns that each use a different chemical encoding to +# First let us create three campaigns that each use a different chemical encoding to # treat substances. 
substance_encodings = ["MORDRED", "RDKIT", "ECFP"] From a8a00feb560659135c4d355e32716df13b5bd1c9 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 17 Sep 2024 12:10:37 +0200 Subject: [PATCH 13/87] add fingerprint generation test --- baybe/utils/chemistry.py | 6 +----- tests/test_fingerprints.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) create mode 100644 tests/test_fingerprints.py diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index e24ab7b6e..bee970848 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -111,11 +111,7 @@ def smiles_to_fingerprint_features( **kwargs_fingerprint ) - if fingerprint_encoder.requires_conformers or ( - # TODO removed once fixed in skfp package, see - # https://github.com/scikit-fingerprints/scikit-fingerprints/issues/239 - fingerprint_name in ["USRCATFingerprint", "USRFingerprint"] - ): + if fingerprint_encoder.requires_conformers: kwargs_conformer = {} if kwargs_conformer is None else kwargs_conformer smiles_list = ConformerGenerator(**kwargs_conformer).transform( MolFromSmilesTransformer().transform(smiles_list) diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py new file mode 100644 index 000000000..f6d44a251 --- /dev/null +++ b/tests/test_fingerprints.py @@ -0,0 +1,31 @@ +"""Test for fingerprint generation.""" + +from baybe.parameters import SubstanceEncoding +from baybe.utils.chemistry import smiles_to_fingerprint_features + + +def test_fingerprint_computation(): + smiles_list = ["CC(N(C)C)=O", "CCCC#N"] + for fingerprint in SubstanceEncoding: + smiles_to_fingerprint_features( + smiles_list=smiles_list, + fingerprint_name=fingerprint.name, + prefix="", + # Some params that make the test faster + kwargs_conformer={ + "max_gen_attempts": 5000, + "n_jobs": 4, + }, + kwargs_fingerprint={ + "n_jobs": 4, + }, + ) + + # Also run one time without passing kwargs +
smiles_to_fingerprint_features( + smiles_list=smiles_list, + fingerprint_name=SubstanceEncoding("MORGAN_FP").name, + prefix="", + kwargs_conformer=None, + kwargs_fingerprint=None, + ) From 218c501707d0613d827932249f6723d27dff5c72 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:36:34 +0200 Subject: [PATCH 14/87] adapt header on package availability --- baybe/_optional/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/_optional/info.py b/baybe/_optional/info.py index 32907fa68..b35c53dab 100644 --- a/baybe/_optional/info.py +++ b/baybe/_optional/info.py @@ -42,7 +42,7 @@ def exclude_sys_path(path: str, /): # noqa: DOC402, DOC404 # directly depend on the flag – we thus simply set it to `True`. TYPOS_INSTALLED = True -# Package combinations +# Information on whether all required packages for certain functionality are available CHEM_INSTALLED = SKFP_INSTALLED LINT_INSTALLED = all( ( From 6c3c48945a82286bc050f96450ccf7e9c9ecda79 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:45:07 +0200 Subject: [PATCH 15/87] change field default from dict obj to factory --- baybe/parameters/substance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baybe/parameters/substance.py b/baybe/parameters/substance.py index 32beb774b..82a0840c2 100644 --- a/baybe/parameters/substance.py +++ b/baybe/parameters/substance.py @@ -73,10 +73,10 @@ class SubstanceParameter(DiscreteParameter): ) # See base class. 
- kwargs_fingerprint: dict = field(default={}) + kwargs_fingerprint: dict = field(factory=dict) """Kwargs for fingerprint generator""" - kwargs_conformer: dict = field(default={}) + kwargs_conformer: dict = field(factory=dict) """Kwargs for conformer generator""" @data.validator From 82de6d15a4a4171f5df1cd03a4e1b8e973be26a6 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:57:08 +0200 Subject: [PATCH 16/87] deprecate morgan fp --- CHANGELOG.md | 3 ++- baybe/parameters/enum.py | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04e92b020..5a6f74e98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 instead of negating the computational representation - Fingerprint encodings are now computed exclusively with `scikit-fingerprints` package. Previously available `SubstanceParameter` values for `encoding` attribute are still valid, - with `MORGAN_FP` now being an alias for `ECFP`. + with `MORGAN_FP` now being an alias for `ECFP`. + However, the alias `MORGAN_FP` will be deprecated in future versions. Additional values for `encoding` attribute are available, corresponding to `scikit-fingerprints` fingerprint classes (see [Parameters documentation](https://emdgroup.github.io/baybe/userguide/parameters.html#substanceparameter) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index f7244e4a7..9134094bd 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -1,5 +1,6 @@ """Parameter-related enumerations.""" +import warnings from enum import Enum @@ -68,6 +69,11 @@ def _missing_(cls, value): """ if value in SubstanceEncodingAliases.__members__: replace = SubstanceEncodingAliases[str(value)].value + warnings.warn( + f"Fingerprint name {value} has changed and will be disabled in " + f"a future version. 
Use fingerprint name {replace} instead.", + DeprecationWarning, + ) return cls(replace) else: return super()._missing_(value) From 068f08c65e43d62a083c7b5b54dbbfabe0ca8247 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 17 Sep 2024 15:06:41 +0200 Subject: [PATCH 17/87] shorten changelog --- CHANGELOG.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a6f74e98..45a78a6fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,14 +27,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `to_tensor` now also handles `numpy` arrays - `MIN` mode of `NumericalTarget` is now implemented via the acquisition function instead of negating the computational representation -- Fingerprint encodings are now computed exclusively with `scikit-fingerprints` package. - Previously available `SubstanceParameter` values for `encoding` attribute are still valid, - with `MORGAN_FP` now being an alias for `ECFP`. - However, the alias `MORGAN_FP` will be deprecated in future versions. - Additional values for `encoding` attribute are available, corresponding to - `scikit-fingerprints` fingerprint classes (see - [Parameters documentation](https://emdgroup.github.io/baybe/userguide/parameters.html#substanceparameter) - for details). +- Fingerprint encodings are now computed exclusively with `scikit-fingerprints` package, + granting access to all fingerprints available within `scikit-fingerprints`. + The `SubstanceParameter` value `MORGAN_FP` is now an alias for `ECFP` + and will be deprecated in future versions. 
### Fixed - `CategoricalParameter` and `TaskParameter` no longer incorrectly coerce a single From a469caa2a08853b2de875bb5750e062b8cc53dc6 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 17 Sep 2024 15:15:52 +0200 Subject: [PATCH 18/87] test deprecated FP name and that it warns about deprecation --- tests/test_deprecations.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index fc2e83a6c..aa22ad60c 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -8,6 +8,7 @@ from baybe.objective import Objective as OldObjective from baybe.objectives.base import Objective from baybe.objectives.desirability import DesirabilityObjective +from baybe.parameters import SubstanceEncoding from baybe.parameters.numerical import NumericalContinuousParameter from baybe.recommenders.pure.bayesian import ( BotorchRecommender, @@ -117,3 +118,9 @@ def test_deprecated_surrogate_registration(): with pytest.raises(DeprecationError): register_custom_architecture() + + +def test_deprecated_morgan_fp(acqf): + """Deprecated fingerprint name raises warning and uses a replacement.""" + with pytest.warns(DeprecationWarning): + assert SubstanceEncoding("MORGAN_FP") == SubstanceEncoding("ECFP") From 71375b3a7298dbff4623c272ae18ea5e3285b3cf Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 17 Sep 2024 15:29:12 +0200 Subject: [PATCH 19/87] add a few popular fingerprint examples to user guide --- docs/userguide/parameters.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index ec3f36aae..7799ecc65 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -128,10 +128,16 @@ SubstanceParameter( ``` The ``encoding`` option defines what kind of descriptors are calculated.
-All descriptors are calculated using [scikit-fingerprints package](https://github.com/scikit-fingerprints/scikit-fingerprints/). +All descriptors are calculated using +[scikit-fingerprints package](https://github.com/scikit-fingerprints/scikit-fingerprints/). Any fingerprint class from `scikit-fingerprints` can be used as an input parameter for chemical encoding. The fingerprint class names should be passed in all upper case and without the `Fingeprint` suffix, e.g. use alias `MORDRED` for `MordredFingerprint` class. +Here are examples of a few popular fingerprints: +* ``ECFP``: Extended Connectivity FingerPrint, +which is a circular topological fingerprint similar to Morgan fingerprint. +* ``MORDRED``: Chemical descriptor based fingerprint. +* ``RDKIT``: The RDKit fingerprint, which is based on hashing of molecular sub-graphs. These calculations will typically result in 500 to 1500 numbers per molecule. To avoid detrimental effects on the surrogate model fit, we reduce the number of From ce57ef54fd6eb533f94ece14a884696c457cd08e Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Tue, 17 Sep 2024 15:40:52 +0200 Subject: [PATCH 20/87] add fingerprint kwargs example --- docs/userguide/parameters.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index 7799ecc65..fbe20e9dd 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -139,6 +139,27 @@ which is a circular topological fingerprint similar to Morgan fingerprint. * ``MORDRED``: Chemical descriptor based fingerprint. * ``RDKIT``: The RDKit fingerprint, which is based on hashing of molecular sub-graphs. +You can adjust fingerprint computation with parameters for `Fingerprint` classes from `scikit-fingerprints`. +These can be specified via the `kwargs_fingerprint` in the `SubstanceParameter` class. 
+ +```python +from baybe.parameters import SubstanceParameter + +SubstanceParameter( + name="Solvent", + data={ + "Water": "O", + "1-Octanol": "CCCCCCCCO", + "Toluene": "CC1=CC=CC=C1", + }, + encoding="ECFP", + kwargs_fingerprint = { + "n_jobs" : 2, # Parallelize fingerprint computation across input molecules + "fp_size" : 1024, # Change the number of computed fingerprint features + }, +) +``` + These calculations will typically result in 500 to 1500 numbers per molecule. To avoid detrimental effects on the surrogate model fit, we reduce the number of descriptors via decorrelation before using them. From 378b5511dff5bedfb0855998787d0d2be62628b1 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 18 Sep 2024 07:41:47 +0200 Subject: [PATCH 21/87] Update baybe/utils/chemistry.py Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- baybe/utils/chemistry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index bee970848..704556c28 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -106,7 +106,7 @@ def smiles_to_fingerprint_features( Returns: Dataframe containing fingerprints for each SMILES string. 
""" - kwargs_fingerprint = {} if kwargs_fingerprint is None else kwargs_fingerprint + kwargs_fingerprint = kwargs_fingerprint or {} fingerprint_encoder = getattr(skfp_fingerprints, fingerprint_name)( **kwargs_fingerprint ) From 619b87ea110002d11019c9d31671211cc837a97a Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 18 Sep 2024 07:42:45 +0200 Subject: [PATCH 22/87] Update baybe/utils/chemistry.py Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- baybe/utils/chemistry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 704556c28..c690d31b7 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -112,7 +112,7 @@ def smiles_to_fingerprint_features( ) if fingerprint_encoder.requires_conformers: - kwargs_conformer = {} if kwargs_conformer is None else kwargs_conformer + kwargs_conformer = kwargs_conformer or {} smiles_list = ConformerGenerator(**kwargs_conformer).transform( MolFromSmilesTransformer().transform(smiles_list) ) From 0f0a6273f019bef3bdd26e1821e7bd4f0066bb51 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 18 Sep 2024 07:43:03 +0200 Subject: [PATCH 23/87] Update baybe/utils/chemistry.py Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- baybe/utils/chemistry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index c690d31b7..ec8ee9679 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -100,8 +100,8 @@ def smiles_to_fingerprint_features( fingerprint_name: Name of Fingerprint class used to transform smiles to fingerprints prefix: Name prefix for each descriptor (e.g., nBase --> _nBase). 
- kwargs_conformer: kwargs for ConformerGenerator - kwargs_fingerprint: kwargs for ConformerGenerator + kwargs_conformer: kwargs for conformer generator + kwargs_fingerprint: kwargs for fingerprint generator Returns: Dataframe containing fingerprints for each SMILES string. From 7f1b3c7c780ae43eda65a468b0816121e7f779b8 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 18 Sep 2024 07:45:44 +0200 Subject: [PATCH 24/87] move kwargs handling to top --- baybe/utils/chemistry.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index ec8ee9679..ee5012a62 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -107,12 +107,13 @@ def smiles_to_fingerprint_features( Dataframe containing fingerprints for each SMILES string. """ kwargs_fingerprint = kwargs_fingerprint or {} + kwargs_conformer = kwargs_conformer or {} + fingerprint_encoder = getattr(skfp_fingerprints, fingerprint_name)( **kwargs_fingerprint ) if fingerprint_encoder.requires_conformers: - kwargs_conformer = kwargs_conformer or {} smiles_list = ConformerGenerator(**kwargs_conformer).transform( MolFromSmilesTransformer().transform(smiles_list) ) From b47209e208eccd6eb146e0c55637abeb7805f7f7 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 18 Sep 2024 08:23:04 +0200 Subject: [PATCH 25/87] rename smiles to mol as may be str or mol obj --- baybe/utils/chemistry.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index ee5012a62..7dd497536 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -69,21 +69,22 @@ def name_to_smiles(name: str) -> str: @lru_cache(maxsize=None) @_disk_cache -def _smiles_str_to_fingerprint_features( +def _molecule_to_fingerprint_features( fingerprint_encoder: BaseFingerprintTransformer, -
smiles_str: str, + molecule: str | Chem.PropertyMol.PropertyMol, ) -> np.ndarray: """Compute molecular fingerprint for a single SMILES string. Args: fingerprint_encoder: Instance of Fingerprint class used to transform smiles string to fingerprint - smiles_str: Smiles string + molecule: Smiles string or molecule object, + depending on what should be input into fingerprint_encoder's transform Returns: Array containing fingerprint for SMILES string. """ - return fingerprint_encoder.transform([smiles_str]) + return fingerprint_encoder.transform([molecule]) def smiles_to_fingerprint_features( @@ -114,16 +115,18 @@ def smiles_to_fingerprint_features( ) if fingerprint_encoder.requires_conformers: - smiles_list = ConformerGenerator(**kwargs_conformer).transform( + mol_list = ConformerGenerator(**kwargs_conformer).transform( MolFromSmilesTransformer().transform(smiles_list) ) + else: + mol_list = smiles_list features = np.concatenate( [ - _smiles_str_to_fingerprint_features( - fingerprint_encoder=fingerprint_encoder, smiles_str=smiles_str + _molecule_to_fingerprint_features( + fingerprint_encoder=fingerprint_encoder, molecule=mol ) - for smiles_str in smiles_list + for mol in mol_list ] ) name = f"skfp{fingerprint_encoder.__class__.__name__.replace('Fingerprint', '')}_" From 7c21c4fb0af12fa206047c347362258d84a25e49 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 18 Sep 2024 08:33:22 +0200 Subject: [PATCH 26/87] test for fp embedding size --- tests/test_fingerprints.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py index f6d44a251..f1f72e9c2 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -24,8 +24,18 @@ def test_fingerprint_computation(): # Also run one time without passing kwargs smiles_to_fingerprint_features( smiles_list=smiles_list, - fingerprint_name=SubstanceEncoding("MORGAN_FP").name, + 
fingerprint_name=SubstanceEncoding("MORDRED").name, prefix="", kwargs_conformer=None, kwargs_fingerprint=None, ) + + # Check that fingerprint embedding is of correct size and + # fingerprint kwargs specifying embedding size are used + assert smiles_to_fingerprint_features( + smiles_list=smiles_list, + fingerprint_name=SubstanceEncoding("ECFP").name, + prefix="", + kwargs_conformer=None, + kwargs_fingerprint={"fp_size": 64}, + ).shape == (len(smiles_list), 64) From 797616e914855bc74bdd309a8dea7e9c796d42bd Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Mon, 30 Sep 2024 11:08:12 +0200 Subject: [PATCH 27/87] Update CHANGELOG.md Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 45a78a6fd..30ef5f64b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,7 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `to_tensor` now also handles `numpy` arrays - `MIN` mode of `NumericalTarget` is now implemented via the acquisition function instead of negating the computational representation -- Fingerprint encodings are now computed exclusively with `scikit-fingerprints` package, +- Fingerprint encodings are now computed exclusively with the `scikit-fingerprints` package, granting access to all fingerprints available within `scikit-fingerprints`. The `SubstanceParameter` value `MORGAN_FP` is now an alias for `ECFP` and will be deprecated in future versions. 
From 80eb43a6f39a93cbba83e8cbe1064a7f7b35fb2c Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Mon, 30 Sep 2024 11:08:23 +0200 Subject: [PATCH 28/87] Update CHANGELOG.md Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 30ef5f64b..4b3705d2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `MIN` mode of `NumericalTarget` is now implemented via the acquisition function instead of negating the computational representation - Fingerprint encodings are now computed exclusively with the `scikit-fingerprints` package, - granting access to all fingerprints available within `scikit-fingerprints`. + granting access to all fingerprints available therein. The `SubstanceParameter` value `MORGAN_FP` is now an alias for `ECFP` and will be deprecated in future versions. From 3ac85983cc6e4d7556b484ba1d351f75629d4ba6 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Mon, 30 Sep 2024 11:08:39 +0200 Subject: [PATCH 29/87] Update baybe/parameters/enum.py Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- baybe/parameters/enum.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 9134094bd..9729193a5 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -70,8 +70,8 @@ def _missing_(cls, value): if value in SubstanceEncodingAliases.__members__: replace = SubstanceEncodingAliases[str(value)].value warnings.warn( - f"Fingerprint name {value} has changed and will be disabled in " - f"a future version. Use fingerprint name {replace} instead.", + f"Substance encoding '{value}' is deprecated and will be disabled in " + f"a future version. 
Use '{replace}' instead.", DeprecationWarning, ) return cls(replace) From b0759c49b3ae3e08b5a494d09b47dcee4d4819c9 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Mon, 30 Sep 2024 11:08:49 +0200 Subject: [PATCH 30/87] Update docs/userguide/parameters.md Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- docs/userguide/parameters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index fbe20e9dd..e9dec9698 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -155,7 +155,7 @@ SubstanceParameter( encoding="ECFP", kwargs_fingerprint = { "n_jobs" : 2, # Parallelize fingerprint computation across input molecules - "fp_size" : 1024, # Change the number of computed fingerprint features + "fp_size" : 1024, # Change the number of computed bits }, ) ``` From 3f3443521dea0867ceb459c1337e81ca9470ba8d Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Mon, 30 Sep 2024 11:10:13 +0200 Subject: [PATCH 31/87] Update docs/userguide/parameters.md Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- docs/userguide/parameters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index e9dec9698..29b6461ac 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -128,7 +128,7 @@ SubstanceParameter( ``` The ``encoding`` option defines what kind of descriptors are calculated. -All descriptors are calculated using +All descriptors are calculated using the [scikit-fingerprints package](https://github.com/scikit-fingerprints/scikit-fingerprints/). Any fingerprint class from `scikit-fingerprints` can be used as an input parameter for chemical encoding. 
The fingerprint class names should be passed in all upper case and without the `Fingeprint` suffix, From d8903cc171f2502d7c0b7f00e279bfff50cbc54c Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 2 Oct 2024 13:52:51 +0200 Subject: [PATCH 32/87] add default kwargs to morgan fp --- baybe/parameters/enum.py | 128 ++++++++++++---------- baybe/parameters/substance.py | 14 +-- baybe/utils/chemistry.py | 53 ++++++++- docs/userguide/parameters.md | 9 +- tests/hypothesis_strategies/parameters.py | 6 +- tests/test_deprecations.py | 19 +++- tests/test_fingerprints.py | 8 +- 7 files changed, 150 insertions(+), 87 deletions(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 9729193a5..96fbffae5 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -1,6 +1,5 @@ """Parameter-related enumerations.""" -import warnings from enum import Enum @@ -18,65 +17,42 @@ class CategoricalEncoding(ParameterEncoding): """Integer encoding.""" -class SubstanceEncodingAliases(ParameterEncoding): - """Aliases for SubstanceEncoding values.""" - - MORGAN_FP = "ECFP" - - class SubstanceEncoding(ParameterEncoding): """Available encodings for substance parameters.""" - AtomPairFingerprint = "ATOMPAIR" - AutocorrFingerprint = "AUTOCORR" - AvalonFingerprint = "AVALON" - E3FPFingerprint = "E3FP" - ECFPFingerprint = "ECFP" - ERGFingerprint = "ERG" - EStateFingerprint = "ESTATE" - FunctionalGroupsFingerprint = "FUNCTIONALGROUPS" - GETAWAYFingerprint = "GETAWAY" - GhoseCrippenFingerprint = "GHOSECRIPPEN" - KlekotaRothFingerprint = "KLEKOTAROTH" - LaggnerFingerprint = "LAGGNER" - LayeredFingerprint = "LAYERED" - LingoFingerprint = "LINGO" - MACCSFingerprint = "MACCS" - MAPFingerprint = "MAP" - MHFPFingerprint = "MHFP" - MORSEFingerprint = "MORSE" - MQNsFingerprint = "MQNS" - MordredFingerprint = "MORDRED" - PatternFingerprint = "PATTERN" - PharmacophoreFingerprint = "PHARMACOPHORE" - PhysiochemicalPropertiesFingerprint 
= "PHYSIOCHEMICALPROPERTIES" - PubChemFingerprint = "PUBCHEM" - RDFFingerprint = "RDF" - RDKit2DDescriptorsFingerprint = "RDKIT2DDESCRIPTORS" - RDKitFingerprint = "RDKIT" - SECFPFingerprint = "SECFP" - TopologicalTorsionFingerprint = "TOPOLOGICALTORSION" - USRCATFingerprint = "USRCAT" - USRFingerprint = "USR" - WHIMFingerprint = "WHIM" - - @classmethod - def _missing_(cls, value): - """Backward compatibility of enum values. - - Enable backwards compatibility of value names that - differ between SKFP and previous version. - """ - if value in SubstanceEncodingAliases.__members__: - replace = SubstanceEncodingAliases[str(value)].value - warnings.warn( - f"Substance encoding '{value}' is deprecated and will be disabled in " - f"a future version. Use '{replace}' instead.", - DeprecationWarning, - ) - return cls(replace) - else: - return super()._missing_(value) + ATOMPAIR = "ATOMPAIR" + AUTOCORR = "AUTOCORR" + AVALON = "AVALON" + E3FP = "E3FP" + ECFP = "ECFP" + MORGAN_FP = "MORGAN_FP" + ERG = "ERG" + ESTATE = "ESTATE" + FUNCTIONALGROUPS = "FUNCTIONALGROUPS" + GETAWAY = "GETAWAY" + GHOSECRIPPEN = "GHOSECRIPPEN" + KLEKOTAROTH = "KLEKOTAROTH" + LAGGNER = "LAGGNER" + LAYERED = "LAYERED" + LINGO = "LINGO" + MACCS = "MACCS" + MAP = "MAP" + MHFP = "MHFP" + MORSE = "MORSE" + MQNS = "MQNS" + MORDRED = "MORDRED" + PATTERN = "PATTERN" + PHARMACOPHORE = "PHARMACOPHORE" + PHYSIOCHEMICALPROPERTIES = "PHYSIOCHEMICALPROPERTIES" + PUBCHEM = "PUBCHEM" + RDF = "RDF" + RDKIT2DDESCRIPTORS = "RDKIT2DDESCRIPTORS" + RDKIT = "RDKIT" + SECFP = "SECFP" + TOPOLOGICALTORSION = "TOPOLOGICALTORSION" + USRCAT = "USRCAT" + USR = "USR" + WHIM = "WHIM" class CustomEncoding(ParameterEncoding): @@ -84,3 +60,41 @@ class CustomEncoding(ParameterEncoding): CUSTOM = "CUSTOM" """User-defined encoding.""" + + +class FingerprintNames(Enum): + """Mapping of substance parameter encoding names to fingerprint classes.""" + + ATOMPAIR = "AtomPairFingerprint" + AUTOCORR = "AutocorrFingerprint" + AVALON = 
"AvalonFingerprint" + E3FP = "E3FPFingerprint" + ECFP = "ECFPFingerprint" + MORGAN_FP = "ECFPFingerprint" + ERG = "ERGFingerprint" + ESTATE = "EStateFingerprint" + FUNCTIONALGROUPS = "FunctionalGroupsFingerprint" + GETAWAY = "GETAWAYFingerprint" + GHOSECRIPPEN = "GhoseCrippenFingerprint" + KLEKOTAROTH = "KlekotaRothFingerprint" + LAGGNER = "LaggnerFingerprint" + LAYERED = "LayeredFingerprint" + LINGO = "LingoFingerprint" + MACCS = "MACCSFingerprint" + MAP = "MAPFingerprint" + MHFP = "MHFPFingerprint" + MORSE = "MORSEFingerprint" + MQNS = "MQNsFingerprint" + MORDRED = "MordredFingerprint" + PATTERN = "PatternFingerprint" + PHARMACOPHORE = "PharmacophoreFingerprint" + PHYSIOCHEMICALPROPERTIES = "PhysiochemicalPropertiesFingerprint" + PUBCHEM = "PubChemFingerprint" + RDF = "RDFFingerprint" + RDKIT2DDESCRIPTORS = "RDKit2DDescriptorsFingerprint" + RDKIT = "RDKitFingerprint" + SECFP = "SECFPFingerprint" + TOPOLOGICALTORSION = "TopologicalTorsionFingerprint" + USRCAT = "USRCATFingerprint" + USR = "USRFingerprint" + WHIM = "WHIMFingerprint" diff --git a/baybe/parameters/substance.py b/baybe/parameters/substance.py index 82a0840c2..7f2cfb6b8 100644 --- a/baybe/parameters/substance.py +++ b/baybe/parameters/substance.py @@ -57,19 +57,7 @@ class SubstanceParameter(DiscreteParameter): """ encoding: SubstanceEncoding = field( - default=SubstanceEncoding.MordredFingerprint, - converter=lambda x: ( - # Passed enum - x - if isinstance(x, SubstanceEncoding) - # Passed enum name - else ( - SubstanceEncoding[x] - if x in SubstanceEncoding.__members__ - # Passed enum value - else SubstanceEncoding(x) - ) - ), + default=SubstanceEncoding.MORDRED, converter=SubstanceEncoding ) # See base class. 
diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 7dd497536..ddba01e2a 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -4,6 +4,7 @@ import ssl import tempfile import urllib.request +import warnings from functools import lru_cache from pathlib import Path @@ -18,6 +19,7 @@ MolFromSmilesTransformer, skfp_fingerprints, ) +from baybe.parameters.enum import FingerprintNames from baybe.utils.numerical import DTypeFloatNumpy # Caching @@ -107,10 +109,12 @@ def smiles_to_fingerprint_features( Returns: Dataframe containing fingerprints for each SMILES string. """ - kwargs_fingerprint = kwargs_fingerprint or {} + fingerprint_cls, kwargs_fingerprint = convert_fingeprint_parameters( + name=fingerprint_name, kwargs_fingerprint=kwargs_fingerprint + ) kwargs_conformer = kwargs_conformer or {} - fingerprint_encoder = getattr(skfp_fingerprints, fingerprint_name)( + fingerprint_encoder = getattr(skfp_fingerprints, fingerprint_cls)( **kwargs_fingerprint ) @@ -136,6 +140,51 @@ def smiles_to_fingerprint_features( return df +def convert_fingeprint_parameters( + name: str, kwargs_fingerprint: dict | None = None +) -> (str, dict): + """Convert fingerprint name parameters for computing the fingerprint. + + Args: + name: Name of fingerprint. + kwargs_fingerprint: Optional user-specified params + for computing the fingerprint. + + Raises: + KeyError: If fingerprint name is not recognized. + + Returns: + Fingerprint class name and kwargs to use for the fingerprint computation. 
+ """ + # Get fingerprint class + try: + fp_class = FingerprintNames[name].value + except KeyError: + raise KeyError(f"Fingerprint name {name} is not valid.") + + # For backwards-compatibility purposes + + # Update default kwargs to match the fingerprint name when + # using a different fingerprint class to compute the desired fingerprint + kwargs_fp_update = {} + kwargs_fingerprint = {} if not kwargs_fingerprint else kwargs_fingerprint + if name == "MORGAN_FP": + warnings.warn( + "Substance encoding 'MORGAN_FP' is deprecated and will be disabled in " + "a future version. Use 'ECFP' with 'fp_size' 1204 and 'radius' 4 instead.", + DeprecationWarning, + ) + kwargs_fp_update = { + "fp_size": 1024, + "radius": 4, + } + # Update kwargs with fingerprint-specific defaults + # If a kwarg is specified in the input it overrides the fingerprint default + kwargs_fingerprint = {**kwargs_fp_update, **kwargs_fingerprint} + + return fp_class, kwargs_fingerprint + + def get_canonical_smiles(smiles: str) -> str: """Return the "canonical" representation of the given SMILES.""" try: diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index 29b6461ac..35eca1158 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -152,12 +152,13 @@ SubstanceParameter( "1-Octanol": "CCCCCCCCO", "Toluene": "CC1=CC=CC=C1", }, - encoding="ECFP", - kwargs_fingerprint = { - "n_jobs" : 2, # Parallelize fingerprint computation across input molecules - "fp_size" : 1024, # Change the number of computed bits + encoding="ECFP", + kwargs_fingerprint={ + "n_jobs": 2, # Parallelize fingerprint computation across input molecules + "fp_size": 1024, # Change the number of computed bits }, ) + ``` These calculations will typically result in 500 to 1500 numbers per molecule. 
diff --git a/tests/hypothesis_strategies/parameters.py b/tests/hypothesis_strategies/parameters.py index e8f39e94a..41648295f 100644 --- a/tests/hypothesis_strategies/parameters.py +++ b/tests/hypothesis_strategies/parameters.py @@ -10,7 +10,6 @@ TaskParameter, ) from baybe.parameters.custom import CustomDiscreteParameter -from baybe.parameters.enum import SubstanceEncodingAliases from baybe.parameters.numerical import ( NumericalContinuousParameter, NumericalDiscreteParameter, @@ -143,10 +142,7 @@ def substance_parameters(draw: st.DrawFn): decorrelate = draw(decorrelations) encoding = draw( st.sampled_from( - list(SubstanceEncoding) # Check all fingerprints - + [ - SubstanceEncoding(alias.name) for alias in SubstanceEncodingAliases - ] # Check fingerprint aliases + list(SubstanceEncoding) # Check all fingerprint names ) ) return SubstanceParameter( diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index aa22ad60c..5a072babe 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -16,6 +16,7 @@ ) from baybe.searchspace.continuous import SubspaceContinuous from baybe.targets.numerical import NumericalTarget +from baybe.utils.chemistry import convert_fingeprint_parameters def test_deprecated_objective_class(): @@ -121,6 +122,20 @@ def test_deprecated_surrogate_registration(): def test_deprecated_morgan_fp(acqf): - """Deprecated fingerprint name raises warning and uses a replacement.""" + """Deprecated fingerprint name raises warning and uses ECFP replacement.""" with pytest.warns(DeprecationWarning): - assert SubstanceEncoding("MORGAN_FP") == SubstanceEncoding("ECFP") + # Check that ECFP is used instead of Morgan with correct pre-defined kwargs + morgan_class, morgan_kwargs = convert_fingeprint_parameters( + name=SubstanceEncoding("MORGAN_FP").name, kwargs_fingerprint=None + ) + ecfp_class, _ = convert_fingeprint_parameters( + name=SubstanceEncoding("ECFP").name, kwargs_fingerprint=None + ) + assert morgan_class == ecfp_class + 
assert morgan_kwargs == {"fp_size": 1024, "radius": 4} + + # Check that user-specified kwargs override the defaults + _, morgan_custom_kwargs = convert_fingeprint_parameters( + name=SubstanceEncoding("MORGAN_FP").name, kwargs_fingerprint={"radius": 5} + ) + assert morgan_custom_kwargs == {"fp_size": 1024, "radius": 5} diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py index f1f72e9c2..988b885cf 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -1,12 +1,12 @@ """Test for fingerprint generation.""" -from baybe.parameters import SubstanceEncoding +from baybe.parameters.enum import FingerprintNames from baybe.utils.chemistry import smiles_to_fingerprint_features def test_fingerprint_computation(): smiles_list = ["CC(N(C)C)=O", "CCCC#N"] - for fingerprint in SubstanceEncoding: + for fingerprint in FingerprintNames: smiles_to_fingerprint_features( smiles_list=smiles_list, fingerprint_name=fingerprint.name, @@ -24,7 +24,7 @@ def test_fingerprint_computation(): # Also run one time without passing kwargs smiles_to_fingerprint_features( smiles_list=smiles_list, - fingerprint_name=SubstanceEncoding("MORDRED").name, + fingerprint_name=FingerprintNames["MORDRED"].name, prefix="", kwargs_conformer=None, kwargs_fingerprint=None, @@ -34,7 +34,7 @@ def test_fingerprint_computation(): # fingerprint kwargs specifying embedding size are used assert smiles_to_fingerprint_features( smiles_list=smiles_list, - fingerprint_name=SubstanceEncoding("ECFP").name, + fingerprint_name=FingerprintNames["ECFP"].name, prefix="", kwargs_conformer=None, kwargs_fingerprint={"fp_size": 64}, From bf29920ec2a45566c06b63cceb263bc8ae8cec50 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 2 Oct 2024 14:35:06 +0200 Subject: [PATCH 33/87] add Morgan_FP deprecation --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b3705d2f..1bcae8b6f 100644 --- 
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,9 +28,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `MIN` mode of `NumericalTarget` is now implemented via the acquisition function instead of negating the computational representation - Fingerprint encodings are now computed exclusively with the `scikit-fingerprints` package, - granting access to all fingerprints available therein. + granting access to all fingerprints available therein. The `SubstanceParameter` value `MORGAN_FP` is now an alias for `ECFP` - and will be deprecated in future versions. + with 1024 features and radius of 4. ### Fixed - `CategoricalParameter` and `TaskParameter` no longer incorrectly coerce a single @@ -44,6 +44,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Deprecations - The role of `register_custom_architecture` has been taken over by `baybe.surrogates.base.SurrogateProtocol` +- The `SubstanceParameter` value `MORGAN_FP` will be deprecated in future versions. ## [0.10.0] - 2024-08-02 ### Breaking Changes From b2960c6c4cd656c24d703460cca44b67cdc0ba4b Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 2 Oct 2024 14:38:54 +0200 Subject: [PATCH 34/87] docs conformer kwargs --- docs/userguide/parameters.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index 35eca1158..e54337a4b 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -141,6 +141,8 @@ which is a circular topological fingerprint similar to Morgan fingerprint. You can adjust fingerprint computation with parameters for `Fingerprint` classes from `scikit-fingerprints`. These can be specified via the `kwargs_fingerprint` in the `SubstanceParameter` class. +Similarly, for fingerprints requiring conformers, +the parameters for conformer computation can be specified via `kwargs_conformer`. 
```python from baybe.parameters import SubstanceParameter From f5bbea627307cb00ef335101233066655618db84 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 2 Oct 2024 14:44:28 +0200 Subject: [PATCH 35/87] remove n_jobs from example on single mol --- docs/userguide/parameters.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index e54337a4b..c2daa4eb5 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -156,7 +156,6 @@ SubstanceParameter( }, encoding="ECFP", kwargs_fingerprint={ - "n_jobs": 2, # Parallelize fingerprint computation across input molecules "fp_size": 1024, # Change the number of computed bits }, ) From 59ede88dcdf56561860d2198f7e4eaf1f6a22d9b Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Thu, 3 Oct 2024 20:36:45 +0200 Subject: [PATCH 36/87] test fingerprint computation function --- tests/test_fingerprints.py | 49 +++++++++++++++----------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py index 988b885cf..58c35b9c2 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -1,41 +1,30 @@ """Test for fingerprint generation.""" -from baybe.parameters.enum import FingerprintNames +import pytest + from baybe.utils.chemistry import smiles_to_fingerprint_features -def test_fingerprint_computation(): +@pytest.mark.parametrize( + "fingerprint_name,kwargs_fingerprint,kwargs_conformer", + [ + # Test fingerprint calculation with different kwargs + ("ECFP", {}, {}), + ("ECFP", {"fp_size": 64}, {}), + ("ECFP", {}, {"max_gen_attempts": 5000}), + ], +) +def test_fingerprint_kwargs(fingerprint_name, kwargs_fingerprint, kwargs_conformer): smiles_list = ["CC(N(C)C)=O", "CCCC#N"] - for fingerprint in FingerprintNames: - smiles_to_fingerprint_features( - smiles_list=smiles_list, - 
fingerprint_name=fingerprint.name, - prefix="", - # Some params that make the test faster - kwargs_conformer={ - "max_gen_attempts": 5000, - "n_jobs": 4, - }, - kwargs_fingerprint={ - "n_jobs": 4, - }, - ) - - # Also run one time without passing kwargs - smiles_to_fingerprint_features( + x = smiles_to_fingerprint_features( smiles_list=smiles_list, - fingerprint_name=FingerprintNames["MORDRED"].name, + fingerprint_name=fingerprint_name, prefix="", - kwargs_conformer=None, - kwargs_fingerprint=None, + kwargs_conformer=kwargs_conformer, + kwargs_fingerprint=kwargs_fingerprint, ) - # Check that fingerprint embedding is of correct size and # fingerprint kwargs specifying embedding size are used - assert smiles_to_fingerprint_features( - smiles_list=smiles_list, - fingerprint_name=FingerprintNames["ECFP"].name, - prefix="", - kwargs_conformer=None, - kwargs_fingerprint={"fp_size": 64}, - ).shape == (len(smiles_list), 64) + assert x.shape[0] == len(smiles_list) + if "fp_size" in kwargs_fingerprint: + assert x.shape[1] == kwargs_fingerprint["fp_size"] From 46760e90e43ee6e2778d94ca3af96b9aaa789e4e Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 8 Oct 2024 14:15:15 +0200 Subject: [PATCH 37/87] Mention radius --- docs/userguide/parameters.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index c2daa4eb5..293ec6877 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -156,6 +156,7 @@ SubstanceParameter( }, encoding="ECFP", kwargs_fingerprint={ + "radius": 4, # Set maximum radius of resulting subgraphs "fp_size": 1024, # Change the number of computed bits }, ) From d71dd392e52ea6b2593b9e7d12f5e6acb51e896a Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 8 Oct 2024 14:28:30 +0200 Subject: [PATCH 38/87] Extent tests --- tests/test_fingerprints.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tests/test_fingerprints.py 
b/tests/test_fingerprints.py index 58c35b9c2..b69b8cb13 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -2,29 +2,40 @@ import pytest +from baybe.parameters.substance import SubstanceEncoding from baybe.utils.chemistry import smiles_to_fingerprint_features +test_lst = [ + (enc.name, {}, {}) + for enc in SubstanceEncoding + if enc is not SubstanceEncoding.MORGAN_FP # excluded due to deprecation +] + +print(test_lst) + @pytest.mark.parametrize( - "fingerprint_name,kwargs_fingerprint,kwargs_conformer", - [ - # Test fingerprint calculation with different kwargs - ("ECFP", {}, {}), + "name,kw_fp,kw_conf", + test_lst + + [ ("ECFP", {"fp_size": 64}, {}), + ("ECFP", {"fp_size": 512}, {}), + ("ECFP", {"radius": 4}, {}), + ("ECFP", {"fp_size": 512, "radius": 4}, {}), ("ECFP", {}, {"max_gen_attempts": 5000}), ], ) -def test_fingerprint_kwargs(fingerprint_name, kwargs_fingerprint, kwargs_conformer): +def test_fingerprint_kwargs(name, kw_fp, kw_conf): smiles_list = ["CC(N(C)C)=O", "CCCC#N"] x = smiles_to_fingerprint_features( smiles_list=smiles_list, - fingerprint_name=fingerprint_name, + fingerprint_name=name, prefix="", - kwargs_conformer=kwargs_conformer, - kwargs_fingerprint=kwargs_fingerprint, + kwargs_conformer=kw_conf, + kwargs_fingerprint=kw_fp, ) # Check that fingerprint embedding is of correct size and # fingerprint kwargs specifying embedding size are used assert x.shape[0] == len(smiles_list) - if "fp_size" in kwargs_fingerprint: - assert x.shape[1] == kwargs_fingerprint["fp_size"] + if "fp_size" in kw_fp: + assert x.shape[1] == kw_fp["fp_size"] From 2415bf2cdcf4799baae45fc0c5a42ea646b50940 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 8 Oct 2024 15:06:24 +0200 Subject: [PATCH 39/87] Adjust hypothesis --- tests/hypothesis_strategies/parameters.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/hypothesis_strategies/parameters.py b/tests/hypothesis_strategies/parameters.py index 
41648295f..a1944351f 100644 --- a/tests/hypothesis_strategies/parameters.py +++ b/tests/hypothesis_strategies/parameters.py @@ -140,11 +140,12 @@ def substance_parameters(draw: st.DrawFn): name = draw(parameter_names) data = draw(substance_data()) decorrelate = draw(decorrelations) - encoding = draw( - st.sampled_from( - list(SubstanceEncoding) # Check all fingerprint names - ) - ) + + # Ignore deprecated encodings + encodings = list(SubstanceEncoding) + encodings.remove(SubstanceEncoding.MORGAN_FP) + encoding = draw(st.sampled_from(encodings)) + return SubstanceParameter( name=name, data=data, decorrelate=decorrelate, encoding=encoding ) From 5a5afbd67e775e40455293742494ada5817ccddf Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 8 Oct 2024 17:06:59 +0200 Subject: [PATCH 40/87] Update full lookup example --- examples/Backtesting/full_lookup.py | 2 +- examples/Backtesting/full_lookup_dark.svg | 1180 +++++++++----------- examples/Backtesting/full_lookup_light.svg | 1180 +++++++++----------- 3 files changed, 1105 insertions(+), 1257 deletions(-) diff --git a/examples/Backtesting/full_lookup.py b/examples/Backtesting/full_lookup.py index 954c18e3c..a40e8a974 100644 --- a/examples/Backtesting/full_lookup.py +++ b/examples/Backtesting/full_lookup.py @@ -96,7 +96,7 @@ # First let us create three campaigns that each use a different chemical encoding to # treat substances. 
-substance_encodings = ["MORDRED", "RDKIT", "ECFP"] +substance_encodings = ["MORDRED", "PUBCHEM", "ECFP"] scenarios = { encoding: Campaign( searchspace=SearchSpace.from_product( diff --git a/examples/Backtesting/full_lookup_dark.svg b/examples/Backtesting/full_lookup_dark.svg index bda220f8e..440e19520 100644 --- a/examples/Backtesting/full_lookup_dark.svg +++ b/examples/Backtesting/full_lookup_dark.svg @@ -6,11 +6,11 @@ - 2024-08-02T17:36:40.679469 + 2024-10-08T16:10:40.948771 image/svg+xml - Matplotlib v3.9.1, https://matplotlib.org/ + Matplotlib v3.9.2, https://matplotlib.org/ @@ -41,269 +41,269 @@ z - - - + + - - - + + - - - + + - - - + + - - - + + - - + @@ -343,7 +343,7 @@ z - + @@ -394,7 +394,7 @@ z - + @@ -408,7 +408,7 @@ z - + @@ -448,7 +448,7 @@ z - + @@ -462,7 +462,7 @@ z - + @@ -510,7 +510,7 @@ z - + @@ -524,7 +524,7 @@ z - + @@ -837,17 +837,17 @@ z - - + - + @@ -856,12 +856,12 @@ L -3.5 0 - + - + @@ -870,12 +870,12 @@ L -3.5 0 - + - + @@ -884,12 +884,12 @@ L -3.5 0 - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - + - + - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - - - + @@ -1714,16 +1699,16 @@ z - + - + - + @@ -1821,113 +1806,95 @@ z - + - + - - + + - - + - - - - - - + + + + + + + - + - + - - + + - - - - - - - - - - - - + + + + - - + - + - + - - - - + @@ -2011,18 +1935,18 @@ z - - + - + - + - + @@ -2044,7 +1968,7 @@ L 247.029063 223.407813 - + diff --git a/examples/Backtesting/full_lookup_light.svg b/examples/Backtesting/full_lookup_light.svg index 58d251c47..66a9860ca 100644 --- 
a/examples/Backtesting/full_lookup_light.svg +++ b/examples/Backtesting/full_lookup_light.svg @@ -6,11 +6,11 @@ - 2024-08-02T17:36:40.703138 + 2024-10-08T16:10:40.979532 image/svg+xml - Matplotlib v3.9.1, https://matplotlib.org/ + Matplotlib v3.9.2, https://matplotlib.org/ @@ -41,269 +41,269 @@ z - - - + + - - - + + - - - + + - - - + + - - - + + - - + @@ -343,7 +343,7 @@ z - + @@ -394,7 +394,7 @@ z - + @@ -408,7 +408,7 @@ z - + @@ -448,7 +448,7 @@ z - + @@ -462,7 +462,7 @@ z - + @@ -510,7 +510,7 @@ z - + @@ -524,7 +524,7 @@ z - + @@ -837,17 +837,17 @@ z - - + - + @@ -856,12 +856,12 @@ L -3.5 0 - + - + @@ -870,12 +870,12 @@ L -3.5 0 - + - + @@ -884,12 +884,12 @@ L -3.5 0 - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - + - + - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - - - + @@ -1714,16 +1699,16 @@ z - + - + - + @@ -1821,113 +1806,95 @@ z - + - + - - + + - - + - - - - - - + + + + + + + - + - + - - + + - - - - - - - - - - - - + + + + - - + - + - + - - - - + @@ -2011,18 +1935,18 @@ z - - + - + - + - + @@ -2044,7 +1968,7 @@ L 247.029063 223.407813 - + From 8f04f8408fc957ad4ae0d5a1f4c39bddb6ddd011 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 8 Oct 2024 17:20:10 +0200 Subject: [PATCH 41/87] Fix CHANGELOG.md --- CHANGELOG.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 322625f8d..a6c3cc0c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] +### Changed +- `SubstanceParameter` encodings are now computed exclusively with the + `scikit-fingerprints` package, granting access to all fingerprints available therein + +### Deprecations +- `SubstanceEncoding` value `MORGAN_FP`. It is equivalent to `ECFP` with 1024 bits and + radius of 4 + ## [0.11.1] - 2024-10-01 ### Added - Continuous linear constraints have been consolidated in the new @@ -16,7 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Unsafe name-based matching of columns in `get_comp_rep_parameter_indices` -### Deprecated +### Deprecations - `ContinuousLinearEqualityConstraint` and `ContinuousLinearInequalityConstraint` replaced by `ContinuousLinearConstraint` with the corresponding `operator` keyword @@ -57,10 +66,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `to_tensor` now also handles `numpy` arrays - `MIN` mode of `NumericalTarget` is now implemented via the acquisition function instead of negating the computational representation -- Fingerprint encodings are now computed exclusively with the `scikit-fingerprints` package, - granting access to all fingerprints available therein. - The `SubstanceParameter` value `MORGAN_FP` is now an alias for `ECFP` - with 1024 features and radius of 4. 
- Search spaces now store their parameters in alphabetical order by name - Improvement-based acquisition functions now consider the maximum posterior mean instead of the maximum noisy measurement as reference value @@ -87,7 +92,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Deprecations - The role of `register_custom_architecture` has been taken over by `baybe.surrogates.base.SurrogateProtocol` -- The `SubstanceParameter` value `MORGAN_FP` will be deprecated in future versions. - `BayesianRecommender.surrogate_model` has been replaced with `get_surrogate` ## [0.10.0] - 2024-08-02 From 7704998bf46552f2571d2f22d597eff971a0778d Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 8 Oct 2024 17:43:56 +0200 Subject: [PATCH 42/87] Fix typing --- baybe/parameters/enum.py | 86 ++++++++++++++++++++-------------------- baybe/utils/chemistry.py | 6 +-- mypy.ini | 3 ++ 3 files changed, 49 insertions(+), 46 deletions(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 96fbffae5..98399e7cb 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -17,6 +17,13 @@ class CategoricalEncoding(ParameterEncoding): """Integer encoding.""" +class CustomEncoding(ParameterEncoding): + """Available encodings for custom parameters.""" + + CUSTOM = "CUSTOM" + """User-defined encoding.""" + + class SubstanceEncoding(ParameterEncoding): """Available encodings for substance parameters.""" @@ -55,46 +62,39 @@ class SubstanceEncoding(ParameterEncoding): WHIM = "WHIM" -class CustomEncoding(ParameterEncoding): - """Available encodings for custom parameters.""" - - CUSTOM = "CUSTOM" - """User-defined encoding.""" - - -class FingerprintNames(Enum): - """Mapping of substance parameter encoding names to fingerprint classes.""" - - ATOMPAIR = "AtomPairFingerprint" - AUTOCORR = "AutocorrFingerprint" - AVALON = "AvalonFingerprint" - E3FP = "E3FPFingerprint" - ECFP = "ECFPFingerprint" - MORGAN_FP = "ECFPFingerprint" - ERG = 
"ERGFingerprint" - ESTATE = "EStateFingerprint" - FUNCTIONALGROUPS = "FunctionalGroupsFingerprint" - GETAWAY = "GETAWAYFingerprint" - GHOSECRIPPEN = "GhoseCrippenFingerprint" - KLEKOTAROTH = "KlekotaRothFingerprint" - LAGGNER = "LaggnerFingerprint" - LAYERED = "LayeredFingerprint" - LINGO = "LingoFingerprint" - MACCS = "MACCSFingerprint" - MAP = "MAPFingerprint" - MHFP = "MHFPFingerprint" - MORSE = "MORSEFingerprint" - MQNS = "MQNsFingerprint" - MORDRED = "MordredFingerprint" - PATTERN = "PatternFingerprint" - PHARMACOPHORE = "PharmacophoreFingerprint" - PHYSIOCHEMICALPROPERTIES = "PhysiochemicalPropertiesFingerprint" - PUBCHEM = "PubChemFingerprint" - RDF = "RDFFingerprint" - RDKIT2DDESCRIPTORS = "RDKit2DDescriptorsFingerprint" - RDKIT = "RDKitFingerprint" - SECFP = "SECFPFingerprint" - TOPOLOGICALTORSION = "TopologicalTorsionFingerprint" - USRCAT = "USRCATFingerprint" - USR = "USRFingerprint" - WHIM = "WHIMFingerprint" +fingerprint_name_map: dict[str, str] = { + "ATOMPAIR": "AtomPairFingerprint", + "AUTOCORR": "AutocorrFingerprint", + "AVALON": "AvalonFingerprint", + "E3FP": "E3FPFingerprint", + "ECFP": "ECFPFingerprint", + "MORGAN_FP": "ECFPFingerprint", + "ERG": "ERGFingerprint", + "ESTATE": "EStateFingerprint", + "FUNCTIONALGROUPS": "FunctionalGroupsFingerprint", + "GETAWAY": "GETAWAYFingerprint", + "GHOSECRIPPEN": "GhoseCrippenFingerprint", + "KLEKOTAROTH": "KlekotaRothFingerprint", + "LAGGNER": "LaggnerFingerprint", + "LAYERED": "LayeredFingerprint", + "LINGO": "LingoFingerprint", + "MACCS": "MACCSFingerprint", + "MAP": "MAPFingerprint", + "MHFP": "MHFPFingerprint", + "MORSE": "MORSEFingerprint", + "MQNS": "MQNsFingerprint", + "MORDRED": "MordredFingerprint", + "PATTERN": "PatternFingerprint", + "PHARMACOPHORE": "PharmacophoreFingerprint", + "PHYSIOCHEMICALPROPERTIES": "PhysiochemicalPropertiesFingerprint", + "PUBCHEM": "PubChemFingerprint", + "RDF": "RDFFingerprint", + "RDKIT2DDESCRIPTORS": "RDKit2DDescriptorsFingerprint", + "RDKIT": "RDKitFingerprint", + 
"SECFP": "SECFPFingerprint", + "TOPOLOGICALTORSION": "TopologicalTorsionFingerprint", + "USRCAT": "USRCATFingerprint", + "USR": "USRFingerprint", + "WHIM": "WHIMFingerprint", +} +"""Mapping of substance parameter encoding names to fingerprint classes.""" diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index ddba01e2a..3c65b8abb 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -19,7 +19,7 @@ MolFromSmilesTransformer, skfp_fingerprints, ) -from baybe.parameters.enum import FingerprintNames +from baybe.parameters.enum import fingerprint_name_map from baybe.utils.numerical import DTypeFloatNumpy # Caching @@ -142,7 +142,7 @@ def smiles_to_fingerprint_features( def convert_fingeprint_parameters( name: str, kwargs_fingerprint: dict | None = None -) -> (str, dict): +) -> tuple[str, dict]: """Convert fingerprint name parameters for computing the fingerprint. Args: @@ -158,7 +158,7 @@ def convert_fingeprint_parameters( """ # Get fingerprint class try: - fp_class = FingerprintNames[name].value + fp_class = fingerprint_name_map[name] except KeyError: raise KeyError(f"Fingerprint name {name} is not valid.") diff --git a/mypy.ini b/mypy.ini index 5fd92738c..ad0c54104 100644 --- a/mypy.ini +++ b/mypy.ini @@ -42,6 +42,9 @@ ignore_missing_imports = True [mypy-scipy.stats] ignore_missing_imports = True +[mypy-skfp.*] +ignore_missing_imports = True + [mypy-sklearn.*] ignore_missing_imports = True From 580f8ae9c10ac1825d8143190002285e0d08dcef Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 8 Oct 2024 17:50:22 +0200 Subject: [PATCH 43/87] Fix optional tests --- tests/test_deprecations.py | 7 ++++++- tests/test_fingerprints.py | 12 ++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index 832fe4db5..82072d16c 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -5,6 +5,7 @@ import pandas as pd import pytest +from baybe._optional.info 
import CHEM_INSTALLED from baybe.acquisition.base import AcquisitionFunction from baybe.constraints import ( ContinuousLinearConstraint, @@ -24,7 +25,6 @@ ) from baybe.searchspace.continuous import SubspaceContinuous from baybe.targets.numerical import NumericalTarget -from baybe.utils.chemistry import convert_fingeprint_parameters def test_objective_class(): @@ -131,8 +131,13 @@ def test_surrogate_registration(): register_custom_architecture() +@pytest.mark.skipif( + not CHEM_INSTALLED, reason="Optional chem dependency not installed." +) def test_deprecated_morgan_fp(acqf): """Deprecated fingerprint name raises warning and uses ECFP replacement.""" + from baybe.utils.chemistry import convert_fingeprint_parameters + with pytest.warns(DeprecationWarning): # Check that ECFP is used instead of Morgan with correct pre-defined kwargs morgan_class, morgan_kwargs = convert_fingeprint_parameters( diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py index b69b8cb13..3a20d72b3 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -1,9 +1,9 @@ -"""Test for fingerprint generation.""" +"""Tests for fingerprint generation.""" import pytest +from baybe._optional.info import CHEM_INSTALLED from baybe.parameters.substance import SubstanceEncoding -from baybe.utils.chemistry import smiles_to_fingerprint_features test_lst = [ (enc.name, {}, {}) @@ -11,9 +11,10 @@ if enc is not SubstanceEncoding.MORGAN_FP # excluded due to deprecation ] -print(test_lst) - +@pytest.mark.skipif( + not CHEM_INSTALLED, reason="Optional chem dependency not installed." 
+) @pytest.mark.parametrize( "name,kw_fp,kw_conf", test_lst @@ -26,6 +27,9 @@ ], ) def test_fingerprint_kwargs(name, kw_fp, kw_conf): + """Test all fingerprint computations.""" + from baybe.utils.chemistry import smiles_to_fingerprint_features + smiles_list = ["CC(N(C)C)=O", "CCCC#N"] x = smiles_to_fingerprint_features( smiles_list=smiles_list, From cfdcfcc5074c19eff835660dda2b51c5f7cf9606 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 8 Oct 2024 18:10:21 +0200 Subject: [PATCH 44/87] Update lockfile --- .lockfiles/py310-dev.lock | 55 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/.lockfiles/py310-dev.lock b/.lockfiles/py310-dev.lock index 2d71d23e8..f5da67e09 100644 --- a/.lockfiles/py310-dev.lock +++ b/.lockfiles/py310-dev.lock @@ -103,6 +103,8 @@ cyclonedx-python-lib==7.5.1 # via pip-audit dask==2024.7.1 # via xyzpy +datasketch==1.6.5 + # via scikit-fingerprints debugpy==1.8.2 # via ipykernel decorator==5.1.1 @@ -116,6 +118,8 @@ deprecated==1.2.14 # opentelemetry-api # opentelemetry-exporter-otlp-proto-grpc # opentelemetry-exporter-otlp-proto-http +descriptastorus==2.6.1 + # via scikit-fingerprints distlib==0.3.8 # via virtualenv docstring-parser-fork==0.0.9 @@ -126,6 +130,8 @@ docutils==0.21.2 # pybtex-docutils # sphinx # sphinxcontrib-bibtex +e3fp==1.2.5 + # via scikit-fingerprints et-xmlfile==1.1.0 # via openpyxl exceptiongroup==1.2.2 @@ -143,6 +149,7 @@ fastjsonschema==2.20.0 filelock==3.15.4 # via # cachecontrol + # huggingface-hub # torch # tox # triton @@ -160,6 +167,7 @@ fqdn==1.5.1 fsspec==2024.6.1 # via # dask + # huggingface-hub # torch funcy==1.17 # via @@ -199,6 +207,8 @@ httpcore==1.0.5 # via httpx httpx==0.27.0 # via jupyterlab +huggingface-hub==0.25.1 + # via scikit-fingerprints humanfriendly==10.0 # via coloredlogs hypothesis==6.108.4 @@ -256,6 +266,7 @@ jinja2==3.1.4 joblib==1.4.2 # via # baybe (pyproject.toml) + # scikit-fingerprints # scikit-learn # xyzpy json5==0.9.25 @@ -329,6 
+340,8 @@ linear-operator==0.5.2 # via # botorch # gpytorch +llvmlite==0.43.0 + # via numba locket==1.0.0 # via partd markdown-it-py==3.0.0 @@ -363,8 +376,10 @@ mistune==3.0.2 # via nbconvert mkl==2021.4.0 ; platform_system == 'Windows' # via torch +mmh3==5.0.1 + # via e3fp mordredcommunity==2.0.6 - # via baybe (pyproject.toml) + # via scikit-fingerprints mpmath==1.3.0 # via # botorch @@ -408,6 +423,8 @@ notebook-shim==0.2.4 # via # jupyterlab # notebook +numba==0.60.0 + # via scikit-fingerprints numpy==1.26.4 # via # baybe (pyproject.toml) @@ -415,12 +432,16 @@ numpy==1.26.4 # autograd # botorch # contourpy + # datasketch + # descriptastorus + # e3fp # formulaic # h5py # lifelines # matplotlib # mordredcommunity # ngboost + # numba # onnx # onnxconverter-common # onnxruntime @@ -431,6 +452,7 @@ numpy==1.26.4 # pydeck # pyro-ppl # rdkit + # scikit-fingerprints # scikit-learn # scikit-learn-extra # scipy @@ -526,6 +548,7 @@ packaging==24.1 # altair # dask # h5netcdf + # huggingface-hub # ipykernel # jupyter-server # jupyterlab @@ -556,10 +579,14 @@ pandas==2.2.2 # formulaic # hypothesis # lifelines + # pandas-flavor + # scikit-fingerprints # seaborn # streamlit # xarray # xyzpy +pandas-flavor==0.6.0 + # via descriptastorus pandas-stubs==2.2.2.240603 # via # baybe (pyproject.toml) @@ -695,6 +722,7 @@ pywinpty==2.0.13 ; os_name == 'nt' pyyaml==6.0.1 # via # dask + # huggingface-hub # jupyter-events # jupytext # myst-parser @@ -713,8 +741,9 @@ qtpy==2.4.1 # via qtconsole rdkit==2024.3.3 # via - # baybe (pyproject.toml) + # descriptastorus # mordredcommunity + # scikit-fingerprints referencing==0.35.1 # via # jsonschema @@ -723,6 +752,7 @@ referencing==0.35.1 requests==2.32.3 # via # cachecontrol + # huggingface-hub # jupyterlab-server # opentelemetry-exporter-otlp-proto-http # pip-audit @@ -746,11 +776,14 @@ rpds-py==0.19.0 # referencing ruff==0.5.2 # via baybe (pyproject.toml) +scikit-fingerprints==1.9.0 + # via baybe (pyproject.toml) scikit-learn==1.5.1 # via # baybe 
(pyproject.toml) # gpytorch # ngboost + # scikit-fingerprints # scikit-learn-extra # skl2onnx scikit-learn-extra==0.3.0 @@ -760,13 +793,19 @@ scipy==1.14.0 # baybe (pyproject.toml) # autograd-gamma # botorch + # datasketch + # descriptastorus + # e3fp # formulaic # gpytorch # lifelines # linear-operator # ngboost + # scikit-fingerprints # scikit-learn # scikit-learn-extra +sdaxen-python-utilities==0.1.5 + # via e3fp seaborn==0.13.2 # via baybe (pyproject.toml) send2trash==1.8.3 @@ -788,6 +827,8 @@ six==1.16.0 # rfc3339-validator skl2onnx==1.17.0 # via baybe (pyproject.toml) +smart-open==7.0.5 + # via e3fp smmap==5.0.1 # via gitdb sniffio==1.3.1 @@ -860,7 +901,7 @@ toml==0.10.2 # via # pip-audit # streamlit -tomli==2.0.1 ; python_full_version == '3.11' +tomli==2.0.1 ; python_full_version <= '3.11' # via # coverage # jupyterlab @@ -899,8 +940,10 @@ tox-uv==1.9.1 # via baybe (pyproject.toml) tqdm==4.66.4 # via + # huggingface-hub # ngboost # pyro-ppl + # scikit-fingerprints # xyzpy traitlets==5.14.3 # via @@ -940,6 +983,7 @@ typing-extensions==4.12.2 # cattrs # formulaic # funcy-stubs + # huggingface-hub # ipython # mypy # opentelemetry-sdk @@ -978,8 +1022,11 @@ wrapt==1.16.0 # via # deprecated # formulaic + # smart-open xarray==2024.6.0 - # via xyzpy + # via + # pandas-flavor + # xyzpy xyzpy==1.2.1 # via baybe (pyproject.toml) zipp==3.19.2 From 8428afa006f97d0afbbbb3245f65f041355ae512 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 8 Oct 2024 18:51:38 +0200 Subject: [PATCH 45/87] Pin problematic package --- .lockfiles/py310-dev.lock | 4 +++- pyproject.toml | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.lockfiles/py310-dev.lock b/.lockfiles/py310-dev.lock index f5da67e09..51bdbb695 100644 --- a/.lockfiles/py310-dev.lock +++ b/.lockfiles/py310-dev.lock @@ -119,7 +119,9 @@ deprecated==1.2.14 # opentelemetry-exporter-otlp-proto-grpc # opentelemetry-exporter-otlp-proto-http descriptastorus==2.6.1 - # via scikit-fingerprints + # via + # baybe 
(pyproject.toml) + # scikit-fingerprints distlib==0.3.8 # via virtualenv docstring-parser-fork==0.0.9 diff --git a/pyproject.toml b/pyproject.toml index 2be892541..a0cd6b8b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,7 @@ Issues = "https://github.com/emdgroup/baybe/issues/" [project.optional-dependencies] chem = [ "scikit-fingerprints>=1.7.0", + "descriptastorus==2.6.1", # secondary dep, newer versions cause uv install issues ] onnx = [ From 57772eabc75fa5e00da0befcc81f78c13dee7840 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 8 Oct 2024 19:09:29 +0200 Subject: [PATCH 46/87] Change autodoc setting --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index 79822b084..93f611eb0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -268,6 +268,7 @@ autodoc_default_options = { # Order by type (function, attribute...), required for proper inheritance "member-order": "groupwise", + "undoc-members": True, # Include members without docstrings } # Only show parameters that are documented. 
autodoc_typehints_description_target = "documented_params" From 9b4f550d26e5949279e435476f1c9d142595a6d9 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Wed, 9 Oct 2024 12:14:51 +0200 Subject: [PATCH 47/87] Fix enum documentation --- baybe/parameters/enum.py | 65 ++++++++++++++++++++++++++++++++++++++++ docs/conf.py | 1 - 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 98399e7cb..517ca0246 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -28,38 +28,103 @@ class SubstanceEncoding(ParameterEncoding): """Available encodings for substance parameters.""" ATOMPAIR = "ATOMPAIR" + """AtomPairFingerprint from scikit-fingerprints.""" + AUTOCORR = "AUTOCORR" + """AutocorrFingerprint from scikit-fingerprints.""" + AVALON = "AVALON" + """AvalonFingerprint from scikit-fingerprints.""" + E3FP = "E3FP" + """E3FPFingerprint from scikit-fingerprints.""" + ECFP = "ECFP" + """ECFPFingerprint from scikit-fingerprints.""" + MORGAN_FP = "MORGAN_FP" + """Deprecated!""" + ERG = "ERG" + """ERGFingerprint from scikit-fingerprints.""" + ESTATE = "ESTATE" + """EStateFingerprint from scikit-fingerprints.""" + FUNCTIONALGROUPS = "FUNCTIONALGROUPS" + """FunctionalGroupsFingerprint from scikit-fingerprints.""" + GETAWAY = "GETAWAY" + """GETAWAYFingerprint from scikit-fingerprints.""" + GHOSECRIPPEN = "GHOSECRIPPEN" + """GhoseCrippenFingerprint from scikit-fingerprints.""" + KLEKOTAROTH = "KLEKOTAROTH" + """KlekotaRothFingerprint from scikit-fingerprints.""" + LAGGNER = "LAGGNER" + """LaggnerFingerprint from scikit-fingerprints.""" + LAYERED = "LAYERED" + """LayeredFingerprint from scikit-fingerprints.""" + LINGO = "LINGO" + """LingoFingerprint from scikit-fingerprints.""" + MACCS = "MACCS" + """MACCSFingerprint from scikit-fingerprints.""" + MAP = "MAP" + """MAPFingerprint from scikit-fingerprints.""" + MHFP = "MHFP" + """MHFPFingerprint from scikit-fingerprints.""" + MORSE = "MORSE" + 
"""MORSEFingerprint from scikit-fingerprints.""" + MQNS = "MQNS" + """MQNsFingerprint from scikit-fingerprints.""" + MORDRED = "MORDRED" + """MordredFingerprint from scikit-fingerprints.""" + PATTERN = "PATTERN" + """PatternFingerprint from scikit-fingerprints.""" + PHARMACOPHORE = "PHARMACOPHORE" + """PharmacophoreFingerprint from scikit-fingerprints.""" + PHYSIOCHEMICALPROPERTIES = "PHYSIOCHEMICALPROPERTIES" + """PhysiochemicalPropertiesFingerprint from scikit-fingerprints.""" + PUBCHEM = "PUBCHEM" + """PubChemFingerprint from scikit-fingerprints.""" + RDF = "RDF" + """RDFFingerprint from scikit-fingerprints.""" + RDKIT2DDESCRIPTORS = "RDKIT2DDESCRIPTORS" + """RDKit2DDescriptorsFingerprint from scikit-fingerprints.""" + RDKIT = "RDKIT" + """RDKitFingerprint from scikit-fingerprints.""" + SECFP = "SECFP" + """SECFPFingerprint from scikit-fingerprints.""" + TOPOLOGICALTORSION = "TOPOLOGICALTORSION" + """TopologicalTorsionFingerprint from scikit-fingerprints.""" + USRCAT = "USRCAT" + """USRFingerprint from scikit-fingerprints.""" + USR = "USR" + """USRCATFingerprint from scikit-fingerprints.""" + WHIM = "WHIM" + """WHIMFingerprint from scikit-fingerprints.""" fingerprint_name_map: dict[str, str] = { diff --git a/docs/conf.py b/docs/conf.py index 93f611eb0..79822b084 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -268,7 +268,6 @@ autodoc_default_options = { # Order by type (function, attribute...), required for proper inheritance "member-order": "groupwise", - "undoc-members": True, # Include members without docstrings } # Only show parameters that are documented. 
autodoc_typehints_description_target = "documented_params" From a39f3be5a8481213a25ccd7d8425f31eafe1af6c Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Wed, 9 Oct 2024 14:28:41 +0200 Subject: [PATCH 48/87] Deprecate RDKIT encoding --- CHANGELOG.md | 3 ++- baybe/parameters/enum.py | 14 ++++++---- baybe/utils/chemistry.py | 26 +++++++++--------- tests/hypothesis_strategies/parameters.py | 1 + tests/test_deprecations.py | 33 +++++++++++++---------- tests/test_fingerprints.py | 9 +++++-- 6 files changed, 52 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6c3cc0c2..bf6897dae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,8 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `scikit-fingerprints` package, granting access to all fingerprints available therein ### Deprecations -- `SubstanceEncoding` value `MORGAN_FP`. It is equivalent to `ECFP` with 1024 bits and +- `SubstanceEncoding` value `MORGAN_FP`. Please use `ECFP` with 1024 bits and radius of 4 +- `SubstanceEncoding` value `RDKIT`. Please use `RDKIT2DDESCRIPTORS`. 
## [0.11.1] - 2024-10-01 ### Added diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 517ca0246..ff9fee044 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -105,12 +105,15 @@ class SubstanceEncoding(ParameterEncoding): RDF = "RDF" """RDFFingerprint from scikit-fingerprints.""" - RDKIT2DDESCRIPTORS = "RDKIT2DDESCRIPTORS" - """RDKit2DDescriptorsFingerprint from scikit-fingerprints.""" - RDKIT = "RDKIT" + """Deprecated!""" + + RDKITFINGERPRINT = "RDKITFINGERPRINT" """RDKitFingerprint from scikit-fingerprints.""" + RDKIT2DDESCRIPTORS = "RDKIT2DDESCRIPTORS" + """RDKit2DDescriptorsFingerprint from scikit-fingerprints.""" + SECFP = "SECFP" """SECFPFingerprint from scikit-fingerprints.""" @@ -133,7 +136,6 @@ class SubstanceEncoding(ParameterEncoding): "AVALON": "AvalonFingerprint", "E3FP": "E3FPFingerprint", "ECFP": "ECFPFingerprint", - "MORGAN_FP": "ECFPFingerprint", "ERG": "ERGFingerprint", "ESTATE": "EStateFingerprint", "FUNCTIONALGROUPS": "FunctionalGroupsFingerprint", @@ -146,6 +148,7 @@ class SubstanceEncoding(ParameterEncoding): "MACCS": "MACCSFingerprint", "MAP": "MAPFingerprint", "MHFP": "MHFPFingerprint", + "MORGAN_FP": "ECFPFingerprint", # Deprecated! "MORSE": "MORSEFingerprint", "MQNS": "MQNsFingerprint", "MORDRED": "MordredFingerprint", @@ -154,8 +157,9 @@ class SubstanceEncoding(ParameterEncoding): "PHYSIOCHEMICALPROPERTIES": "PhysiochemicalPropertiesFingerprint", "PUBCHEM": "PubChemFingerprint", "RDF": "RDFFingerprint", + "RDKIT": "RDKit2DDescriptorsFingerprint", # Deprecated! 
+ "RDKITFINGERPRINT": "RDKitFingerprint", "RDKIT2DDESCRIPTORS": "RDKit2DDescriptorsFingerprint", - "RDKIT": "RDKitFingerprint", "SECFP": "SECFPFingerprint", "TOPOLOGICALTORSION": "TopologicalTorsionFingerprint", "USRCAT": "USRCATFingerprint", diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 3c65b8abb..d44f9f1c9 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -146,9 +146,9 @@ def convert_fingeprint_parameters( """Convert fingerprint name parameters for computing the fingerprint. Args: - name: Name of fingerprint. - kwargs_fingerprint: Optional user-specified params - for computing the fingerprint. + name: Name of the fingerprint. + kwargs_fingerprint: Optional user-specified settings for computing the + fingerprint. Raises: KeyError: If fingerprint name is not recognized. @@ -156,18 +156,16 @@ def convert_fingeprint_parameters( Returns: Fingerprint class name and kwargs to use for the fingerprint computation. """ + kwargs_fingerprint = kwargs_fingerprint or {} + # Get fingerprint class try: fp_class = fingerprint_name_map[name] except KeyError: - raise KeyError(f"Fingerprint name {name} is not valid.") - - # For backwards-compatibility purposes + raise KeyError(f"Substance encoding {name} is not valid.") - # Update default kwargs to match the fingerprint name when - # using a different fingerprint class to compute the desired fingerprint + # For deprecation purposes kwargs_fp_update = {} - kwargs_fingerprint = {} if not kwargs_fingerprint else kwargs_fingerprint if name == "MORGAN_FP": warnings.warn( "Substance encoding 'MORGAN_FP' is deprecated and will be disabled in " @@ -178,9 +176,13 @@ def convert_fingeprint_parameters( "fp_size": 1024, "radius": 4, } - # Update kwargs with fingerprint-specific defaults - # If a kwarg is specified in the input it overrides the fingerprint default - kwargs_fingerprint = {**kwargs_fp_update, **kwargs_fingerprint} + elif name == "RDKIT": + warnings.warn( + "Substance encoding 'RDKIT' 
is deprecated and will be disabled in " + "a future version. Use 'RDKIT2DDESCRIPTORS' instead.", + DeprecationWarning, + ) + kwargs_fingerprint.update(kwargs_fp_update) return fp_class, kwargs_fingerprint diff --git a/tests/hypothesis_strategies/parameters.py b/tests/hypothesis_strategies/parameters.py index a1944351f..8b56bc1ca 100644 --- a/tests/hypothesis_strategies/parameters.py +++ b/tests/hypothesis_strategies/parameters.py @@ -144,6 +144,7 @@ def substance_parameters(draw: st.DrawFn): # Ignore deprecated encodings encodings = list(SubstanceEncoding) encodings.remove(SubstanceEncoding.MORGAN_FP) + encodings.remove(SubstanceEncoding.RDKIT) encoding = draw(st.sampled_from(encodings)) return SubstanceParameter( diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index 82072d16c..6df7b92bd 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -4,6 +4,7 @@ import pandas as pd import pytest +from pytest import param from baybe._optional.info import CHEM_INSTALLED from baybe.acquisition.base import AcquisitionFunction @@ -131,29 +132,33 @@ def test_surrogate_registration(): register_custom_architecture() +@pytest.mark.parametrize( + "deprecated,expected", + [ + param("MORGAN_FP", "ECFP", id="morgan"), + param("RDKIT", "RDKIT2DDESCRIPTORS", id="rdkit"), + ], +) @pytest.mark.skipif( not CHEM_INSTALLED, reason="Optional chem dependency not installed." 
) -def test_deprecated_morgan_fp(acqf): +def test_deprecated_encodings(deprecated, expected): """Deprecated fingerprint name raises warning and uses ECFP replacement.""" from baybe.utils.chemistry import convert_fingeprint_parameters with pytest.warns(DeprecationWarning): - # Check that ECFP is used instead of Morgan with correct pre-defined kwargs - morgan_class, morgan_kwargs = convert_fingeprint_parameters( - name=SubstanceEncoding("MORGAN_FP").name, kwargs_fingerprint=None + # Check that equivalent is used instead of deprecated encoding + deprecated_cls, fp_kwargs = convert_fingeprint_parameters( + name=SubstanceEncoding(deprecated).name, kwargs_fingerprint=None ) - ecfp_class, _ = convert_fingeprint_parameters( - name=SubstanceEncoding("ECFP").name, kwargs_fingerprint=None - ) - assert morgan_class == ecfp_class - assert morgan_kwargs == {"fp_size": 1024, "radius": 4} - # Check that user-specified kwargs override the defaults - _, morgan_custom_kwargs = convert_fingeprint_parameters( - name=SubstanceEncoding("MORGAN_FP").name, kwargs_fingerprint={"radius": 5} - ) - assert morgan_custom_kwargs == {"fp_size": 1024, "radius": 5} + expected_cls, _ = convert_fingeprint_parameters( + name=SubstanceEncoding(expected).name, kwargs_fingerprint=None + ) + assert deprecated_cls == expected_cls + + if deprecated == "MORGAN_FP": + assert fp_kwargs == {"fp_size": 1024, "radius": 4} def test_surrogate_access(): diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py index 3a20d72b3..e8a5d26ef 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -8,7 +8,11 @@ test_lst = [ (enc.name, {}, {}) for enc in SubstanceEncoding - if enc is not SubstanceEncoding.MORGAN_FP # excluded due to deprecation + if enc + not in { # Ignore deprecated encodings + SubstanceEncoding.MORGAN_FP, + SubstanceEncoding.RDKIT, + } ] @@ -18,7 +22,7 @@ @pytest.mark.parametrize( "name,kw_fp,kw_conf", test_lst - + [ + + [ # Add some custom tests ("ECFP", {"fp_size": 64}, 
{}), ("ECFP", {"fp_size": 512}, {}), ("ECFP", {"radius": 4}, {}), @@ -38,6 +42,7 @@ def test_fingerprint_kwargs(name, kw_fp, kw_conf): kwargs_conformer=kw_conf, kwargs_fingerprint=kw_fp, ) + # Check that fingerprint embedding is of correct size and # fingerprint kwargs specifying embedding size are used assert x.shape[0] == len(smiles_list) From 1179f41d0ddcb452b13ed3b2a7239b49f763be3c Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Wed, 9 Oct 2024 14:40:39 +0200 Subject: [PATCH 49/87] Revert "Pin problematic package" This reverts commit 8428afa006f97d0afbbbb3245f65f041355ae512. --- .lockfiles/py310-dev.lock | 4 +--- pyproject.toml | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.lockfiles/py310-dev.lock b/.lockfiles/py310-dev.lock index 51bdbb695..f5da67e09 100644 --- a/.lockfiles/py310-dev.lock +++ b/.lockfiles/py310-dev.lock @@ -119,9 +119,7 @@ deprecated==1.2.14 # opentelemetry-exporter-otlp-proto-grpc # opentelemetry-exporter-otlp-proto-http descriptastorus==2.6.1 - # via - # baybe (pyproject.toml) - # scikit-fingerprints + # via scikit-fingerprints distlib==0.3.8 # via virtualenv docstring-parser-fork==0.0.9 diff --git a/pyproject.toml b/pyproject.toml index a0cd6b8b3..2be892541 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,6 @@ Issues = "https://github.com/emdgroup/baybe/issues/" [project.optional-dependencies] chem = [ "scikit-fingerprints>=1.7.0", - "descriptastorus==2.6.1", # secondary dep, newer versions cause uv install issues ] onnx = [ From 694ab0620dc6b97771c2515adcd654a76b10a2fd Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Wed, 9 Oct 2024 14:59:52 +0200 Subject: [PATCH 50/87] Update full lookup --- examples/Backtesting/full_lookup.py | 2 +- examples/Backtesting/full_lookup_dark.svg | 1149 ++++++++++---------- examples/Backtesting/full_lookup_light.svg | 1149 ++++++++++---------- 3 files changed, 1193 insertions(+), 1107 deletions(-) diff --git a/examples/Backtesting/full_lookup.py 
b/examples/Backtesting/full_lookup.py index a40e8a974..8f0dd6238 100644 --- a/examples/Backtesting/full_lookup.py +++ b/examples/Backtesting/full_lookup.py @@ -96,7 +96,7 @@ # First let us create three campaigns that each use a different chemical encoding to # treat substances. -substance_encodings = ["MORDRED", "PUBCHEM", "ECFP"] +substance_encodings = ["MORDRED", "RDKIT2DDESCRIPTORS", "ECFP"] scenarios = { encoding: Campaign( searchspace=SearchSpace.from_product( diff --git a/examples/Backtesting/full_lookup_dark.svg b/examples/Backtesting/full_lookup_dark.svg index 440e19520..d2bbb8447 100644 --- a/examples/Backtesting/full_lookup_dark.svg +++ b/examples/Backtesting/full_lookup_dark.svg @@ -6,7 +6,7 @@ - 2024-10-08T16:10:40.948771 + 2024-10-09T14:57:50.630496 image/svg+xml @@ -41,269 +41,269 @@ z - - - + + - - - + + - - - + + - - - + + - - - + + - - + @@ -343,7 +343,7 @@ z - + @@ -394,7 +394,7 @@ z - + @@ -408,7 +408,7 @@ z - + @@ -448,7 +448,7 @@ z - + @@ -462,7 +462,7 @@ z - + @@ -510,7 +510,7 @@ z - + @@ -524,7 +524,7 @@ z - + @@ -837,17 +837,17 @@ z - - + - + @@ -856,12 +856,12 @@ L -3.5 0 - + - + @@ -870,12 +870,12 @@ L -3.5 0 - + - + @@ -884,12 +884,12 @@ L -3.5 0 - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + - + - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - + - + - - + - + - + - + - - + - + - - - + + + + + + - - - - - - - - - + + + + + + + + + + + + + + + + + + - - + - + - + - + - - + - + - + - + + + + @@ -1935,18 +1978,18 @@ L 247.029063 196.504063 - - + - + - + - + @@ -1968,7 +2011,7 @@ L 247.029063 222.795938 - + 
diff --git a/examples/Backtesting/full_lookup_light.svg b/examples/Backtesting/full_lookup_light.svg index 66a9860ca..a766d7671 100644 --- a/examples/Backtesting/full_lookup_light.svg +++ b/examples/Backtesting/full_lookup_light.svg @@ -6,7 +6,7 @@ - 2024-10-08T16:10:40.979532 + 2024-10-09T14:57:50.668380 image/svg+xml @@ -41,269 +41,269 @@ z - - - + + - - - + + - - - + + - - - + + - - - + + - - + @@ -343,7 +343,7 @@ z - + @@ -394,7 +394,7 @@ z - + @@ -408,7 +408,7 @@ z - + @@ -448,7 +448,7 @@ z - + @@ -462,7 +462,7 @@ z - + @@ -510,7 +510,7 @@ z - + @@ -524,7 +524,7 @@ z - + @@ -837,17 +837,17 @@ z - - + - + @@ -856,12 +856,12 @@ L -3.5 0 - + - + @@ -870,12 +870,12 @@ L -3.5 0 - + - + @@ -884,12 +884,12 @@ L -3.5 0 - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + - + - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - + - + - - + - + - + - + - - + - + - - - + + + + + + - - - - - - - - - + + + + + + + + + + + + + + + + + + - - + - + - + - + - - + - + - + - + + + + @@ -1935,18 +1978,18 @@ L 247.029063 196.504063 - - + - + - + - + @@ -1968,7 +2011,7 @@ L 247.029063 222.795938 - + From b8d9d0d003cc4f1193a53f683ca67112af307f55 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 16 Oct 2024 10:22:14 +0200 Subject: [PATCH 51/87] Fix enum entries --- baybe/parameters/enum.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index ff9fee044..323d88e27 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -120,10 +120,10 @@ class 
SubstanceEncoding(ParameterEncoding): TOPOLOGICALTORSION = "TOPOLOGICALTORSION" """TopologicalTorsionFingerprint from scikit-fingerprints.""" - USRCAT = "USRCAT" + USR = "USR" """USRFingerprint from scikit-fingerprints.""" - USR = "USR" + USRCAT = "USRCAT" """USRCATFingerprint from scikit-fingerprints.""" WHIM = "WHIM" From f5f0de0bcaa1c613746dfa6c314a5e6e66dfe4e4 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 16 Oct 2024 10:26:58 +0200 Subject: [PATCH 52/87] Fix kwargs attributes * Add validators * Add type annotations * Refine docstrings --- baybe/parameters/substance.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/baybe/parameters/substance.py b/baybe/parameters/substance.py index 0b6ddd89b..2a3c8b69b 100644 --- a/baybe/parameters/substance.py +++ b/baybe/parameters/substance.py @@ -62,11 +62,13 @@ class SubstanceParameter(DiscreteParameter): ) # See base class. - kwargs_fingerprint: dict = field(factory=dict) - """Kwargs for fingerprint generator""" + kwargs_fingerprint: dict[str, Any] = field( + factory=dict, validator=instance_of(dict) + ) + """Keyword arguments passed to fingerprint generator.""" - kwargs_conformer: dict = field(factory=dict) - """Kwargs for conformer generator""" + kwargs_conformer: dict[str, Any] = field(factory=dict, validator=instance_of(dict)) + """Keyword arguments passed to conformer generator.""" @data.validator def _validate_substance_data( # noqa: DOC101, DOC103 From db09ea7f4568d83d4684bf98093410689cadbdc6 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 16 Oct 2024 11:23:30 +0200 Subject: [PATCH 53/87] Simplify fingerprint class lookup --- baybe/parameters/enum.py | 39 -------------------- baybe/utils/chemistry.py | 80 +++++++++++++++++++--------------------- 2 files changed, 37 insertions(+), 82 deletions(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 323d88e27..4c36cfdd1 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -128,42 +128,3 
@@ class SubstanceEncoding(ParameterEncoding): WHIM = "WHIM" """WHIMFingerprint from scikit-fingerprints.""" - - -fingerprint_name_map: dict[str, str] = { - "ATOMPAIR": "AtomPairFingerprint", - "AUTOCORR": "AutocorrFingerprint", - "AVALON": "AvalonFingerprint", - "E3FP": "E3FPFingerprint", - "ECFP": "ECFPFingerprint", - "ERG": "ERGFingerprint", - "ESTATE": "EStateFingerprint", - "FUNCTIONALGROUPS": "FunctionalGroupsFingerprint", - "GETAWAY": "GETAWAYFingerprint", - "GHOSECRIPPEN": "GhoseCrippenFingerprint", - "KLEKOTAROTH": "KlekotaRothFingerprint", - "LAGGNER": "LaggnerFingerprint", - "LAYERED": "LayeredFingerprint", - "LINGO": "LingoFingerprint", - "MACCS": "MACCSFingerprint", - "MAP": "MAPFingerprint", - "MHFP": "MHFPFingerprint", - "MORGAN_FP": "ECFPFingerprint", # Deprecated! - "MORSE": "MORSEFingerprint", - "MQNS": "MQNsFingerprint", - "MORDRED": "MordredFingerprint", - "PATTERN": "PatternFingerprint", - "PHARMACOPHORE": "PharmacophoreFingerprint", - "PHYSIOCHEMICALPROPERTIES": "PhysiochemicalPropertiesFingerprint", - "PUBCHEM": "PubChemFingerprint", - "RDF": "RDFFingerprint", - "RDKIT": "RDKit2DDescriptorsFingerprint", # Deprecated! 
- "RDKITFINGERPRINT": "RDKitFingerprint", - "RDKIT2DDESCRIPTORS": "RDKit2DDescriptorsFingerprint", - "SECFP": "SECFPFingerprint", - "TOPOLOGICALTORSION": "TopologicalTorsionFingerprint", - "USRCAT": "USRCATFingerprint", - "USR": "USRFingerprint", - "WHIM": "WHIMFingerprint", -} -"""Mapping of substance parameter encoding names to fingerprint classes.""" diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index d44f9f1c9..c85b1b53a 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -19,7 +19,7 @@ MolFromSmilesTransformer, skfp_fingerprints, ) -from baybe.parameters.enum import fingerprint_name_map +from baybe.parameters.enum import SubstanceEncoding from baybe.utils.numerical import DTypeFloatNumpy # Caching @@ -109,14 +109,30 @@ def smiles_to_fingerprint_features( Returns: Dataframe containing fingerprints for each SMILES string. """ - fingerprint_cls, kwargs_fingerprint = convert_fingeprint_parameters( - name=fingerprint_name, kwargs_fingerprint=kwargs_fingerprint - ) + kwargs_fingerprint = kwargs_fingerprint or {} kwargs_conformer = kwargs_conformer or {} - fingerprint_encoder = getattr(skfp_fingerprints, fingerprint_cls)( - **kwargs_fingerprint - ) + if fingerprint_name == "MORGAN_FP": + warnings.warn( + f"Substance encoding 'MORGAN_FP' is deprecated and will be disabled in " + f"a future version. Use '{SubstanceEncoding.ECFP.name}' " + f"with 'fp_size' 1204 and 'radius' 4 instead.", + DeprecationWarning, + ) + fingerprint_name = SubstanceEncoding.ECFP.name + kwargs_fingerprint.update({"fp_size": 1024, "radius": 4}) + + elif fingerprint_name == "RDKIT": + warnings.warn( + f"Substance encoding 'RDKIT' is deprecated and will be disabled in " + f"a future version. 
Use '{SubstanceEncoding.RDKIT2DDESCRIPTORS.name}' " + f"instead.", + DeprecationWarning, + ) + fingerprint_name = SubstanceEncoding.RDKIT2DDESCRIPTORS.name + + fingerprint_cls = get_fingerprint_class(SubstanceEncoding(fingerprint_name)) + fingerprint_encoder = fingerprint_cls(**kwargs_fingerprint) if fingerprint_encoder.requires_conformers: mol_list = ConformerGenerator(**kwargs_conformer).transform( @@ -140,51 +156,29 @@ def smiles_to_fingerprint_features( return df -def convert_fingeprint_parameters( - name: str, kwargs_fingerprint: dict | None = None -) -> tuple[str, dict]: - """Convert fingerprint name parameters for computing the fingerprint. +def get_fingerprint_class(encoding: SubstanceEncoding) -> BaseFingerprintTransformer: + """Retrieve the fingerprint class corresponding to a given encoding. Args: - name: Name of the fingerprint. - kwargs_fingerprint: Optional user-specified settings for computing the - fingerprint. + encoding: A substance encoding. Raises: - KeyError: If fingerprint name is not recognized. + ValueError: If no fingerprint class for the specified encoding. Returns: - Fingerprint class name and kwargs to use for the fingerprint computation. + The fingerprint class. """ - kwargs_fingerprint = kwargs_fingerprint or {} - - # Get fingerprint class try: - fp_class = fingerprint_name_map[name] - except KeyError: - raise KeyError(f"Substance encoding {name} is not valid.") - - # For deprecation purposes - kwargs_fp_update = {} - if name == "MORGAN_FP": - warnings.warn( - "Substance encoding 'MORGAN_FP' is deprecated and will be disabled in " - "a future version. 
Use 'ECFP' with 'fp_size' 1204 and 'radius' 4 instead.", - DeprecationWarning, + cls_name = next( + name + for name in dir(skfp_fingerprints) + if (encoding.name + "Fingerprint").casefold() == name.casefold() ) - kwargs_fp_update = { - "fp_size": 1024, - "radius": 4, - } - elif name == "RDKIT": - warnings.warn( - "Substance encoding 'RDKIT' is deprecated and will be disabled in " - "a future version. Use 'RDKIT2DDESCRIPTORS' instead.", - DeprecationWarning, - ) - kwargs_fingerprint.update(kwargs_fp_update) - - return fp_class, kwargs_fingerprint + except StopIteration as e: + raise ValueError( + f"No fingerprint class exists for the specified encoding '{encoding.name}'." + ) from e + return getattr(skfp_fingerprints, cls_name) def get_canonical_smiles(smiles: str) -> str: From 66265e1349455fec6bb288435626e1116740a5be Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 16 Oct 2024 16:58:22 +0200 Subject: [PATCH 54/87] Add exception rule to fingerprint mapping --- baybe/utils/chemistry.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index c85b1b53a..9555f2d14 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -168,6 +168,10 @@ def get_fingerprint_class(encoding: SubstanceEncoding) -> BaseFingerprintTransfo Returns: The fingerprint class. 
""" + # Exception case + if encoding is SubstanceEncoding.RDKITFINGERPRINT: + return skfp_fingerprints.RDKitFingerprint + try: cls_name = next( name From 3811a7809c7e6e10e13a40e2f16f1c09b5adfe57 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 16 Oct 2024 11:29:12 +0200 Subject: [PATCH 55/87] Use encoding instead of string to specify transformation --- baybe/parameters/substance.py | 2 +- baybe/utils/chemistry.py | 23 +++++++++++------------ tests/test_fingerprints.py | 22 ++++++++++++---------- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/baybe/parameters/substance.py b/baybe/parameters/substance.py index 2a3c8b69b..d523e064d 100644 --- a/baybe/parameters/substance.py +++ b/baybe/parameters/substance.py @@ -128,7 +128,7 @@ def comp_df(self) -> pd.DataFrame: # noqa: D102 # Get the raw descriptors comp_df = chemistry.smiles_to_fingerprint_features( vals, - fingerprint_name=self.encoding.name, + encoding=self.encoding, prefix=pref, kwargs_conformer=self.kwargs_conformer, kwargs_fingerprint=self.kwargs_fingerprint, diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 9555f2d14..0055c9596 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -91,7 +91,7 @@ def _molecule_to_fingerprint_features( def smiles_to_fingerprint_features( smiles_list: list[str], - fingerprint_name: str, + encoding: SubstanceEncoding, prefix: str = "", kwargs_conformer: dict | None = None, kwargs_fingerprint: dict | None = None, @@ -100,8 +100,7 @@ def smiles_to_fingerprint_features( Args: smiles_list: List of SMILES strings. - fingerprint_name: Name of Fingerprint class used to - transform smiles to fingerprints + encoding: Encoding used to transform SMILES to fingerprints. prefix: Name prefix for each descriptor (e.g., nBase --> _nBase). 
kwargs_conformer: kwargs for conformer generator kwargs_fingerprint: kwargs for fingerprint generator @@ -112,26 +111,26 @@ def smiles_to_fingerprint_features( kwargs_fingerprint = kwargs_fingerprint or {} kwargs_conformer = kwargs_conformer or {} - if fingerprint_name == "MORGAN_FP": + if encoding is SubstanceEncoding.MORGAN_FP: warnings.warn( - f"Substance encoding 'MORGAN_FP' is deprecated and will be disabled in " - f"a future version. Use '{SubstanceEncoding.ECFP.name}' " + f"Substance encoding '{encoding.name}' is deprecated and will be disabled " + f"in a future version. Use '{SubstanceEncoding.ECFP.name}' " f"with 'fp_size' 1204 and 'radius' 4 instead.", DeprecationWarning, ) - fingerprint_name = SubstanceEncoding.ECFP.name + encoding = SubstanceEncoding.ECFP kwargs_fingerprint.update({"fp_size": 1024, "radius": 4}) - elif fingerprint_name == "RDKIT": + elif encoding is SubstanceEncoding.RDKIT: warnings.warn( - f"Substance encoding 'RDKIT' is deprecated and will be disabled in " - f"a future version. Use '{SubstanceEncoding.RDKIT2DDESCRIPTORS.name}' " + f"Substance encoding '{encoding.name}' is deprecated and will be disabled " + f"in a future version. 
Use '{SubstanceEncoding.RDKIT2DDESCRIPTORS.name}' " f"instead.", DeprecationWarning, ) - fingerprint_name = SubstanceEncoding.RDKIT2DDESCRIPTORS.name + encoding = SubstanceEncoding.RDKIT2DDESCRIPTORS - fingerprint_cls = get_fingerprint_class(SubstanceEncoding(fingerprint_name)) + fingerprint_cls = get_fingerprint_class(encoding) fingerprint_encoder = fingerprint_cls(**kwargs_fingerprint) if fingerprint_encoder.requires_conformers: diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py index e8a5d26ef..b06192cd6 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -3,10 +3,10 @@ import pytest from baybe._optional.info import CHEM_INSTALLED -from baybe.parameters.substance import SubstanceEncoding +from baybe.parameters.enum import SubstanceEncoding test_lst = [ - (enc.name, {}, {}) + (enc, {}, {}) for enc in SubstanceEncoding if enc not in { # Ignore deprecated encodings @@ -15,29 +15,31 @@ } ] +ECFP = SubstanceEncoding.ECFP + @pytest.mark.skipif( not CHEM_INSTALLED, reason="Optional chem dependency not installed." 
) @pytest.mark.parametrize( - "name,kw_fp,kw_conf", + "encoding,kw_fp,kw_conf", test_lst + [ # Add some custom tests - ("ECFP", {"fp_size": 64}, {}), - ("ECFP", {"fp_size": 512}, {}), - ("ECFP", {"radius": 4}, {}), - ("ECFP", {"fp_size": 512, "radius": 4}, {}), - ("ECFP", {}, {"max_gen_attempts": 5000}), + (ECFP, {"fp_size": 64}, {}), + (ECFP, {"fp_size": 512}, {}), + (ECFP, {"radius": 4}, {}), + (ECFP, {"fp_size": 512, "radius": 4}, {}), + (ECFP, {}, {"max_gen_attempts": 5000}), ], ) -def test_fingerprint_kwargs(name, kw_fp, kw_conf): +def test_fingerprint_kwargs(encoding, kw_fp, kw_conf): """Test all fingerprint computations.""" from baybe.utils.chemistry import smiles_to_fingerprint_features smiles_list = ["CC(N(C)C)=O", "CCCC#N"] x = smiles_to_fingerprint_features( smiles_list=smiles_list, - fingerprint_name=name, + encoding=encoding, prefix="", kwargs_conformer=kw_conf, kwargs_fingerprint=kw_fp, From 4bc28183599d2a44f486561a77a87243ed9375ae Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 16 Oct 2024 11:32:48 +0200 Subject: [PATCH 56/87] Avoid hard-coding collection type in argument name --- baybe/utils/chemistry.py | 8 ++++---- tests/test_fingerprints.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 0055c9596..d38de8659 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -90,7 +90,7 @@ def _molecule_to_fingerprint_features( def smiles_to_fingerprint_features( - smiles_list: list[str], + smiles: list[str], encoding: SubstanceEncoding, prefix: str = "", kwargs_conformer: dict | None = None, @@ -99,7 +99,7 @@ def smiles_to_fingerprint_features( """Compute molecular fingerprints for a list of SMILES strings. Args: - smiles_list: List of SMILES strings. + smiles: List of SMILES strings. encoding: Encoding used to transform SMILES to fingerprints. prefix: Name prefix for each descriptor (e.g., nBase --> _nBase). 
kwargs_conformer: kwargs for conformer generator @@ -135,10 +135,10 @@ def smiles_to_fingerprint_features( if fingerprint_encoder.requires_conformers: mol_list = ConformerGenerator(**kwargs_conformer).transform( - MolFromSmilesTransformer().transform(smiles_list) + MolFromSmilesTransformer().transform(smiles) ) else: - mol_list = smiles_list + mol_list = smiles features = np.concatenate( [ diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py index b06192cd6..1e54081e5 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -36,9 +36,9 @@ def test_fingerprint_kwargs(encoding, kw_fp, kw_conf): """Test all fingerprint computations.""" from baybe.utils.chemistry import smiles_to_fingerprint_features - smiles_list = ["CC(N(C)C)=O", "CCCC#N"] + smiles = ["CC(N(C)C)=O", "CCCC#N"] x = smiles_to_fingerprint_features( - smiles_list=smiles_list, + smiles=smiles, encoding=encoding, prefix="", kwargs_conformer=kw_conf, @@ -47,6 +47,6 @@ def test_fingerprint_kwargs(encoding, kw_fp, kw_conf): # Check that fingerprint embedding is of correct size and # fingerprint kwargs specifying embedding size are used - assert x.shape[0] == len(smiles_list) + assert x.shape[0] == len(smiles) if "fp_size" in kw_fp: assert x.shape[1] == kw_fp["fp_size"] From c38ded4f9628ad35234e8f1078adbf6cea39f506 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 16 Oct 2024 11:37:44 +0200 Subject: [PATCH 57/87] Generalize input type --- baybe/utils/chemistry.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index d38de8659..6515e4e39 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -5,6 +5,7 @@ import tempfile import urllib.request import warnings +from collections.abc import Sequence from functools import lru_cache from pathlib import Path @@ -90,7 +91,7 @@ def _molecule_to_fingerprint_features( def smiles_to_fingerprint_features( - smiles: list[str], + smiles: 
Sequence[str], encoding: SubstanceEncoding, prefix: str = "", kwargs_conformer: dict | None = None, @@ -99,7 +100,7 @@ def smiles_to_fingerprint_features( """Compute molecular fingerprints for a list of SMILES strings. Args: - smiles: List of SMILES strings. + smiles: Sequence of SMILES strings. encoding: Encoding used to transform SMILES to fingerprints. prefix: Name prefix for each descriptor (e.g., nBase --> _nBase). kwargs_conformer: kwargs for conformer generator From 8d59e8a661174ad0ddcc5c832db79306c58d77c1 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 16 Oct 2024 11:47:55 +0200 Subject: [PATCH 58/87] Use removesuffix instead of replace --- baybe/utils/chemistry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 6515e4e39..6c26cd728 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -149,7 +149,7 @@ def smiles_to_fingerprint_features( for mol in mol_list ] ) - name = f"skfp{fingerprint_encoder.__class__.__name__.replace('Fingerprint', '')}_" + name = f"skfp{fingerprint_encoder.__class__.__name__.removesuffix('Fingerprint')}_" col_names = [prefix + name + f for f in fingerprint_encoder.get_feature_names_out()] df = pd.DataFrame(features, columns=col_names, dtype=DTypeFloatNumpy) From c2b8666be9b0308be7efdefaaadc2725b7179a4e Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 16 Oct 2024 16:53:10 +0200 Subject: [PATCH 59/87] Fix tests --- tests/test_deprecations.py | 60 +++++++++++++++++++------------------- tests/test_fingerprints.py | 9 +++--- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index 6df7b92bd..e362c0f9a 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -1,10 +1,12 @@ """Deprecation tests.""" import warnings +from unittest.mock import patch import pandas as pd import pytest from pytest import param +from skfp.fingerprints import ECFPFingerprint, 
RDKit2DDescriptorsFingerprint from baybe._optional.info import CHEM_INSTALLED from baybe.acquisition.base import AcquisitionFunction @@ -18,7 +20,7 @@ from baybe.objective import Objective as OldObjective from baybe.objectives.base import Objective from baybe.objectives.desirability import DesirabilityObjective -from baybe.parameters import SubstanceEncoding +from baybe.parameters.enum import SubstanceEncoding from baybe.parameters.numerical import NumericalContinuousParameter from baybe.recommenders.pure.bayesian import ( BotorchRecommender, @@ -26,6 +28,7 @@ ) from baybe.searchspace.continuous import SubspaceContinuous from baybe.targets.numerical import NumericalTarget +from baybe.utils.chemistry import smiles_to_fingerprint_features def test_objective_class(): @@ -132,35 +135,6 @@ def test_surrogate_registration(): register_custom_architecture() -@pytest.mark.parametrize( - "deprecated,expected", - [ - param("MORGAN_FP", "ECFP", id="morgan"), - param("RDKIT", "RDKIT2DDESCRIPTORS", id="rdkit"), - ], -) -@pytest.mark.skipif( - not CHEM_INSTALLED, reason="Optional chem dependency not installed." 
-) -def test_deprecated_encodings(deprecated, expected): - """Deprecated fingerprint name raises warning and uses ECFP replacement.""" - from baybe.utils.chemistry import convert_fingeprint_parameters - - with pytest.warns(DeprecationWarning): - # Check that equivalent is used instead of deprecated encoding - deprecated_cls, fp_kwargs = convert_fingeprint_parameters( - name=SubstanceEncoding(deprecated).name, kwargs_fingerprint=None - ) - - expected_cls, _ = convert_fingeprint_parameters( - name=SubstanceEncoding(expected).name, kwargs_fingerprint=None - ) - assert deprecated_cls == expected_cls - - if deprecated == "MORGAN_FP": - assert fp_kwargs == {"fp_size": 1024, "radius": 4} - - def test_surrogate_access(): """Public attribute access to the surrogate model raises a warning.""" recommender = BotorchRecommender() @@ -211,3 +185,29 @@ def test_constraint_config_deserialization(type_, op): warnings.filterwarnings("ignore", category=DeprecationWarning) actual = Constraint.from_json(config) assert expected == actual, (expected, actual) + + +@pytest.mark.parametrize( + ("deprecated", "replacement"), + [ + param(SubstanceEncoding.MORGAN_FP, ECFPFingerprint, id="morgan"), + param(SubstanceEncoding.RDKIT, RDKit2DDescriptorsFingerprint, id="rdkit"), + ], +) +@pytest.mark.skipif( + not CHEM_INSTALLED, reason="Optional chem dependency not installed." 
+) +def test_deprecated_encodings(deprecated, replacement): + """Deprecated encoding raises a warning and uses correct replacement.""" + path = f"skfp.fingerprints.{replacement.__name__}" + + with patch(path, wraps=replacement) as patched: + # Assert warning + with pytest.warns(DeprecationWarning): + smiles_to_fingerprint_features(["C"], deprecated) + + # Check that equivalent is used instead of deprecated encoding + if deprecated is SubstanceEncoding.MORGAN_FP: + patched.assert_called_once_with(**{"fp_size": 1024, "radius": 4}) + else: + patched.assert_called_once() diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py index 1e54081e5..e972bff7e 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -4,8 +4,9 @@ from baybe._optional.info import CHEM_INSTALLED from baybe.parameters.enum import SubstanceEncoding +from baybe.utils.chemistry import smiles_to_fingerprint_features -test_lst = [ +test_cases: list[tuple[SubstanceEncoding, dict, dict]] = [ (enc, {}, {}) for enc in SubstanceEncoding if enc @@ -22,8 +23,8 @@ not CHEM_INSTALLED, reason="Optional chem dependency not installed." 
) @pytest.mark.parametrize( - "encoding,kw_fp,kw_conf", - test_lst + ("encoding", "kw_fp", "kw_conf"), + test_cases + [ # Add some custom tests (ECFP, {"fp_size": 64}, {}), (ECFP, {"fp_size": 512}, {}), @@ -34,8 +35,6 @@ ) def test_fingerprint_kwargs(encoding, kw_fp, kw_conf): """Test all fingerprint computations.""" - from baybe.utils.chemistry import smiles_to_fingerprint_features - smiles = ["CC(N(C)C)=O", "CCCC#N"] x = smiles_to_fingerprint_features( smiles=smiles, From 1ca27db0dbbdd292f1dd7152520354f0a74cd11f Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Thu, 17 Oct 2024 08:58:25 +0200 Subject: [PATCH 60/87] rm skfp rename --- baybe/_optional/chem.py | 4 ++-- baybe/utils/chemistry.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/baybe/_optional/chem.py b/baybe/_optional/chem.py index 7662aa8dd..7cd222392 100644 --- a/baybe/_optional/chem.py +++ b/baybe/_optional/chem.py @@ -4,7 +4,7 @@ try: from rdkit import Chem - from skfp import fingerprints as skfp_fingerprints + from skfp import fingerprints from skfp.bases import BaseFingerprintTransformer from skfp.preprocessing import ConformerGenerator, MolFromSmilesTransformer @@ -18,7 +18,7 @@ __all__ = [ "Chem", - "skfp_fingerprints", + "fingerprints", "BaseFingerprintTransformer", "ConformerGenerator", "MolFromSmilesTransformer", diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 6c26cd728..a5d2085ad 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -18,7 +18,7 @@ Chem, ConformerGenerator, MolFromSmilesTransformer, - skfp_fingerprints, + fingerprints, ) from baybe.parameters.enum import SubstanceEncoding from baybe.utils.numerical import DTypeFloatNumpy @@ -170,19 +170,19 @@ def get_fingerprint_class(encoding: SubstanceEncoding) -> BaseFingerprintTransfo """ # Exception case if encoding is SubstanceEncoding.RDKITFINGERPRINT: - return skfp_fingerprints.RDKitFingerprint + return 
fingerprints.RDKitFingerprint try: cls_name = next( name - for name in dir(skfp_fingerprints) + for name in dir(fingerprints) if (encoding.name + "Fingerprint").casefold() == name.casefold() ) except StopIteration as e: raise ValueError( f"No fingerprint class exists for the specified encoding '{encoding.name}'." ) from e - return getattr(skfp_fingerprints, cls_name) + return getattr(fingerprints, cls_name) def get_canonical_smiles(smiles: str) -> str: From 5ff61991b99486af8f7f2c4c43f968c35750f756 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Thu, 17 Oct 2024 09:08:48 +0200 Subject: [PATCH 61/87] reword "please use" in deprecated --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf6897dae..9030215a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `scikit-fingerprints` package, granting access to all fingerprints available therein ### Deprecations -- `SubstanceEncoding` value `MORGAN_FP`. Please use `ECFP` with 1024 bits and - radius of 4 -- `SubstanceEncoding` value `RDKIT`. Please use `RDKIT2DDESCRIPTORS`. +- `SubstanceEncoding` value `MORGAN_FP`. As a replacement, `ECFP` with 1024 bits and + radius of 4 can be used. +- `SubstanceEncoding` value `RDKIT`. As a replacement, `RDKIT2DDESCRIPTORS` can be used. 
## [0.11.1] - 2024-10-01 ### Added From 55dfaa1a5b43cf79851fe200d176fc1cb275407c Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Thu, 17 Oct 2024 09:14:50 +0200 Subject: [PATCH 62/87] rm redundant scikit-fingerprints docs referencing --- baybe/parameters/enum.py | 66 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index 4c36cfdd1..bd2d2380c 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -25,106 +25,106 @@ class CustomEncoding(ParameterEncoding): class SubstanceEncoding(ParameterEncoding): - """Available encodings for substance parameters.""" + """Available encodings for substance parameters from scikit-fingerprints package.""" ATOMPAIR = "ATOMPAIR" - """AtomPairFingerprint from scikit-fingerprints.""" + """AtomPairFingerprint.""" AUTOCORR = "AUTOCORR" - """AutocorrFingerprint from scikit-fingerprints.""" + """AutocorrFingerprint.""" AVALON = "AVALON" - """AvalonFingerprint from scikit-fingerprints.""" + """AvalonFingerprint.""" E3FP = "E3FP" - """E3FPFingerprint from scikit-fingerprints.""" + """E3FPFingerprint.""" ECFP = "ECFP" - """ECFPFingerprint from scikit-fingerprints.""" + """ECFPFingerprint.""" MORGAN_FP = "MORGAN_FP" """Deprecated!""" ERG = "ERG" - """ERGFingerprint from scikit-fingerprints.""" + """ERGFingerprint.""" ESTATE = "ESTATE" - """EStateFingerprint from scikit-fingerprints.""" + """EStateFingerprint.""" FUNCTIONALGROUPS = "FUNCTIONALGROUPS" - """FunctionalGroupsFingerprint from scikit-fingerprints.""" + """FunctionalGroupsFingerprint.""" GETAWAY = "GETAWAY" - """GETAWAYFingerprint from scikit-fingerprints.""" + """GETAWAYFingerprint.""" GHOSECRIPPEN = "GHOSECRIPPEN" - """GhoseCrippenFingerprint from scikit-fingerprints.""" + """GhoseCrippenFingerprint.""" KLEKOTAROTH = "KLEKOTAROTH" - """KlekotaRothFingerprint from scikit-fingerprints.""" + 
"""KlekotaRothFingerprint.""" LAGGNER = "LAGGNER" - """LaggnerFingerprint from scikit-fingerprints.""" + """LaggnerFingerprint.""" LAYERED = "LAYERED" - """LayeredFingerprint from scikit-fingerprints.""" + """LayeredFingerprint.""" LINGO = "LINGO" - """LingoFingerprint from scikit-fingerprints.""" + """LingoFingerprint.""" MACCS = "MACCS" - """MACCSFingerprint from scikit-fingerprints.""" + """MACCSFingerprint.""" MAP = "MAP" - """MAPFingerprint from scikit-fingerprints.""" + """MAPFingerprint.""" MHFP = "MHFP" - """MHFPFingerprint from scikit-fingerprints.""" + """MHFPFingerprint.""" MORSE = "MORSE" - """MORSEFingerprint from scikit-fingerprints.""" + """MORSEFingerprint.""" MQNS = "MQNS" - """MQNsFingerprint from scikit-fingerprints.""" + """MQNsFingerprint.""" MORDRED = "MORDRED" - """MordredFingerprint from scikit-fingerprints.""" + """MordredFingerprint.""" PATTERN = "PATTERN" - """PatternFingerprint from scikit-fingerprints.""" + """PatternFingerprint.""" PHARMACOPHORE = "PHARMACOPHORE" - """PharmacophoreFingerprint from scikit-fingerprints.""" + """PharmacophoreFingerprint.""" PHYSIOCHEMICALPROPERTIES = "PHYSIOCHEMICALPROPERTIES" - """PhysiochemicalPropertiesFingerprint from scikit-fingerprints.""" + """PhysiochemicalPropertiesFingerprint.""" PUBCHEM = "PUBCHEM" - """PubChemFingerprint from scikit-fingerprints.""" + """PubChemFingerprint.""" RDF = "RDF" - """RDFFingerprint from scikit-fingerprints.""" + """RDFFingerprint.""" RDKIT = "RDKIT" """Deprecated!""" RDKITFINGERPRINT = "RDKITFINGERPRINT" - """RDKitFingerprint from scikit-fingerprints.""" + """RDKitFingerprint.""" RDKIT2DDESCRIPTORS = "RDKIT2DDESCRIPTORS" - """RDKit2DDescriptorsFingerprint from scikit-fingerprints.""" + """RDKit2DDescriptorsFingerprint.""" SECFP = "SECFP" - """SECFPFingerprint from scikit-fingerprints.""" + """SECFPFingerprint.""" TOPOLOGICALTORSION = "TOPOLOGICALTORSION" - """TopologicalTorsionFingerprint from scikit-fingerprints.""" + """TopologicalTorsionFingerprint.""" USR = "USR" 
- """USRFingerprint from scikit-fingerprints.""" + """USRFingerprint.""" USRCAT = "USRCAT" - """USRCATFingerprint from scikit-fingerprints.""" + """USRCATFingerprint.""" WHIM = "WHIM" - """WHIMFingerprint from scikit-fingerprints.""" + """WHIMFingerprint.""" From 82a5f07865e7c3d3f8a585bb2410a2590db04127 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Thu, 17 Oct 2024 09:44:44 +0200 Subject: [PATCH 63/87] add assertion err note --- tests/test_fingerprints.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py index e972bff7e..c6d086958 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -44,8 +44,14 @@ def test_fingerprint_kwargs(encoding, kw_fp, kw_conf): kwargs_fingerprint=kw_fp, ) - # Check that fingerprint embedding is of correct size and - # fingerprint kwargs specifying embedding size are used - assert x.shape[0] == len(smiles) + assert x.shape[0] == len(smiles), ( + "The number of fingerprint " + + "embedding rows does not match " + + "the number of molecules." + ) if "fp_size" in kw_fp: - assert x.shape[1] == kw_fp["fp_size"] + assert x.shape[1] == kw_fp["fp_size"], ( + "The fingerprint dimension " + + "parameter was ignored, fingerprints " + + "have a wrong number of dimensions." + ) From 0658986963fe4437b68e880a9dcaa5076e4543b1 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Thu, 17 Oct 2024 09:45:18 +0200 Subject: [PATCH 64/87] docs style --- baybe/utils/chemistry.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index a5d2085ad..2a24f5031 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -79,10 +79,11 @@ def _molecule_to_fingerprint_features( """Compute molecular fingerprint for a single SMILES string. 
Args: - fingerprint_encoder: Instance of Fingerprint class used to + fingerprint_encoder: Instance of the fingerprint class used to transform smiles string to fingerprint molecule: Smiles string or molecule object, - depending on what should be input into fingerprint_encoder's transform + depending on what should be input into + ``transform`` of ``fingerprint_encoder``. Returns: Array containing fingerprint for SMILES string. From 5223f63e0e98a85c9044558e823499cb7d24f9dc Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 16 Oct 2024 21:45:16 +0200 Subject: [PATCH 65/87] Add links to SubstanceEncoding enum --- baybe/parameters/enum.py | 66 ++++++++++++++++++++-------------------- docs/conf.py | 1 + 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index bd2d2380c..55b527109 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -25,106 +25,106 @@ class CustomEncoding(ParameterEncoding): class SubstanceEncoding(ParameterEncoding): - """Available encodings for substance parameters from scikit-fingerprints package.""" + """Available encodings for substance parameters from [scikit-fingerprints](https://scikit-fingerprints.github.io/scikit-fingerprints/).""" ATOMPAIR = "ATOMPAIR" - """AtomPairFingerprint.""" + """:class:`skfp.fingerprints.AtomPairFingerprint`""" AUTOCORR = "AUTOCORR" - """AutocorrFingerprint.""" + """:class:`skfp.fingerprints.AutocorrFingerprint`""" AVALON = "AVALON" - """AvalonFingerprint.""" + """:class:`skfp.fingerprints.AvalonFingerprint`""" E3FP = "E3FP" - """E3FPFingerprint.""" + """:class:`skfp.fingerprints.E3FPFingerprint`""" ECFP = "ECFP" - """ECFPFingerprint.""" + """:class:`skfp.fingerprints.ECFPFingerprint`""" MORGAN_FP = "MORGAN_FP" """Deprecated!""" ERG = "ERG" - """ERGFingerprint.""" + """:class:`skfp.fingerprints.ERGFingerprint`""" ESTATE = "ESTATE" - """EStateFingerprint.""" + """:class:`skfp.fingerprints.EStateFingerprint`""" FUNCTIONALGROUPS = 
"FUNCTIONALGROUPS" - """FunctionalGroupsFingerprint.""" + """:class:`skfp.fingerprints.FunctionalGroupsFingerprint`""" GETAWAY = "GETAWAY" - """GETAWAYFingerprint.""" + """:class:`skfp.fingerprints.GETAWAYFingerprint`""" GHOSECRIPPEN = "GHOSECRIPPEN" - """GhoseCrippenFingerprint.""" + """:class:`skfp.fingerprints.GhoseCrippenFingerprint`""" KLEKOTAROTH = "KLEKOTAROTH" - """KlekotaRothFingerprint.""" + """:class:`skfp.fingerprints.KlekotaRothFingerprint`""" LAGGNER = "LAGGNER" - """LaggnerFingerprint.""" + """:class:`skfp.fingerprints.LaggnerFingerprint`""" LAYERED = "LAYERED" - """LayeredFingerprint.""" + """:class:`skfp.fingerprints.LayeredFingerprint`""" LINGO = "LINGO" - """LingoFingerprint.""" + """:class:`skfp.fingerprints.LingoFingerprint`""" MACCS = "MACCS" - """MACCSFingerprint.""" + """:class:`skfp.fingerprints.MACCSFingerprint`""" MAP = "MAP" - """MAPFingerprint.""" + """:class:`skfp.fingerprints.MAPFingerprint`""" MHFP = "MHFP" - """MHFPFingerprint.""" + """:class:`skfp.fingerprints.MHFPFingerprint`""" MORSE = "MORSE" - """MORSEFingerprint.""" + """:class:`skfp.fingerprints.MORSEFingerprint`""" MQNS = "MQNS" - """MQNsFingerprint.""" + """:class:`skfp.fingerprints.MQNsFingerprint`""" MORDRED = "MORDRED" - """MordredFingerprint.""" + """:class:`skfp.fingerprints.MordredFingerprint`""" PATTERN = "PATTERN" - """PatternFingerprint.""" + """:class:`skfp.fingerprints.PatternFingerprint`""" PHARMACOPHORE = "PHARMACOPHORE" - """PharmacophoreFingerprint.""" + """:class:`skfp.fingerprints.PharmacophoreFingerprint`""" PHYSIOCHEMICALPROPERTIES = "PHYSIOCHEMICALPROPERTIES" - """PhysiochemicalPropertiesFingerprint.""" + """:class:`skfp.fingerprints.PhysiochemicalPropertiesFingerprint`""" PUBCHEM = "PUBCHEM" - """PubChemFingerprint.""" + """:class:`skfp.fingerprints.PubChemFingerprint`""" RDF = "RDF" - """RDFFingerprint.""" + """:class:`skfp.fingerprints.RDFFingerprint`""" RDKIT = "RDKIT" """Deprecated!""" RDKITFINGERPRINT = "RDKITFINGERPRINT" - """RDKitFingerprint.""" 
+ """:class:`skfp.fingerprints.RDKitFingerprint`""" RDKIT2DDESCRIPTORS = "RDKIT2DDESCRIPTORS" - """RDKit2DDescriptorsFingerprint.""" + """:class:`skfp.fingerprints.RDKit2DDescriptorsFingerprint`""" SECFP = "SECFP" - """SECFPFingerprint.""" + """:class:`skfp.fingerprints.SECFPFingerprint`""" TOPOLOGICALTORSION = "TOPOLOGICALTORSION" - """TopologicalTorsionFingerprint.""" + """:class:`skfp.fingerprints.TopologicalTorsionFingerprint`""" USR = "USR" - """USRFingerprint.""" + """:class:`skfp.fingerprints.USRFingerprint`""" USRCAT = "USRCAT" - """USRCATFingerprint.""" + """:class:`skfp.fingerprints.USRCATFingerprint`""" WHIM = "WHIM" - """WHIMFingerprint.""" + """:class:`skfp.fingerprints.WHIMFingerprint`""" diff --git a/docs/conf.py b/docs/conf.py index 79822b084..403322ffe 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -246,6 +246,7 @@ "python": ("https://docs.python.org/3", None), "pandas": ("https://pandas.pydata.org/docs/", None), "polars": ("https://docs.pola.rs/api/python/stable/", None), + "skfp": ("https://scikit-fingerprints.github.io/scikit-fingerprints/", None), "sklearn": ("https://scikit-learn.org/stable/", None), "sklearn_extra": ("https://scikit-learn-extra.readthedocs.io/en/stable", None), "numpy": ("https://numpy.org/doc/stable/", None), From ace45f85ef6a3e15e477ed7935ffec77c26aa53a Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 17 Oct 2024 08:10:14 +0200 Subject: [PATCH 66/87] Fix CONTRIBUTERS.md --- CONTRIBUTORS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f1cba3715..6f67b86aa 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -24,7 +24,7 @@ Human readable output for search spaces - Di Jin (Merck Life Science KGaA, Darmstadt, Germany):\ Cardinality constraints -- Karin Hrovatin (Merck KGaA, Darmstadt, Germany):\ -`scikit-fingerprints` support - Julian Streibel (Merck Life Science KGaA, Darmstadt, Germany):\ Bernoulli multi-armed bandit and Thompson sampling +- Karin 
Hrovatin (Merck KGaA, Darmstadt, Germany):\ + `scikit-fingerprints` support From 906acff0d376497319fc192b67a8e13d9b96d116 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 17 Oct 2024 08:17:36 +0200 Subject: [PATCH 67/87] Use normal style for headings in parameters.md Like on all other pages of the documentation --- docs/userguide/parameters.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index 293ec6877..d77e725f1 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -16,7 +16,7 @@ differently under the hood: Discrete and continuous parameters. ## Continuous Parameters -### ``NumericalContinuousParameter`` +### NumericalContinuousParameter This is currently the only continuous parameter type BayBE supports. It defines possible values from a numerical interval called ``bounds``, and thus has an infinite amount of possibilities. @@ -47,7 +47,7 @@ number space. For different parameters, different types of encoding make sense. situations are reflected by the different discrete parameter types BayBE offers. ``` -### ``NumericalDiscreteParameter`` +### NumericalDiscreteParameter This is the right type for parameters that have numerical values. We support sets with equidistant values like ``(1, 2, 3, 4, 5)`` but also unevenly spaced sets of numbers like ``(0.2, 1.0, 2.0, 5.0, 10.0, 50.0)``. @@ -66,7 +66,7 @@ NumericalDiscreteParameter( ) ``` -### ``CategoricalParameter`` +### CategoricalParameter A ``CategoricalParameter`` supports sets of strings as labels. This is most suitable if the experimental choices cannot easily be translated into a number. @@ -104,7 +104,7 @@ in the first place. This concept is generalized in the ``CustomDiscreteParameter``, where the user can provide their own custom set of descriptors for each label. 
-### ``SubstanceParameter`` +### SubstanceParameter Instead of ``values``, this parameter accepts ``data`` in form of a dictionary. The items correspond to pairs of labels and [SMILES](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system). SMILES are string-based representations of molecular structures. @@ -199,7 +199,7 @@ The ``SubstanceParameter`` is only available if BayBE was installed with the additional ``chem`` dependency. ``` -### ``CustomDiscreteParameter`` +### CustomDiscreteParameter The ``encoding`` concept introduced above is generalized by the ``CustomParameter``. Here, the user is expected to provide their own descriptors for the encoding. @@ -237,7 +237,7 @@ In BayBE's framework, you can provide numbers corresponding e.g. to delivery tim reliability or average price of the vendor to encode the labels via the ``CustomParameter``. -### ``TaskParameter`` +### TaskParameter Often, several experimental campaigns involve similar or even identical parameters but still have one or more differences. 
For example, when optimizing reagents in a chemical reaction, the reactants remain From a74907a7c4aa4c80b9c88c0f89ef24da87d7495a Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 17 Oct 2024 09:35:42 +0200 Subject: [PATCH 68/87] Add links to parameters.md --- docs/userguide/parameters.md | 40 +++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index d77e725f1..84f95399f 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -1,6 +1,14 @@ +[`SearchSpace`]: baybe.searchspace.core.SearchSpace +[`Constraint`]: baybe.constraints.base.Constraint +[`SubstanceParameter`]: baybe.parameters.substance.SubstanceParameter +[`CategoricalParameter`]: baybe.parameters.categorical.CategoricalParameter +[`TaskParameter`]: baybe.parameters.categorical.TaskParameter +[`CustomDiscreteParameter`]: baybe.parameters.custom.CustomDiscreteParameter +[scikit-fingerprints]: https://scikit-fingerprints.github.io/scikit-fingerprints/ + # Parameters -Parameters are fundamental for BayBE, as they configure the ``SearchSpace`` and serve +Parameters are fundamental for BayBE, as they configure the [`SearchSpace`] and serve as the direct link to the controllable variables in your experiment. Before starting an iterative campaign, the user is required to specify the exact parameters they can control and want to consider in their optimization. @@ -20,7 +28,7 @@ differently under the hood: Discrete and continuous parameters. This is currently the only continuous parameter type BayBE supports. It defines possible values from a numerical interval called ``bounds``, and thus has an infinite amount of possibilities. -Unless restrained by `Constraint`s, BayBE will consider any possible parameter value +Unless restrained by [`Constraint`]s, BayBE will consider any possible parameter value that lies within the chosen interval. 
```python @@ -67,7 +75,7 @@ NumericalDiscreteParameter( ``` ### CategoricalParameter -A ``CategoricalParameter`` supports sets of strings as labels. +A [`CategoricalParameter`] supports sets of strings as labels. This is most suitable if the experimental choices cannot easily be translated into a number. Examples for this could be vendors like ``("Vendor A", "Vendor B", "Vendor C")`` or @@ -97,11 +105,11 @@ simply because the number 1 is closer to 2 than to 3. Hence, for an arbitrary set of labels, such an ordering cannot generally be assumed. In the particular case of substances, it not even possible to describe the similarity between labels by ordering along one single dimension. -For this reason, we also provide the ``SubstanceParameter``, which encodes labels +For this reason, we also provide the [`SubstanceParameter`], which encodes labels corresponding to small molecules with chemical descriptors, capturing their similarities much better and without the need for the user to think about ordering and similarity in the first place. -This concept is generalized in the ``CustomDiscreteParameter``, where the user can +This concept is generalized in the [`CustomDiscreteParameter`], where the user can provide their own custom set of descriptors for each label. ### SubstanceParameter @@ -129,18 +137,18 @@ SubstanceParameter( The ``encoding`` option defines what kind of descriptors are calculated. All descriptors are calculated using the -[scikit-fingerprints package](https://github.com/scikit-fingerprints/scikit-fingerprints/). +[scikit-fingerprints](https://github.com/scikit-fingerprints/scikit-fingerprints/) package. Any fingerprint class from `scikit-fingerprints` can be used as an input parameter for chemical encoding. The fingerprint class names should be passed in all upper case and without the `Fingeprint` suffix, -e.g. use alias `MORDRED` for `MordredFingerprint` class. +e.g. use alias `MORDRED` for {class}`~skfp.fingerprints.MordredFingerprint` class. 
Here are examples of a few popular fingerprints: * ``ECFP``: Extended Connectivity FingerPrint, which is a circular topological fingerprint similar to Morgan fingerprint. * ``MORDRED``: Chemical descriptor based fingerprint. * ``RDKIT``: The RDKit fingerprint, which is based on hashing of molecular sub-graphs. -You can adjust fingerprint computation with parameters for `Fingerprint` classes from `scikit-fingerprints`. -These can be specified via the `kwargs_fingerprint` in the `SubstanceParameter` class. +You can adjust fingerprint computation with parameters for `Fingerprint` classes from [scikit-fingerprints]. +These can be specified via the `kwargs_fingerprint` in the [`SubstanceParameter`] class. Similarly, for fingerprints requiring conformers, the parameters for conformer computation can be specified via `kwargs_conformer`. @@ -172,10 +180,10 @@ This usually reduces the number of descriptors to 10-50, depending on the specif items in ``data``. ```{warning} -The descriptors calculated for a ``SubstanceParameter`` were developed to describe +The descriptors calculated for a [`SubstanceParameter`] were developed to describe small molecules and are not suitable for other substances. If you deal with large molecules like polymers or arbitrary substance mixtures, we recommend to provide your -own descriptors via the ``CustomParameter``. +own descriptors via the [`CustomDiscreteParameter`]. ``` In the following example from an application you can see @@ -201,12 +209,12 @@ additional ``chem`` dependency. ### CustomDiscreteParameter The ``encoding`` concept introduced above is generalized by the -``CustomParameter``. +[`CustomDiscreteParameter`]. Here, the user is expected to provide their own descriptors for the encoding. Take, for instance, a parameter that corresponds to the choice of a polymer. Polymers are not well represented by the small molecule descriptors utilized in the -``SubstanceParameter``. +[`SubstanceParameter`]. 
Still, one could provide experimental measurements or common metrics used to classify polymers: @@ -229,13 +237,13 @@ CustomDiscreteParameter( ) ``` -With the ``CustomParameter``, you can also encode parameter labels that have +With the [`CustomDiscreteParameter`], you can also encode parameter labels that have nothing to do with substances. For example, a parameter corresponding to the choice of a vendor is typically not easily encoded with standard means. In BayBE's framework, you can provide numbers corresponding e.g. to delivery time, reliability or average price of the vendor to encode the labels via the -``CustomParameter``. +[`CustomDiscreteParameter`]. ### TaskParameter Often, several experimental campaigns involve similar or even identical parameters but @@ -246,7 +254,7 @@ Similarly, in a mixture development for cell culture media, the cell type is fix hence not a parameter. However, once we plan to mix data from several campaigns, both reactants and cell lines can also be considered parameters in that they encode the necessary context. -BayBE is able to process such context information with the `TaskParameter`. +BayBE is able to process such context information with the [`TaskParameter`]. In many cases, this can drastically increase the optimization performance due to the enlarged data corpus. 
From 83213faceff978291972a4147badde133e8f8ddb Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Thu, 17 Oct 2024 09:54:07 +0200 Subject: [PATCH 69/87] info on fp subsitutions in docsrtings --- baybe/parameters/enum.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index bd2d2380c..fe697c4ec 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -25,7 +25,11 @@ class CustomEncoding(ParameterEncoding): class SubstanceEncoding(ParameterEncoding): - """Available encodings for substance parameters from scikit-fingerprints package.""" + """Available encodings for substance parameters from scikit-fingerprints package. + + For more information on individual fingerprints refer to + scikit-fingerprints package. + """ ATOMPAIR = "ATOMPAIR" """AtomPairFingerprint.""" @@ -43,7 +47,10 @@ class SubstanceEncoding(ParameterEncoding): """ECFPFingerprint.""" MORGAN_FP = "MORGAN_FP" - """Deprecated!""" + """Deprecated! + As a substitution, ECFP fingerprint + with fp_size=1024 and radius=4 will be used. + """ ERG = "ERG" """ERGFingerprint.""" @@ -106,7 +113,8 @@ class SubstanceEncoding(ParameterEncoding): """RDFFingerprint.""" RDKIT = "RDKIT" - """Deprecated!""" + """Deprecated! As a substitution, RDKit2DDescriptors will be used. 
+ """ RDKITFINGERPRINT = "RDKITFINGERPRINT" """RDKitFingerprint.""" From ca93b104c4c6433439db1eb7451864b1022ef4f8 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 17 Oct 2024 10:30:46 +0200 Subject: [PATCH 70/87] Update encoding section in userguide --- docs/userguide/parameters.md | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index 84f95399f..5b68248f3 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -4,6 +4,7 @@ [`CategoricalParameter`]: baybe.parameters.categorical.CategoricalParameter [`TaskParameter`]: baybe.parameters.categorical.TaskParameter [`CustomDiscreteParameter`]: baybe.parameters.custom.CustomDiscreteParameter +[`SubstanceEncoding`]: baybe.parameters.enum.SubstanceEncoding [scikit-fingerprints]: https://scikit-fingerprints.github.io/scikit-fingerprints/ # Parameters @@ -135,22 +136,24 @@ SubstanceParameter( ) ``` -The ``encoding`` option defines what kind of descriptors are calculated. -All descriptors are calculated using the -[scikit-fingerprints](https://github.com/scikit-fingerprints/scikit-fingerprints/) package. -Any fingerprint class from `scikit-fingerprints` can be used as an input parameter for chemical encoding. -The fingerprint class names should be passed in all upper case and without the `Fingeprint` suffix, -e.g. use alias `MORDRED` for {class}`~skfp.fingerprints.MordredFingerprint` class. +The ``encoding`` option defines what kind of descriptors are calculated using the +[scikit-fingerprints] package. +Any of the fingerprint classes provided by the package can be used. +The encoding can be specified either by passing the corresponding [`SubstanceEncoding`] member +(click to see full list of options) or its string representation, e.g. 
use +[`SubstanceEncoding.MORDRED`](baybe.parameters.enum.SubstanceEncoding.MORDRED) +or its string alias `"MORDRED"` to select the {class}`~skfp.fingerprints.MordredFingerprint`. + Here are examples of a few popular fingerprints: -* ``ECFP``: Extended Connectivity FingerPrint, +* {attr}`~baybe.parameters.enum.SubstanceEncoding.ECFP`: Extended Connectivity FingerPrint, which is a circular topological fingerprint similar to Morgan fingerprint. -* ``MORDRED``: Chemical descriptor based fingerprint. -* ``RDKIT``: The RDKit fingerprint, which is based on hashing of molecular sub-graphs. +* {attr}`~baybe.parameters.enum.SubstanceEncoding.MORDRED`: Chemical descriptor based fingerprint. +* {attr}`~baybe.parameters.enum.SubstanceEncoding.RDKITFINGERPRINT`: The RDKit fingerprint, which is based on hashing of molecular subgraphs. -You can adjust fingerprint computation with parameters for `Fingerprint` classes from [scikit-fingerprints]. -These can be specified via the `kwargs_fingerprint` in the [`SubstanceParameter`] class. +You can customize the fingerprint computation by passing arguments of the corresponding +[scikit-fingerprints] class to the `kwargs_fingerprint` argument of the [`SubstanceParameter`] constructor. Similarly, for fingerprints requiring conformers, -the parameters for conformer computation can be specified via `kwargs_conformer`. +the configuration options for conformer computation can be specified via `kwargs_conformer`. 
```python from baybe.parameters import SubstanceParameter From 4a981deb878aa3bff45baa778ac78e82ce9fde3b Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 17 Oct 2024 11:13:01 +0200 Subject: [PATCH 71/87] Fix links in SubstanceEncoding --- baybe/parameters/enum.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index e18d2a1e0..fd4603fcb 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -25,10 +25,9 @@ class CustomEncoding(ParameterEncoding): class SubstanceEncoding(ParameterEncoding): - """Available encodings for substance parameters from scikit-fingerprints package. + """Available encodings for substance parameters from `scikit-fingerprints`_ package. - For more information on individual fingerprints refer to - [scikit-fingerprints](https://scikit-fingerprints.github.io/scikit-fingerprints/). + .. _scikit-fingerprints: https://scikit-fingerprints.github.io/scikit-fingerprints/ """ ATOMPAIR = "ATOMPAIR" @@ -47,9 +46,9 @@ class SubstanceEncoding(ParameterEncoding): """:class:`skfp.fingerprints.ECFPFingerprint`""" MORGAN_FP = "MORGAN_FP" - """Deprecated! - As a substitution, ECFP fingerprint - with fp_size=1024 and radius=4 will be used. + """ + Deprecated! Uses :class:`skfp.fingerprints.ECFPFingerprint` with ``fp_size=1024`` + and ``radius=4``. """ ERG = "ERG" @@ -113,8 +112,7 @@ class SubstanceEncoding(ParameterEncoding): """:class:`skfp.fingerprints.RDFFingerprint`""" RDKIT = "RDKIT" - """Deprecated! As a substitution, RDKit2DDescriptors will be used. - """ + """Deprecated! 
Uses :class:`skfp.fingerprints.RDKit2DDescriptorsFingerprint`.""" RDKITFINGERPRINT = "RDKITFINGERPRINT" """:class:`skfp.fingerprints.RDKitFingerprint`""" From 7d00b656f5124d40cdea28cf0118d23b81734ab2 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 18 Oct 2024 09:29:05 +0200 Subject: [PATCH 72/87] fix fp feature names --- baybe/parameters/substance.py | 2 +- baybe/utils/chemistry.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/baybe/parameters/substance.py b/baybe/parameters/substance.py index d523e064d..fb8ad74c9 100644 --- a/baybe/parameters/substance.py +++ b/baybe/parameters/substance.py @@ -123,7 +123,7 @@ def comp_df(self) -> pd.DataFrame: # noqa: D102 from baybe.utils import chemistry vals = list(self.data.values()) - pref = self.name + "_" + pref = self.name # Get the raw descriptors comp_df = chemistry.smiles_to_fingerprint_features( diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 2a24f5031..6c3d24f23 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -94,7 +94,7 @@ def _molecule_to_fingerprint_features( def smiles_to_fingerprint_features( smiles: Sequence[str], encoding: SubstanceEncoding, - prefix: str = "", + prefix: str | None = None, kwargs_conformer: dict | None = None, kwargs_fingerprint: dict | None = None, ) -> pd.DataFrame: @@ -151,7 +151,11 @@ def smiles_to_fingerprint_features( ] ) name = f"skfp{fingerprint_encoder.__class__.__name__.removesuffix('Fingerprint')}_" - col_names = [prefix + name + f for f in fingerprint_encoder.get_feature_names_out()] + prefix = prefix + "_" if prefix else "" + col_names = [ + prefix + name + f.split("fingerprint")[1] + for f in fingerprint_encoder.get_feature_names_out() + ] df = pd.DataFrame(features, columns=col_names, dtype=DTypeFloatNumpy) return df From 3ca34984f294c6885f6d61e31641364554c3a7fc Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 
18 Oct 2024 09:40:49 +0200 Subject: [PATCH 73/87] Update baybe/utils/chemistry.py Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- baybe/utils/chemistry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 6c3d24f23..94e6e7c25 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -117,7 +117,7 @@ def smiles_to_fingerprint_features( warnings.warn( f"Substance encoding '{encoding.name}' is deprecated and will be disabled " f"in a future version. Use '{SubstanceEncoding.ECFP.name}' " - f"with 'fp_size' 1204 and 'radius' 4 instead.", + f"with 'fp_size' 1024 and 'radius' 4 instead.", DeprecationWarning, ) encoding = SubstanceEncoding.ECFP From 8ee7f200000f9bba415a349b0f9acd186e017203 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 18 Oct 2024 09:42:13 +0200 Subject: [PATCH 74/87] rm skfp from feature name --- baybe/utils/chemistry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 94e6e7c25..bdf5a620a 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -150,7 +150,7 @@ def smiles_to_fingerprint_features( for mol in mol_list ] ) - name = f"skfp{fingerprint_encoder.__class__.__name__.removesuffix('Fingerprint')}_" + name = f"{fingerprint_encoder.__class__.__name__.removesuffix('Fingerprint')}_" prefix = prefix + "_" if prefix else "" col_names = [ prefix + name + f.split("fingerprint")[1] From 76f4a57299a58e478f3cffb4449f8ee7be4b4cbc Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Fri, 18 Oct 2024 09:48:27 +0200 Subject: [PATCH 75/87] add missing FPs --- baybe/parameters/enum.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index fd4603fcb..f1e68f4c7 100644 --- a/baybe/parameters/enum.py +++ 
b/baybe/parameters/enum.py @@ -132,5 +132,8 @@ class SubstanceEncoding(ParameterEncoding): USRCAT = "USRCAT" """:class:`skfp.fingerprints.USRCATFingerprint`""" + VSA = "VSA" + """:class:`skfp.fingerprints.VSAFingerprint`""" + WHIM = "WHIM" """:class:`skfp.fingerprints.WHIMFingerprint`""" From 961ba26d9c6cb2ea0e9586e7155a9962102301c5 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 18 Oct 2024 12:52:33 +0200 Subject: [PATCH 76/87] Change type from PropertyMol to Mol --- baybe/utils/chemistry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index bdf5a620a..ab02e9ada 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -74,7 +74,7 @@ def name_to_smiles(name: str) -> str: @_disk_cache def _molecule_to_fingerprint_features( fingerprint_encoder: BaseFingerprintTransformer, - molecule: str | Chem.PropertyMol.PropertyMol, + molecule: str | Chem.Mol, ) -> np.ndarray: """Compute molecular fingerprint for a single SMILES string. 
From a9e0414107ea9ab746e5ef1d1ca8f769072121e6 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 18 Oct 2024 13:05:53 +0200 Subject: [PATCH 77/87] Move Boolean utility to edbo.py --- baybe/searchspace/core.py | 9 -------- .../gaussian_process/presets/edbo.py | 23 ++++++++++++++++--- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/baybe/searchspace/core.py b/baybe/searchspace/core.py index 83ac5e1e6..c6e26de72 100644 --- a/baybe/searchspace/core.py +++ b/baybe/searchspace/core.py @@ -16,7 +16,6 @@ from baybe.constraints.base import Constraint from baybe.parameters import TaskParameter from baybe.parameters.base import Parameter -from baybe.parameters.enum import SubstanceEncoding from baybe.searchspace.continuous import SubspaceContinuous from baybe.searchspace.discrete import ( MemorySize, @@ -223,14 +222,6 @@ def type(self) -> SearchSpaceType: return SearchSpaceType.HYBRID raise RuntimeError("This line should be impossible to reach.") - @property - def contains_fingerprint(self) -> bool: - """Indicates if any of the discrete parameters uses ``Fingerprint`` encoding.""" - return any( - p.encoding in SubstanceEncoding if p.encoding is not None else False - for p in self.discrete.parameters - ) - @property def comp_rep_columns(self) -> tuple[str, ...]: """The columns spanning the computational representation.""" diff --git a/baybe/surrogates/gaussian_process/presets/edbo.py b/baybe/surrogates/gaussian_process/presets/edbo.py index ee851f9e4..be9f0683c 100644 --- a/baybe/surrogates/gaussian_process/presets/edbo.py +++ b/baybe/surrogates/gaussian_process/presets/edbo.py @@ -2,6 +2,7 @@ from __future__ import annotations +from collections.abc import Collection from typing import TYPE_CHECKING from attrs import define @@ -9,7 +10,10 @@ from baybe.kernels.basic import MaternKernel from baybe.kernels.composite import ScaleKernel from baybe.parameters import TaskParameter +from baybe.parameters.enum import SubstanceEncoding +from 
baybe.parameters.substance import SubstanceParameter from baybe.priors.basic import GammaPrior +from baybe.searchspace.discrete import SubspaceDiscrete from baybe.surrogates.gaussian_process.kernel_factory import KernelFactory if TYPE_CHECKING: @@ -19,6 +23,17 @@ from baybe.searchspace.core import SearchSpace +def _contains_encoding( + subspace: SubspaceDiscrete, encodings: Collection[SubstanceEncoding] +) -> bool: + """Tell if any of the substance parameters uses one of the specified encodings.""" + return any( + p.encoding in encodings + for p in subspace.parameters + if isinstance(p, SubstanceParameter) + ) + + @define class EDBOKernelFactory(KernelFactory): """A factory providing the kernel for Gaussian process surrogates adapted from EDBO. @@ -36,7 +51,9 @@ def __call__( # noqa: D102 [p for p in searchspace.parameters if isinstance(p, TaskParameter)] ) - uses_descriptors = searchspace.contains_fingerprint and (effective_dims >= 50) + switching_condition = _contains_encoding( + searchspace.discrete, (SubstanceEncoding.MORDRED, SubstanceEncoding.RDKIT) + ) and (effective_dims >= 50) # low D priors if effective_dims < 5: @@ -46,14 +63,14 @@ def __call__( # noqa: D102 outputscale_initial_value = 8.0 # DFT optimized priors - elif uses_descriptors and effective_dims < 100: + elif switching_condition and effective_dims < 100: lengthscale_prior = GammaPrior(2.0, 0.2) lengthscale_initial_value = 5.0 outputscale_prior = GammaPrior(5.0, 0.5) outputscale_initial_value = 8.0 # Mordred optimized priors - elif uses_descriptors: + elif switching_condition: lengthscale_prior = GammaPrior(2.0, 0.1) lengthscale_initial_value = 10.0 outputscale_prior = GammaPrior(2.0, 0.1) From bc6e979a8a72575054af7558cdc65f5ef1cc99df Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 18 Oct 2024 13:20:19 +0200 Subject: [PATCH 78/87] Refactor caching utility * Reorder arguments: molecule first * Rename encoder argument * Reformat and simplify docstrings * Avoid hard-coded docstring references 
to string input --- baybe/utils/chemistry.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index ab02e9ada..385944f4c 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -73,22 +73,19 @@ def name_to_smiles(name: str) -> str: @lru_cache(maxsize=None) @_disk_cache def _molecule_to_fingerprint_features( - fingerprint_encoder: BaseFingerprintTransformer, molecule: str | Chem.Mol, + encoder: BaseFingerprintTransformer, ) -> np.ndarray: - """Compute molecular fingerprint for a single SMILES string. + """Compute molecular fingerprint for a single molecule. Args: - fingerprint_encoder: Instance of the fingerprint class used to - transform smiles string to fingerprint - molecule: Smiles string or molecule object, - depending on what should be input into - ``transform`` of ``fingerprint_encoder``. + molecule: SMILES string or molecule object. + encoder: Instance of the fingerprint class to be used for computation. Returns: - Array containing fingerprint for SMILES string. + Array of fingerprint features. 
""" - return fingerprint_encoder.transform([molecule]) + return encoder.transform([molecule]) def smiles_to_fingerprint_features( @@ -144,9 +141,7 @@ def smiles_to_fingerprint_features( features = np.concatenate( [ - _molecule_to_fingerprint_features( - fingerprint_encoder=fingerprint_encoder, molecule=mol - ) + _molecule_to_fingerprint_features(mol, fingerprint_encoder) for mol in mol_list ] ) From 4d6fad73f2148cef417233e0a3d6bed0168c542c Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 18 Oct 2024 15:25:51 +0200 Subject: [PATCH 79/87] Fix EDBO logic * Add deprecated RDKIT2DDESCRIPTORS to condition * Implement forgotten changes in noise factory --- .../gaussian_process/presets/edbo.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/baybe/surrogates/gaussian_process/presets/edbo.py b/baybe/surrogates/gaussian_process/presets/edbo.py index be9f0683c..112519d70 100644 --- a/baybe/surrogates/gaussian_process/presets/edbo.py +++ b/baybe/surrogates/gaussian_process/presets/edbo.py @@ -34,6 +34,14 @@ def _contains_encoding( ) +_EDBO_ENCODINGS = ( + SubstanceEncoding.MORDRED, + SubstanceEncoding.RDKIT, + SubstanceEncoding.RDKIT2DDESCRIPTORS, +) +"""Encodings relevant to EDBO logic.""" + + @define class EDBOKernelFactory(KernelFactory): """A factory providing the kernel for Gaussian process surrogates adapted from EDBO. 
@@ -52,7 +60,7 @@ def __call__( # noqa: D102 ) switching_condition = _contains_encoding( - searchspace.discrete, (SubstanceEncoding.MORDRED, SubstanceEncoding.RDKIT) + searchspace.discrete, _EDBO_ENCODINGS ) and (effective_dims >= 50) # low D priors @@ -110,18 +118,20 @@ def _edbo_noise_factory( [p for p in searchspace.parameters if isinstance(p, TaskParameter)] ) - uses_descriptors = searchspace.contains_fingerprint and effective_dims >= 50 + switching_condition = _contains_encoding( + searchspace.discrete, _EDBO_ENCODINGS + ) and (effective_dims >= 50) # low D priors if effective_dims < 5: return (GammaPrior(1.05, 0.5), 0.1) # DFT optimized priors - elif uses_descriptors and effective_dims < 100: + elif switching_condition and effective_dims < 100: return (GammaPrior(1.5, 0.1), 5.0) # Mordred optimized priors - elif uses_descriptors: + elif switching_condition: return (GammaPrior(1.5, 0.1), 5.0) # OHE optimized priors From 840cf97b7ee7b3b8f263564e73e8fd08c0b1f7bc Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:07:58 +0200 Subject: [PATCH 80/87] add missing fp --- baybe/parameters/enum.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/baybe/parameters/enum.py b/baybe/parameters/enum.py index f1e68f4c7..79a4e8df6 100644 --- a/baybe/parameters/enum.py +++ b/baybe/parameters/enum.py @@ -45,6 +45,9 @@ class SubstanceEncoding(ParameterEncoding): ECFP = "ECFP" """:class:`skfp.fingerprints.ECFPFingerprint`""" + ELECTROSHAPE = "ELECTROSHAPE" + """:class:`skfp.fingerprints.ElectroShapeFingerprint`""" + MORGAN_FP = "MORGAN_FP" """ Deprecated! 
Uses :class:`skfp.fingerprints.ECFPFingerprint` with ``fp_size=1024`` From 2da418157636d6681fd89af33472c49df1232f99 Mon Sep 17 00:00:00 2001 From: Karin Hrovatin <47607471+Hrovatin@users.noreply.github.com> Date: Thu, 24 Oct 2024 15:40:47 +0200 Subject: [PATCH 81/87] In fingerprint feature names replace fingerprint class with SubstanceEncoding parameter name --- baybe/utils/chemistry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 385944f4c..804d88f3b 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -145,7 +145,7 @@ def smiles_to_fingerprint_features( for mol in mol_list ] ) - name = f"{fingerprint_encoder.__class__.__name__.removesuffix('Fingerprint')}_" + name = f"{encoding.name}_" prefix = prefix + "_" if prefix else "" col_names = [ prefix + name + f.split("fingerprint")[1] From 20bc64f3c1a19ac617311de063b059fe8f4bc34d Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 1 Nov 2024 14:14:42 +0100 Subject: [PATCH 82/87] Fix docstring --- baybe/utils/chemistry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 804d88f3b..077db1d12 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -163,7 +163,7 @@ def get_fingerprint_class(encoding: SubstanceEncoding) -> BaseFingerprintTransfo encoding: A substance encoding. Raises: - ValueError: If no fingerprint class for the specified encoding. + ValueError: If no fingerprint class for the specified encoding is found. Returns: The fingerprint class. 
From 38fe179961bb3b263623dc83c7737c26876d8dc2 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 1 Nov 2024 14:52:04 +0100 Subject: [PATCH 83/87] Remove statement that all scikit-fingerprints encodings are available --- docs/userguide/parameters.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index 5b68248f3..e568ff6cd 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -136,10 +136,9 @@ SubstanceParameter( ) ``` -The ``encoding`` option defines what kind of descriptors are calculated using the +The ``encoding`` defines what kind of descriptors are calculated using the [scikit-fingerprints] package. -Any of the fingerprint classes provided by the package can be used. -The encoding can be specified either by passing the corresponding [`SubstanceEncoding`] member +It can be specified either by passing the corresponding [`SubstanceEncoding`] member (click to see full list of options) or its string representation, e.g. use [`SubstanceParameter.MORDRED`](baybe.parameters.enum.SubstanceEncoding.MORDRED) or its string alias `"MORDRED"` to select the {class}`~skfp.fingerprints.MordredFingerprint`. 
From 6377b7dfe2bdb3078a64744d1ad9aff644550f43 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Fri, 1 Nov 2024 21:46:29 +0100 Subject: [PATCH 84/87] Fix optional test imports --- tests/test_deprecations.py | 14 ++++++++------ tests/test_fingerprints.py | 13 ++++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index eac952785..0a8dd854f 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -6,7 +6,6 @@ import pandas as pd import pytest from pytest import param -from skfp.fingerprints import ECFPFingerprint, RDKit2DDescriptorsFingerprint from baybe._optional.info import CHEM_INSTALLED from baybe.acquisition.base import AcquisitionFunction @@ -31,7 +30,6 @@ from baybe.searchspace.validation import get_transform_parameters from baybe.targets.binary import BinaryTarget from baybe.targets.numerical import NumericalTarget -from baybe.utils.chemistry import smiles_to_fingerprint_features def test_objective_class(): @@ -248,8 +246,8 @@ def test_target_transform_interface(): @pytest.mark.parametrize( ("deprecated", "replacement"), [ - param(SubstanceEncoding.MORGAN_FP, ECFPFingerprint, id="morgan"), - param(SubstanceEncoding.RDKIT, RDKit2DDescriptorsFingerprint, id="rdkit"), + param(SubstanceEncoding.MORGAN_FP, "ECFPFingerprint", id="morgan"), + param(SubstanceEncoding.RDKIT, "RDKit2DDescriptorsFingerprint", id="rdkit"), ], ) @pytest.mark.skipif( @@ -257,9 +255,13 @@ def test_target_transform_interface(): ) def test_deprecated_encodings(deprecated, replacement): """Deprecated encoding raises a warning and uses correct replacement.""" - path = f"skfp.fingerprints.{replacement.__name__}" + import skfp.fingerprints - with patch(path, wraps=replacement) as patched: + from baybe.utils.chemistry import smiles_to_fingerprint_features + + path = f"skfp.fingerprints.{replacement}" + + with patch(path, wraps=getattr(skfp.fingerprints, replacement)) as patched: # Assert warning 
with pytest.warns(DeprecationWarning): smiles_to_fingerprint_features(["C"], deprecated) diff --git a/tests/test_fingerprints.py b/tests/test_fingerprints.py index c6d086958..96782caa3 100644 --- a/tests/test_fingerprints.py +++ b/tests/test_fingerprints.py @@ -4,7 +4,6 @@ from baybe._optional.info import CHEM_INSTALLED from baybe.parameters.enum import SubstanceEncoding -from baybe.utils.chemistry import smiles_to_fingerprint_features test_cases: list[tuple[SubstanceEncoding, dict, dict]] = [ (enc, {}, {}) @@ -35,6 +34,8 @@ ) def test_fingerprint_kwargs(encoding, kw_fp, kw_conf): """Test all fingerprint computations.""" + from baybe.utils.chemistry import smiles_to_fingerprint_features + smiles = ["CC(N(C)C)=O", "CCCC#N"] x = smiles_to_fingerprint_features( smiles=smiles, @@ -45,13 +46,11 @@ def test_fingerprint_kwargs(encoding, kw_fp, kw_conf): ) assert x.shape[0] == len(smiles), ( - "The number of fingerprint " - + "embedding rows does not match " - + "the number of molecules." + "The number of fingerprint embedding rows does not match the number of " + "molecules." ) if "fp_size" in kw_fp: assert x.shape[1] == kw_fp["fp_size"], ( - "The fingerprint dimension " - + "parameter was ignored, fingerprints " - + "have a wrong number of dimensions." + "The fingerprint dimension parameter was ignored, fingerprints have a " + "wrong number of dimensions." ) From 370f1b8d8377c4ef2199c4e4ae298364b98d8283 Mon Sep 17 00:00:00 2001 From: "Alexander V. 
Hopp" Date: Mon, 4 Nov 2024 14:10:46 +0100 Subject: [PATCH 85/87] Fix incorrect link in parameter userguide --- docs/userguide/parameters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userguide/parameters.md b/docs/userguide/parameters.md index e568ff6cd..3074c73a2 100644 --- a/docs/userguide/parameters.md +++ b/docs/userguide/parameters.md @@ -1,6 +1,6 @@ [`SearchSpace`]: baybe.searchspace.core.SearchSpace [`Constraint`]: baybe.constraints.base.Constraint -[`SubstanceParameter`]: baybe.parameters.categorical.SubstanceParameter +[`SubstanceParameter`]: baybe.parameters.substance.SubstanceParameter [`CategoricalParameter`]: baybe.parameters.categorical.CategoricalParameter [`TaskParameter`]: baybe.parameters.categorical.TaskParameter [`CustomDiscreteParameter`]: baybe.parameters.custom.CustomDiscreteParameter From 01b06511d79cdcad2b4168cbc8e1ea6135859d7e Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 5 Nov 2024 23:14:08 +0100 Subject: [PATCH 86/87] Bump myst-parser version --- .lockfiles/py310-dev.lock | 28 +++++++++------------------- pyproject.toml | 12 ++++++------ 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/.lockfiles/py310-dev.lock b/.lockfiles/py310-dev.lock index 4a7aeeede..e4aeac9b4 100644 --- a/.lockfiles/py310-dev.lock +++ b/.lockfiles/py310-dev.lock @@ -175,7 +175,7 @@ funcy==1.17 # funcy-stubs funcy-stubs==0.1.1 # via baybe (pyproject.toml) -furo==2024.7.18 +furo==2024.8.6 # via baybe (pyproject.toml) future==1.0.0 # via autograd @@ -238,7 +238,6 @@ ipykernel==6.29.5 # jupyter # jupyter-console # jupyterlab - # qtconsole ipython==8.26.0 # via # ipykernel @@ -281,7 +280,7 @@ jsonschema==4.23.0 # nbformat jsonschema-specifications==2023.12.1 # via jsonschema -jupyter==1.0.0 +jupyter==1.1.1 # via baybe (pyproject.toml) jupyter-client==8.6.2 # via @@ -289,7 +288,6 @@ jupyter-client==8.6.2 # jupyter-console # jupyter-server # nbclient - # qtconsole jupyter-console==6.6.3 # via jupyter 
jupyter-core==5.7.2 @@ -302,7 +300,6 @@ jupyter-core==5.7.2 # nbclient # nbconvert # nbformat - # qtconsole jupyter-events==0.10.0 # via jupyter-server jupyter-lsp==2.2.5 @@ -317,7 +314,9 @@ jupyter-server==2.14.2 jupyter-server-terminals==0.5.3 # via jupyter-server jupyterlab==4.2.4 - # via notebook + # via + # jupyter + # notebook jupyterlab-pygments==0.3.0 # via nbconvert jupyterlab-server==2.27.3 @@ -326,7 +325,7 @@ jupyterlab-server==2.27.3 # notebook jupyterlab-widgets==3.0.11 # via ipywidgets -jupytext==1.16.3 +jupytext==1.16.4 # via baybe (pyproject.toml) kiwisolver==1.4.5 # via matplotlib @@ -393,7 +392,7 @@ mypy==1.11.0 # via baybe (pyproject.toml) mypy-extensions==1.0.0 # via mypy -myst-parser==3.0.1 +myst-parser==4.0.0 # via baybe (pyproject.toml) nbclient==0.10.0 # via nbconvert @@ -564,8 +563,6 @@ packaging==24.1 # plotly # pyproject-api # pytest - # qtconsole - # qtpy # setuptools-scm # sphinx # streamlit @@ -679,7 +676,6 @@ pygments==2.18.0 # ipython # jupyter-console # nbconvert - # qtconsole # rich # sphinx pyparsing==3.1.2 @@ -734,11 +730,6 @@ pyzmq==26.0.3 # jupyter-client # jupyter-console # jupyter-server - # qtconsole -qtconsole==5.5.2 - # via jupyter -qtpy==2.4.1 - # via qtconsole rdkit==2024.3.3 # via # descriptastorus @@ -843,7 +834,7 @@ sortedcontainers==2.4.0 # hypothesis soupsieve==2.5 # via beautifulsoup4 -sphinx==7.4.7 +sphinx==8.1.3 # via # baybe (pyproject.toml) # furo @@ -852,7 +843,7 @@ sphinx==7.4.7 # sphinx-basic-ng # sphinx-copybutton # sphinxcontrib-bibtex -sphinx-autodoc-typehints==2.2.3 +sphinx-autodoc-typehints==2.5.0 # via baybe (pyproject.toml) sphinx-basic-ng==1.0.0b2 # via furo @@ -961,7 +952,6 @@ traitlets==5.14.3 # nbclient # nbconvert # nbformat - # qtconsole triton==2.3.1 ; python_full_version < '3.12' and platform_machine == 'x86_64' and platform_system == 'Linux' # via torch typeguard==2.13.3 diff --git a/pyproject.toml b/pyproject.toml index 5ec619001..4790d89aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-96,13 +96,13 @@ dev = [ docs = [ "baybe[examples]", # docs cannot be built without running examples "furo>=2023.09.10", - "jupyter>=1.0.0", - "jupytext>=1.16.1", - "myst-parser>=2.0.0", - "sphinx>=7.1.1", - "sphinx-autodoc-typehints>=1.24.0", + "jupyter>=1.1.1", + "jupytext>=1.16.4", + "myst-parser>=4.0.0", + "sphinx>=8.0.2", + "sphinx-autodoc-typehints>=2.4.4", "sphinx-copybutton==0.5.2", - "sphinxcontrib-bibtex>=2.6.2 ", + "sphinxcontrib-bibtex>=2.6.2", ] examples = [ From 80a113d39bf48aea96d44a69cd0938d01e958f18 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Mon, 11 Nov 2024 17:11:18 +0100 Subject: [PATCH 87/87] Turn off disk caching verbosity --- baybe/utils/chemistry.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py index 077db1d12..6e20f0fe2 100644 --- a/baybe/utils/chemistry.py +++ b/baybe/utils/chemistry.py @@ -33,7 +33,9 @@ def _dummy_wrapper(func): return func -_disk_cache = _dummy_wrapper if _cachedir == "" else Memory(Path(_cachedir)).cache +_disk_cache = ( + _dummy_wrapper if _cachedir == "" else Memory(Path(_cachedir), verbose=0).cache +) def name_to_smiles(name: str) -> str: