test new struct
fabianliebig committed Nov 13, 2024
1 parent 3af57a5 commit 68561cc
Showing 87 changed files with 2,632 additions and 5,620 deletions.
70 changes: 45 additions & 25 deletions .github/workflows/manual_benchmark.yml
@@ -8,29 +8,49 @@
 permissions:
   id-token: write

 jobs:
-  add-runner:
-    uses: ./.github/workflows/runner_attach_workflow.yml
-    with:
-      RunnerCount: 1
-      RunnerComputeType: "M"
-    secrets:
-      GitHubToken: ${{ secrets.RUNNER_ATTACH_API_KEY }}
-      AWSRoleToAssume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
-  benchmark-test:
-    needs: add-runner
-    runs-on: self-hosted
-    env:
-      BAYBE_PERFORMANCE_PERSISTANCE_PATH: ${{ secrets.TEST_RESULT_S3_BUCKET }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      - uses: actions/setup-python@v5
-        id: setup-python
-        with:
-          python-version: '3.10'
-      - name: Benchmark
-        run: |
-          pip install '.[dev]'
-          python -m benchmark
+  add-runner:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Generate a token
+        id: generate-token
+        uses: actions/create-github-app-token@v1
+        with:
+          app-id: ${{ vars.APP_ID }}
+          private-key: ${{ secrets.APP_PRIVATE_KEY }}
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
+          role-session-name: Github_Add_Runner
+          aws-region: eu-central-1
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v2
+
+      - name: Execute Lambda function
+        run: |
+          aws lambda invoke --function-name jit_runner_register_and_create_runner_container --cli-binary-format raw-in-base64-out --payload '{"github_api_secret": "${{ steps.generate-token.outputs.token }}", "count_container": 1, "container_compute": "XS", "repository": "${{ github.repository }}" }' response.json
+          cat response.json
+          if ! grep -q '"statusCode": 200' response.json; then
+            echo "Lambda function failed. statusCode is not 200."
+            exit 1
+          fi
+  benchmark-test:
+    needs: add-runner
+    runs-on: self-hosted
+    env:
+      BAYBE_PERFORMANCE_PERSISTANCE_PATH: ${{ secrets.TEST_RESULT_S3_BUCKET }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-python@v5
+        id: setup-python
+        with:
+          python-version: "3.10"
+      - name: Benchmark
+        run: |
+          pip install '.[dev]'
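
For orientation only: a deliberately minimal, hypothetical sketch of what the `python -m benchmark` entry point invoked above could look like. Only the `BAYBE_PERFORMANCE_PERSISTANCE_PATH` variable is taken from the workflow; the module structure and names below are illustrative, not the repository's actual code.

```python
# Hypothetical sketch -- NOT the repository's benchmark module. It only
# illustrates how the environment variable set by the workflow could be read.
import os


def main() -> None:
    # The workflow points this at an S3 bucket via TEST_RESULT_S3_BUCKET.
    persist_path = os.environ.get("BAYBE_PERFORMANCE_PERSISTANCE_PATH", "results")
    print(f"Benchmark results would be persisted to: {persist_path}")


if __name__ == "__main__":
    main()
```
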
51 changes: 0 additions & 51 deletions .github/workflows/runner_attach_workflow.yml

This file was deleted.

22 changes: 22 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- `allow_missing` and `allow_extra` keyword arguments to `Objective.transform`
- Example for a traditional mixture
- `add_noise_to_perturb_degenerate_rows` utility
- `benchmarks` subpackage for defining and running performance tests
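
To illustrate the `allow_missing`/`allow_extra` keywords listed above, here is a minimal sketch. The objective construction and the positional dataframe argument are assumed from BayBE's public API and the deprecation notes below; details may differ.

```python
# Minimal sketch of the new Objective.transform keywords (assumed call style).
import pandas as pd

from baybe.objectives import SingleTargetObjective
from baybe.targets import NumericalTarget

objective = SingleTargetObjective(NumericalTarget(name="Yield", mode="MAX"))

measurements = pd.DataFrame({"Yield": [0.2, 0.5], "Comment": ["run A", "run B"]})

# allow_extra tolerates columns no target consumes (here: "Comment");
# allow_missing would instead tolerate absent target columns.
transformed = objective.transform(measurements, allow_extra=True)
print(transformed)
```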

### Changed
- `SubstanceParameter` encodings are now computed exclusively with the
`scikit-fingerprints` package, granting access to all fingerprints available therein
- Example for slot-based mixtures has been revised and grouped together with the new
traditional mixture example
- Memory caching is now non-verbose
- `CustomDiscreteParameter` does not allow duplicated rows in `data` anymore

### Fixed
- Rare bug arising from degenerate `SubstanceParameter.comp_df` rows that caused the
wrong number of recommendations to be returned

### Deprecations
- Passing a dataframe via the `data` argument to `Objective.transform` is no longer
@@ -16,6 +31,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `get_transform_parameters` has been replaced with `get_transform_objects`
- Passing a dataframe via the `data` argument to `Target.transform` is no longer
possible. The data must now be passed as a series as first positional argument.
- `SubstanceEncoding` value `MORGAN_FP`. As a replacement, `ECFP` with 1024 bits and
radius of 4 can be used.
- `SubstanceEncoding` value `RDKIT`. As a replacement, `RDKIT2DDESCRIPTORS` can be used.
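
A short sketch of the suggested replacements for the two deprecated encodings. The encoding names are taken from the entries above; the exact mechanism for configuring 1024 bits and radius 4 for `ECFP` is not shown here and is left as described in the changelog.

```python
# Sketch: replacing the deprecated MORGAN_FP and RDKIT encodings.
from baybe.parameters import SubstanceParameter

solvents = {"Water": "O", "DMSO": "CS(=O)C"}

# Instead of the deprecated MORGAN_FP value:
solvent_ecfp = SubstanceParameter(name="Solvent", data=solvents, encoding="ECFP")

# Instead of the deprecated RDKIT value:
solvent_rdkit2d = SubstanceParameter(
    name="Solvent", data=solvents, encoding="RDKIT2DDESCRIPTORS"
)
```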

## [0.11.3] - 2024-11-06
### Fixed
- `protobuf` dependency issue, version pin was removed

## [0.11.2] - 2024-10-11
### Added
6 changes: 5 additions & 1 deletion CONTRIBUTORS.md
@@ -25,4 +25,8 @@
- Di Jin (Merck Life Science KGaA, Darmstadt, Germany):\
Cardinality constraints
- Julian Streibel (Merck Life Science KGaA, Darmstadt, Germany):\
Bernoulli multi-armed bandit and Thompson sampling
Bernoulli multi-armed bandit and Thompson sampling
- Karin Hrovatin (Merck KGaA, Darmstadt, Germany):\
`scikit-fingerprints` support
- Fabian Liebig (Merck KGaA, Darmstadt, Germany):\
Benchmarking structure
4 changes: 3 additions & 1 deletion README.md
@@ -7,6 +7,7 @@

[![Supports Python](https://img.shields.io/pypi/pyversions/baybe?style=flat-square&label=Supports%20Python&labelColor=96d7d2&color=ffdcb9)](https://pypi.org/project/baybe/)
[![PyPI version](https://img.shields.io/pypi/v/baybe.svg?style=flat-square&label=PyPI%20Version&labelColor=96d7d2&color=ffdcb9)](https://pypi.org/project/baybe/)
[![Downloads](https://img.shields.io/pypi/dm/baybe?style=flat-square&label=Downloads&labelColor=96d7d2&color=ffdcb9)](https://pypistats.org/packages/baybe)
[![Issues](https://img.shields.io/github/issues/emdgroup/baybe?style=flat-square&label=Issues&labelColor=96d7d2&color=ffdcb9)](https://github.com/emdgroup/baybe/issues/)
[![PRs](https://img.shields.io/github/issues-pr/emdgroup/baybe?style=flat-square&label=PRs&labelColor=96d7d2&color=ffdcb9)](https://github.com/emdgroup/baybe/pulls/)
[![License](https://shields.io/badge/License-Apache%202.0-green.svg?style=flat-square&labelColor=96d7d2&color=ffdcb9)](http://www.apache.org/licenses/LICENSE-2.0)
@@ -113,7 +114,7 @@ parameters = [
"Solvent C": "O",
"Solvent D": "CS(=O)C",
},
encoding="MORDRED", # chemical encoding via mordred package
encoding="MORDRED", # chemical encoding via scikit-fingerprints
),
]
```
@@ -298,6 +299,7 @@ The available groups are:
- `polars`: Required for optimized search space construction via [Polars](https://docs.pola.rs/)
- `simulation`: Enabling the [simulation](https://emdgroup.github.io/baybe/stable/_autosummary/baybe.simulation.html) module.
- `test`: Required for running the tests.
- `benchmarking`: Required for running the benchmarking module.
- `dev`: All of the above plus `tox` and `pip-audit`. For code contributors.

## 📡 Telemetry
16 changes: 9 additions & 7 deletions baybe/_optional/chem.py
@@ -3,9 +3,11 @@
from baybe.exceptions import OptionalImportError

try:
from mordred import Calculator, descriptors
from rdkit import Chem, RDLogger
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from rdkit import Chem
from skfp import fingerprints
from skfp.bases import BaseFingerprintTransformer
from skfp.preprocessing import ConformerGenerator, MolFromSmilesTransformer

except ModuleNotFoundError as ex:
raise OptionalImportError(
"Chemistry functionality is unavailable because the necessary optional "
@@ -15,9 +17,9 @@
) from ex

__all__ = [
"descriptors",
"Calculator",
"Chem",
"GetMorganFingerprintAsBitVect",
"RDLogger",
"fingerprints",
"BaseFingerprintTransformer",
"ConformerGenerator",
"MolFromSmilesTransformer",
]
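
For context on the new optional imports, a small stand-alone sketch of how scikit-fingerprints is typically used. This reflects the library's public API as far as known, not code from this commit; parameter names such as `fp_size` are assumptions.

```python
# Stand-alone illustration of the scikit-fingerprints API exposed above.
from skfp.fingerprints import ECFPFingerprint
from skfp.preprocessing import MolFromSmilesTransformer

smiles = ["O", "CC(=O)O"]  # water, acetic acid
mols = MolFromSmilesTransformer().transform(smiles)
fingerprints = ECFPFingerprint(fp_size=1024, radius=2).transform(mols)
print(fingerprints.shape)  # expected: (2, 1024)
```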
7 changes: 3 additions & 4 deletions baybe/_optional/info.py
@@ -25,13 +25,12 @@ def exclude_sys_path(path: str, /): # noqa: DOC402, DOC404
# Individual packages
with exclude_sys_path(os.getcwd()):
FLAKE8_INSTALLED = find_spec("flake8") is not None
MORDRED_INSTALLED = find_spec("mordred") is not None
ONNX_INSTALLED = find_spec("onnxruntime") is not None
POLARS_INSTALLED = find_spec("polars") is not None
PRE_COMMIT_INSTALLED = find_spec("pre_commit") is not None
PYDOCLINT_INSTALLED = find_spec("pydoclint") is not None
RDKIT_INSTALLED = find_spec("rdkit") is not None
RUFF_INSTALLED = find_spec("ruff") is not None
SKFP_INSTALLED = find_spec("skfp") is not None # scikit-fingerprints
STREAMLIT_INSTALLED = find_spec("streamlit") is not None
XYZPY_INSTALLED = find_spec("xyzpy") is not None

@@ -43,8 +42,8 @@ def exclude_sys_path(path: str, /): # noqa: DOC402, DOC404
# directly depend on the flag – we thus simply set it to `True`.
TYPOS_INSTALLED = True

# Package combinations
CHEM_INSTALLED = MORDRED_INSTALLED and RDKIT_INSTALLED
# Information on whether all required packages for certain functionality are available
CHEM_INSTALLED = SKFP_INSTALLED
LINT_INSTALLED = all(
(
FLAKE8_INSTALLED,
2 changes: 0 additions & 2 deletions baybe/campaign.py
@@ -226,15 +226,13 @@ def recommend(
self,
batch_size: int,
pending_experiments: pd.DataFrame | None = None,
batch_quantity: int = None, # type: ignore[assignment]
) -> pd.DataFrame:
"""Provide the recommendations for the next batch of experiments.
Args:
batch_size: Number of requested recommendations.
pending_experiments: Parameter configurations specifying experiments
that are currently pending.
batch_quantity: Deprecated! Use ``batch_size`` instead.
Returns:
Dataframe containing the recommendations in experimental representation.
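
A usage sketch matching the signature shown above; the campaign construction is assumed from BayBE's public API and kept deliberately small.

```python
# Sketch: requesting recommendations with the updated signature
# (no more deprecated `batch_quantity` argument).
from baybe import Campaign
from baybe.objectives import SingleTargetObjective
from baybe.parameters import NumericalDiscreteParameter
from baybe.searchspace import SearchSpace
from baybe.targets import NumericalTarget

searchspace = SearchSpace.from_product(
    [NumericalDiscreteParameter(name="Temperature", values=(10, 20, 30))]
)
objective = SingleTargetObjective(NumericalTarget(name="Yield", mode="MAX"))
campaign = Campaign(searchspace=searchspace, objective=objective)

recommendations = campaign.recommend(batch_size=2)

# Experiments already running can be passed so they are taken into account:
next_batch = campaign.recommend(batch_size=2, pending_experiments=recommendations)
```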
10 changes: 5 additions & 5 deletions baybe/constraints/conditions.py
@@ -32,7 +32,7 @@


def _is_not_close(x: ArrayLike, y: ArrayLike, rtol: float, atol: float) -> np.ndarray:
"""Return a boolean array indicating where ``x`` and ``y`` are not close.
"""Return a Boolean array indicating where ``x`` and ``y`` are not close.
The counterpart to ``numpy.isclose``.
@@ -43,15 +43,15 @@ def _is_not_close(x: ArrayLike, y: ArrayLike, rtol: float, atol: float) -> np.nd
atol: The absolute tolerance parameter.
Returns:
A boolean array of where ``x`` and ``y`` are not equal within the
A Boolean array of where ``x`` and ``y`` are not equal within the
given tolerances.
"""
return np.logical_not(_is_close(x, y, rtol=rtol, atol=atol))


def _is_close(x: ArrayLike, y: ArrayLike, rtol: float, atol: float) -> np.ndarray:
"""Return a boolean array indicating where ``x`` and ``y`` are close.
"""Return a Boolean array indicating where ``x`` and ``y`` are close.
The equivalent to :func:``numpy.isclose``.
Using ``numpy.isclose`` with Polars dataframes results in this error:
@@ -64,7 +64,7 @@ def _is_close(x: ArrayLike, y: ArrayLike, rtol: float, atol: float) -> np.ndarra
atol: The absolute tolerance parameter.
Returns:
A boolean array of where ``x`` and ``y`` are equal within the
A Boolean array of where ``x`` and ``y`` are equal within the
given tolerances.
"""
@@ -107,7 +107,7 @@ def evaluate(self, data: pd.Series) -> pd.Series:
data: A series containing parameter values.
Returns:
A boolean series indicating which elements satisfy the condition.
A Boolean series indicating which elements satisfy the condition.
"""

@abstractmethod
2 changes: 1 addition & 1 deletion baybe/constraints/discrete.py
@@ -346,7 +346,7 @@ class DiscreteCustomConstraint(DiscreteConstraint):
# object variables
validator: Callable[[pd.DataFrame], pd.Series] = field()
"""A user-defined function modeling the validation of the constraint. The expected
return is a pandas series with boolean entries True/False for search space elements
return is a pandas series with Boolean entries True/False for search space elements
you want to keep/remove."""

@override
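
A hedged sketch of a validator callable fitting the contract described above: it receives a dataframe of parameter configurations and returns a Boolean series marking rows to keep. Parameter names are illustrative.

```python
# Sketch: a custom validator for DiscreteCustomConstraint.
import pandas as pd

from baybe.constraints import DiscreteCustomConstraint


def sums_below_limit(df: pd.DataFrame) -> pd.Series:
    """Keep only rows where the two illustrative parameters sum to at most 100."""
    return df["Temperature"] + df["Concentration"] <= 100


constraint = DiscreteCustomConstraint(
    parameters=["Temperature", "Concentration"], validator=sums_below_limit
)
```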
8 changes: 8 additions & 0 deletions baybe/parameters/custom.py
@@ -58,6 +58,7 @@ def _validate_custom_data( # noqa: DOC101, DOC103
ValueError: If the dataframe contains ``NaN``.
ValueError: If the dataframe index contains duplicates.
ValueError: If the dataframe contains columns with only one unique value.
ValueError: If the dataframe contains duplicated rows.
"""
if value.select_dtypes("number").shape[1] != value.shape[1]:
raise ValueError(
@@ -89,6 +90,13 @@ def _validate_custom_data( # noqa: DOC101, DOC103
f"The custom dataframe for parameter {self.name} has columns "
"that contain only a single value and hence carry no information."
)
if value.duplicated().any():
raise ValueError(
f"The custom dataframe for parameter {self.name} has duplicated rows. "
f"This is not supported because it can lead to ambiguous computational "
f"representations of candidate points. Please ensure all labels have a "
f"unique numerical representation."
)

@override
@property
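
To illustrate the new duplicate-row check, a small sketch using BayBE's public `CustomDiscreteParameter`; the descriptor values are made up.

```python
# Sketch: two labels sharing the same numerical representation now raise.
import pandas as pd

from baybe.parameters import CustomDiscreteParameter

descriptors = pd.DataFrame(
    {"d1": [1.0, 2.0, 2.0], "d2": [0.5, 1.5, 1.5]},
    index=["Additive A", "Additive B", "Additive C"],  # B and C are identical rows
)

try:
    CustomDiscreteParameter(name="Additive", data=descriptors)
except ValueError as err:
    print(err)  # duplicated rows are rejected
```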