Refactor callable lookup mechanism (#441)

This PR refactors the callable lookup mechanism in the simulation module, fixing one of the major limitations of the module and preparing it for multi-target simulation. In particular:

* The position-based approach relying on the order of input arguments to the callable is replaced with a proper dataframe-based approach, similar to user-facing functionality elsewhere. That is, the callable now expects a dataframe containing columns corresponding to the search space parameters and returns a dataframe containing columns corresponding to the targets of the objective.
* An `arrays_to_dataframes` decorator utility has been added that makes it convenient to construct such callables from array-based ones.
* The `custom_analytical` example has been renamed to `custom_blackbox` and was rewritten from scratch, using the new approach and fixing the existing issues.
* Redundant examples have been removed.
* The `filter_df` function has been fixed such that it now correctly handles the edge case of an empty filter.
emdgroup · Dec 20, 2024 · fd78099 · fd78099
2 parents 813ab62 + a6cc05a
commit fd78099
Show file tree

Hide file tree

Showing 32 changed files with 1,598 additions and 1,850 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -2,5 +2,4 @@
 omit =
     baybe/utils/plotting.py
     baybe/utils/random.py
-    baybe/utils/botorch_wrapper.py
     baybe/simulation/*
diff --git a/.lockfiles/py310-dev.lock b/.lockfiles/py310-dev.lock
@@ -135,6 +135,7 @@ docutils==0.21.2
     #   myst-parser
     #   pybtex-docutils
     #   sphinx
+    #   sphinx-paramlinks
     #   sphinxcontrib-bibtex
 e3fp==1.2.5
     # via scikit-fingerprints
@@ -856,13 +857,16 @@ sphinx==8.1.3
     #   sphinx-autodoc-typehints
     #   sphinx-basic-ng
     #   sphinx-copybutton
+    #   sphinx-paramlinks
     #   sphinxcontrib-bibtex
 sphinx-autodoc-typehints==2.5.0
     # via baybe (pyproject.toml)
 sphinx-basic-ng==1.0.0b2
     # via furo
 sphinx-copybutton==0.5.2
     # via baybe (pyproject.toml)
+sphinx-paramlinks==0.6.0
+    # via baybe (pyproject.toml)
 sphinxcontrib-applehelp==1.0.8
     # via sphinx
 sphinxcontrib-bibtex==2.6.2

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,12 +5,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+### Breaking Changes 
+- Lookup callables for simulation are now expected to accept/return dataframes with
+  the corresponding parameter/target column labels
+
 ### Added
 - `allow_missing` and `allow_extra` keyword arguments to `Objective.transform`
 - Example for a traditional mixture
 - `add_noise_to_perturb_degenerate_rows` utility
 - `benchmarks` subpackage for defining and running performance tests
 - `Campaign.toggle_discrete_candidates` to dynamically in-/exclude discrete candidates
+- `filter_df` utility for filtering dataframe content
+- `arrays_to_dataframes` decorator to create lookups from array-based callables
 - `DiscreteConstraint.get_valid` to conveniently access valid candidates
 - Functionality for persisting benchmarking results on S3 from a manual pipeline run
 
@@ -36,6 +42,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Crash when using `ContinuousCardinalityConstraint` caused by an unintended interplay
   between constraints and dropped parameters yielding empty parameter sets
 
+### Removed
+- `botorch_function_wrapper` utility for creating lookup callables
+
 ### Deprecations
 - Passing a dataframe via the `data` argument to `Objective.transform` is no longer
   possible. The dataframe must now be passed as positional argument.

diff --git a/baybe/simulation/core.py b/baybe/simulation/core.py
@@ -23,7 +23,7 @@
 
 def simulate_experiment(
     campaign: Campaign,
-    lookup: pd.DataFrame | Callable | None = None,
+    lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None = None,
     /,
     *,
     batch_size: int = 1,
@@ -85,8 +85,6 @@ def simulate_experiment(
         * for each target a column ``{targetname}_Measurements``:
           The individual measurements obtained for the respective target and iteration
     """
-    # TODO: Due to the "..." operator, sphinx does not render this properly. Might
-    #   want to investigate in the future.
     # TODO: Use a `will_terminate` campaign property to decide if the campaign will
     #   run indefinitely or not, and allow omitting `n_doe_iterations` for the latter.
     if campaign.objective is None:

diff --git a/baybe/simulation/lookup.py b/baybe/simulation/lookup.py
@@ -19,7 +19,7 @@
 def look_up_targets(
     queries: pd.DataFrame,
     targets: Collection[Target],
-    lookup: pd.DataFrame | Callable | None,
+    lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None,
     impute_mode: Literal[
         "error", "worst", "best", "mean", "random", "ignore"
     ] = "error",
@@ -70,41 +70,19 @@ def look_up_targets(
     if lookup is None:
         add_fake_measurements(queries, targets)
     elif isinstance(lookup, Callable):
-        _look_up_targets_from_callable(queries, targets, lookup)
+        _look_up_targets_from_callable(queries, lookup)
     elif isinstance(lookup, pd.DataFrame):
         _look_up_targets_from_dataframe(queries, targets, lookup, impute_mode)
     else:
         raise ValueError("Unsupported lookup mechanism.")
 
 
 def _look_up_targets_from_callable(
-    queries: pd.DataFrame,
-    targets: Collection[Target],
-    lookup: Callable,
+    queries: pd.DataFrame, lookup: Callable[[pd.DataFrame], pd.DataFrame]
 ) -> None:
     """Look up target values by querying a callable."""
-    # TODO: Currently, the alignment of return values to targets is based on the
-    #   column ordering, which is not robust. Instead, the callable should return
-    #   a dataframe with properly labeled columns.
-
-    # Since the return of a lookup function is a tuple, the following code stores
-    # tuples of floats in a single column with label 0:
-    measured_targets = queries.apply(lambda x: lookup(*x.values), axis=1).to_frame()
-    # We transform this column to a DataFrame in which there is an individual
-    # column for each of the targets....
-    split_target_columns = pd.DataFrame(
-        measured_targets[0].to_list(), index=measured_targets.index
-    )
-    # ... and assign this to measured_targets in order to have one column per target
-    measured_targets[split_target_columns.columns] = split_target_columns
-    if measured_targets.shape[1] != len(targets):
-        raise AssertionError(
-            "If you use an analytical function as lookup, make sure "
-            "the configuration has the right amount of targets "
-            "specified."
-        )
-    for k_target, target in enumerate(targets):
-        queries[target.name] = measured_targets.iloc[:, k_target]
+    df_targets = lookup(queries)
+    queries[df_targets.columns] = df_targets.values
 
 
 def _look_up_targets_from_dataframe(

diff --git a/baybe/simulation/scenarios.py b/baybe/simulation/scenarios.py
@@ -22,7 +22,7 @@
 
 def simulate_scenarios(
     scenarios: dict[Any, Campaign],
-    lookup: pd.DataFrame | Callable | None = None,
+    lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None = None,
     /,
     *,
     batch_size: int = 1,
@@ -161,7 +161,7 @@ def unpack_simulation_results(array: DataArray) -> pd.DataFrame:
 
 def _simulate_groupby(
     campaign: Campaign,
-    lookup: pd.DataFrame | Callable[..., tuple[float, ...]] | None = None,
+    lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None = None,
     /,
     *,
     batch_size: int = 1,

diff --git a/baybe/utils/botorch_wrapper.py b/baybe/utils/botorch_wrapper.py
diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py
@@ -2,8 +2,9 @@
 
 from __future__ import annotations
 
+import functools
 import logging
-from collections.abc import Collection, Iterable, Sequence
+from collections.abc import Callable, Collection, Iterable, Sequence
 from typing import TYPE_CHECKING, Literal, TypeVar, overload
 
 import numpy as np
@@ -21,6 +22,8 @@
     from baybe.targets.base import Target
 
     _T = TypeVar("_T", bound=Parameter | Target)
+    _ArrayLike = TypeVar("_ArrayLike", np.ndarray, Tensor)
+
 
 # Logging
 _logger = logging.getLogger(__name__)
@@ -604,7 +607,7 @@ def get_transform_objects(
 
 
 def filter_df(
-    df: pd.DataFrame, filter: pd.DataFrame, complement: bool = False
+    df: pd.DataFrame, /, to_keep: pd.DataFrame, complement: bool = False
 ) -> pd.DataFrame:
     """Filter a dataframe based on a second dataframe defining filtering conditions.
 
@@ -613,9 +616,11 @@ def filter_df(
 
     Args:
         df: The dataframe to be filtered.
-        filter: The dataframe defining the filtering conditions.
+        to_keep: The dataframe defining the filtering conditions. By default
+            (see ``complement`` argument), it defines the rows to be kept in the sense
+            of an inner join.
         complement: If ``False``, the filter dataframe determines the rows to be kept
-            (i.e. selection via regular join). If ``True``, the filtering mechanism is
+            (i.e. selection via inner join). If ``True``, the filtering mechanism is
             inverted so that the complement set of rows is kept (i.e. selection
             via anti-join).
 
@@ -643,13 +648,30 @@ def filter_df(
            num cat
         2    1   a
         3    1   b
+
+        >>> filter_df(df, pd.DataFrame(), complement=True)
+           num cat
+        0    0   a
+        1    0   b
+        2    1   a
+        3    1   b
+
+        >>> filter_df(df, pd.DataFrame(), complement=False)
+        Empty DataFrame
+        Columns: [num, cat]
+        Index: []
+
     """
+    # Handle special case of empty filter
+    if to_keep.empty:
+        return df if complement else pd.DataFrame(columns=df.columns)
+
     # Remember original index name
     index_name = df.index.name
 
     # Identify rows to be dropped
     out = pd.merge(
-        df.reset_index(names="_df_index"), filter, how="left", indicator=True
+        df.reset_index(names="_df_index"), to_keep, how="left", indicator=True
     ).set_index("_df_index")
     to_drop = out["_merge"] == ("both" if complement else "left_only")
 
@@ -661,3 +683,53 @@ def filter_df(
     out.index.name = index_name
 
     return out
+
+
+def arrays_to_dataframes(
+    input_labels: Sequence[str],
+    output_labels: Sequence[str],
+    /,
+    use_torch: bool = False,
+) -> Callable[
+    [Callable[[_ArrayLike], _ArrayLike]], Callable[[pd.DataFrame], pd.DataFrame]
+]:
+    """Make a decorator for labeling the input/output columns of array-based callables.
+
+    Useful for creating parameter-to-target lookups from array-based logic.
+    The decorator transforms a callable designed to work with unlabeled arrays such
+    that it can operate with dataframes instead. The original callable is expected to
+    accept and return two-dimensional arrays. When decorated, the callable accepts and
+    returns dataframes whose columns are mapped to the corresponding arrays based on the
+    specified label sequences.
+
+    Args:
+        input_labels: The sequence of labels for the input columns.
+        output_labels: The sequence of labels for the output columns.
+        use_torch: Flag indicating if the callable is to be called with a torch tensor
+            (``True``) or with a numpy array (``False``).
+
+    Returns:
+        The decorator for the given input and output labels.
+    """
+
+    def decorator(
+        fn: Callable[[_ArrayLike], _ArrayLike], /
+    ) -> Callable[[pd.DataFrame], pd.DataFrame]:
+        """Turn an array-based callable into a dataframe-based callable."""
+
+        @functools.wraps(fn)
+        def wrapper(df: pd.DataFrame, /) -> pd.DataFrame:
+            """Translate to/from an array-based callable using dataframes."""
+            array_in = df[list(input_labels)].to_numpy()
+            if use_torch:
+                import torch
+
+                with torch.no_grad():
+                    array_out = fn(torch.from_numpy(array_in)).numpy()
+            else:
+                array_out = fn(array_in)
+            return pd.DataFrame(array_out, columns=list(output_labels), index=df.index)
+
+        return wrapper
+
+    return decorator
diff --git a/benchmarks/domains/synthetic_2C1D_1C.py b/benchmarks/domains/synthetic_2C1D_1C.py
@@ -5,15 +5,16 @@
 from typing import TYPE_CHECKING
 
 import numpy as np
+import pandas as pd
 from numpy import pi, sin, sqrt
 from pandas import DataFrame
 
 from baybe.campaign import Campaign
 from baybe.parameters import NumericalContinuousParameter, NumericalDiscreteParameter
-from baybe.recommenders.pure.nonpredictive.sampling import RandomRecommender
+from baybe.recommenders import RandomRecommender
 from baybe.searchspace import SearchSpace
 from baybe.simulation import simulate_scenarios
-from baybe.targets import NumericalTarget, TargetMode
+from baybe.targets import NumericalTarget
 from benchmarks.definition import (
     Benchmark,
     ConvergenceExperimentSettings,
@@ -23,8 +24,9 @@
     from mpl_toolkits.mplot3d import Axes3D
 
 
-def _lookup(z: np.ndarray, x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    """Lookup that is used internally in the callable for the benchmark."""
+def _lookup(arr: np.ndarray, /) -> np.ndarray:
+    """Numpy-based lookup callable defining the objective function."""
+    x, y, z = np.array_split(arr, 3, axis=1)
     try:
         assert np.all(-2 * pi <= x) and np.all(x <= 2 * pi)
         assert np.all(-2 * pi <= y) and np.all(y <= 2 * pi)
@@ -40,6 +42,13 @@ def _lookup(z: np.ndarray, x: np.ndarray, y: np.ndarray) -> np.ndarray:
     )
 
 
+def lookup(df: pd.DataFrame, /) -> pd.DataFrame:
+    """Dataframe-based lookup callable used as the loop-closing element."""
+    return pd.DataFrame(
+        _lookup(df[["x", "y", "z"]].to_numpy()), columns=["target"], index=df.index
+    )
+
+
 def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame:
     """Hybrid synthetic test function.
 
@@ -60,24 +69,25 @@ def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame:
         NumericalDiscreteParameter("z", (1, 2, 3, 4)),
     ]
 
-    objective = NumericalTarget(name="target", mode=TargetMode.MAX).to_objective()
-    search_space = SearchSpace.from_product(parameters=parameters)
+    target = NumericalTarget(name="target", mode="MAX")
+    searchspace = SearchSpace.from_product(parameters=parameters)
+    objective = target.to_objective()
 
     scenarios: dict[str, Campaign] = {
         "Random Recommender": Campaign(
-            searchspace=search_space,
+            searchspace=searchspace,
             recommender=RandomRecommender(),
             objective=objective,
         ),
         "Default Recommender": Campaign(
-            searchspace=search_space,
+            searchspace=searchspace,
             objective=objective,
         ),
     }
 
     return simulate_scenarios(
         scenarios,
-        _lookup,
+        lookup,
         batch_size=settings.batch_size,
         n_doe_iterations=settings.n_doe_iterations,
         n_mc_iterations=settings.n_mc_iterations,
@@ -116,7 +126,9 @@ def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame:
     fig = plt.figure(figsize=(10, 10))
     for i, z in enumerate(Z):
         ax: Axes3D = fig.add_subplot(2, 2, i + 1, projection="3d")
-        t_mesh = _lookup(np.asarray(z), x_mesh, y_mesh)
+        t_mesh = _lookup(
+            np.c_[x_mesh.ravel(), y_mesh.ravel(), np.repeat(z, x_mesh.size)]
+        ).reshape(x_mesh.shape)
         ax.plot_surface(x_mesh, y_mesh, t_mesh)
         plt.title(f"{z=}")