Skip to content

Commit

Permalink
Refactor callable lookup mechanism (#441)
Browse files Browse the repository at this point in the history
This PR refactors the callable lookup mechanism in the simulation
module, fixing one of the major limitations of the module and preparing
it for multi-target simulation.

In particular:
* The position-based approach relying on the order of input arguments to
the callable is replaced with a proper dataframe-based approach, similar
to other user-facing functionality elsewhere. That is, the callable now
expects a dataframe containing columns corresponding to the search space
parameters and returns a dataframe containing columns corresponding to
the targets of the objective.
* An `arrays_to_dataframes` decorator utility has been added that makes it
convenient to construct such callables from array-based ones.
* The `custom_analytical` example has been renamed to `custom_blackbox`
and was rewritten from scratch, using the new approach and fixing the
existing issues.
* Redundant examples have been removed.
* The `filter_df` function has been fixed such that it now correctly
handles the edge case of an empty filter.
  • Loading branch information
AdrianSosic authored Dec 20, 2024
2 parents 813ab62 + a6cc05a commit fd78099
Show file tree
Hide file tree
Showing 32 changed files with 1,598 additions and 1,850 deletions.
1 change: 0 additions & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,4 @@
omit =
baybe/utils/plotting.py
baybe/utils/random.py
baybe/utils/botorch_wrapper.py
baybe/simulation/*
4 changes: 4 additions & 0 deletions .lockfiles/py310-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ docutils==0.21.2
# myst-parser
# pybtex-docutils
# sphinx
# sphinx-paramlinks
# sphinxcontrib-bibtex
e3fp==1.2.5
# via scikit-fingerprints
Expand Down Expand Up @@ -856,13 +857,16 @@ sphinx==8.1.3
# sphinx-autodoc-typehints
# sphinx-basic-ng
# sphinx-copybutton
# sphinx-paramlinks
# sphinxcontrib-bibtex
sphinx-autodoc-typehints==2.5.0
# via baybe (pyproject.toml)
sphinx-basic-ng==1.0.0b2
# via furo
sphinx-copybutton==0.5.2
# via baybe (pyproject.toml)
sphinx-paramlinks==0.6.0
# via baybe (pyproject.toml)
sphinxcontrib-applehelp==1.0.8
# via sphinx
sphinxcontrib-bibtex==2.6.2
Expand Down
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Breaking Changes
- Lookup callables for simulation are now expected to accept/return dataframes with
the corresponding parameter/target column labels

### Added
- `allow_missing` and `allow_extra` keyword arguments to `Objective.transform`
- Example for a traditional mixture
- `add_noise_to_perturb_degenerate_rows` utility
- `benchmarks` subpackage for defining and running performance tests
- `Campaign.toggle_discrete_candidates` to dynamically in-/exclude discrete candidates
- `filter_df` utility for filtering dataframe content
- `arrays_to_dataframes` decorator to create lookups from array-based callables
- `DiscreteConstraint.get_valid` to conveniently access valid candidates
- Functionality for persisting benchmarking results on S3 from a manual pipeline run

Expand All @@ -36,6 +42,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Crash when using `ContinuousCardinalityConstraint` caused by an unintended interplay
between constraints and dropped parameters yielding empty parameter sets

### Removed
- `botorch_function_wrapper` utility for creating lookup callables

### Deprecations
- Passing a dataframe via the `data` argument to `Objective.transform` is no longer
possible. The dataframe must now be passed as positional argument.
Expand Down
4 changes: 1 addition & 3 deletions baybe/simulation/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

def simulate_experiment(
campaign: Campaign,
lookup: pd.DataFrame | Callable | None = None,
lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None = None,
/,
*,
batch_size: int = 1,
Expand Down Expand Up @@ -85,8 +85,6 @@ def simulate_experiment(
* for each target a column ``{targetname}_Measurements``:
The individual measurements obtained for the respective target and iteration
"""
# TODO: Due to the "..." operator, sphinx does not render this properly. Might
# want to investigate in the future.
# TODO: Use a `will_terminate` campaign property to decide if the campaign will
# run indefinitely or not, and allow omitting `n_doe_iterations` for the latter.
if campaign.objective is None:
Expand Down
32 changes: 5 additions & 27 deletions baybe/simulation/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
def look_up_targets(
queries: pd.DataFrame,
targets: Collection[Target],
lookup: pd.DataFrame | Callable | None,
lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None,
impute_mode: Literal[
"error", "worst", "best", "mean", "random", "ignore"
] = "error",
Expand Down Expand Up @@ -70,41 +70,19 @@ def look_up_targets(
if lookup is None:
add_fake_measurements(queries, targets)
elif isinstance(lookup, Callable):
_look_up_targets_from_callable(queries, targets, lookup)
_look_up_targets_from_callable(queries, lookup)
elif isinstance(lookup, pd.DataFrame):
_look_up_targets_from_dataframe(queries, targets, lookup, impute_mode)
else:
raise ValueError("Unsupported lookup mechanism.")


def _look_up_targets_from_callable(
queries: pd.DataFrame,
targets: Collection[Target],
lookup: Callable,
queries: pd.DataFrame, lookup: Callable[[pd.DataFrame], pd.DataFrame]
) -> None:
"""Look up target values by querying a callable."""
# TODO: Currently, the alignment of return values to targets is based on the
# column ordering, which is not robust. Instead, the callable should return
# a dataframe with properly labeled columns.

# Since the return of a lookup function is a tuple, the following code stores
# tuples of floats in a single column with label 0:
measured_targets = queries.apply(lambda x: lookup(*x.values), axis=1).to_frame()
# We transform this column to a DataFrame in which there is an individual
# column for each of the targets....
split_target_columns = pd.DataFrame(
measured_targets[0].to_list(), index=measured_targets.index
)
# ... and assign this to measured_targets in order to have one column per target
measured_targets[split_target_columns.columns] = split_target_columns
if measured_targets.shape[1] != len(targets):
raise AssertionError(
"If you use an analytical function as lookup, make sure "
"the configuration has the right amount of targets "
"specified."
)
for k_target, target in enumerate(targets):
queries[target.name] = measured_targets.iloc[:, k_target]
df_targets = lookup(queries)
queries[df_targets.columns] = df_targets.values


def _look_up_targets_from_dataframe(
Expand Down
4 changes: 2 additions & 2 deletions baybe/simulation/scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

def simulate_scenarios(
scenarios: dict[Any, Campaign],
lookup: pd.DataFrame | Callable | None = None,
lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None = None,
/,
*,
batch_size: int = 1,
Expand Down Expand Up @@ -161,7 +161,7 @@ def unpack_simulation_results(array: DataArray) -> pd.DataFrame:

def _simulate_groupby(
campaign: Campaign,
lookup: pd.DataFrame | Callable[..., tuple[float, ...]] | None = None,
lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None = None,
/,
*,
batch_size: int = 1,
Expand Down
29 changes: 0 additions & 29 deletions baybe/utils/botorch_wrapper.py

This file was deleted.

82 changes: 77 additions & 5 deletions baybe/utils/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

from __future__ import annotations

import functools
import logging
from collections.abc import Collection, Iterable, Sequence
from collections.abc import Callable, Collection, Iterable, Sequence
from typing import TYPE_CHECKING, Literal, TypeVar, overload

import numpy as np
Expand All @@ -21,6 +22,8 @@
from baybe.targets.base import Target

_T = TypeVar("_T", bound=Parameter | Target)
_ArrayLike = TypeVar("_ArrayLike", np.ndarray, Tensor)


# Logging
_logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -604,7 +607,7 @@ def get_transform_objects(


def filter_df(
df: pd.DataFrame, filter: pd.DataFrame, complement: bool = False
df: pd.DataFrame, /, to_keep: pd.DataFrame, complement: bool = False
) -> pd.DataFrame:
"""Filter a dataframe based on a second dataframe defining filtering conditions.
Expand All @@ -613,9 +616,11 @@ def filter_df(
Args:
df: The dataframe to be filtered.
filter: The dataframe defining the filtering conditions.
to_keep: The dataframe defining the filtering conditions. By default
(see ``complement`` argument), it defines the rows to be kept in the sense
of an inner join.
complement: If ``False``, the filter dataframe determines the rows to be kept
(i.e. selection via regular join). If ``True``, the filtering mechanism is
(i.e. selection via inner join). If ``True``, the filtering mechanism is
inverted so that the complement set of rows is kept (i.e. selection
via anti-join).
Expand Down Expand Up @@ -643,13 +648,30 @@ def filter_df(
num cat
2 1 a
3 1 b
>>> filter_df(df, pd.DataFrame(), complement=True)
num cat
0 0 a
1 0 b
2 1 a
3 1 b
>>> filter_df(df, pd.DataFrame(), complement=False)
Empty DataFrame
Columns: [num, cat]
Index: []
"""
# Handle special case of empty filter
if to_keep.empty:
return df if complement else pd.DataFrame(columns=df.columns)

# Remember original index name
index_name = df.index.name

# Identify rows to be dropped
out = pd.merge(
df.reset_index(names="_df_index"), filter, how="left", indicator=True
df.reset_index(names="_df_index"), to_keep, how="left", indicator=True
).set_index("_df_index")
to_drop = out["_merge"] == ("both" if complement else "left_only")

Expand All @@ -661,3 +683,53 @@ def filter_df(
out.index.name = index_name

return out


def arrays_to_dataframes(
    input_labels: Sequence[str],
    output_labels: Sequence[str],
    /,
    use_torch: bool = False,
) -> Callable[
    [Callable[[_ArrayLike], _ArrayLike]], Callable[[pd.DataFrame], pd.DataFrame]
]:
    """Make a decorator for labeling the input/output columns of array-based callables.

    Useful for creating parameter-to-target lookups from array-based logic.

    The decorator transforms a callable designed to work with unlabelled arrays such
    that it can operate with dataframes instead. The original callable is expected to
    accept and return two-dimensional arrays. When decorated, the callable accepts and
    returns dataframes whose columns are mapped to the corresponding arrays based on the
    specified label sequences. The returned dataframe carries the index of the input
    dataframe.

    Args:
        input_labels: The sequence of labels for the input columns. The columns are
            extracted from the incoming dataframe in exactly this order.
        output_labels: The sequence of labels for the output columns.
        use_torch: If ``False``, the callable is called with a numpy array. If
            ``True``, it is called with a torch tensor inside a ``torch.no_grad()``
            context and the tensor it returns is converted back to a numpy array.

    Returns:
        The decorator for the given input and output labels.
    """

    def decorator(
        fn: Callable[[_ArrayLike], _ArrayLike], /
    ) -> Callable[[pd.DataFrame], pd.DataFrame]:
        """Turn an array-based callable into a dataframe-based callable."""

        @functools.wraps(fn)
        def wrapper(df: pd.DataFrame, /) -> pd.DataFrame:
            """Translate to/from an array-based callable using dataframes."""
            # Select (and order) the input columns according to the given labels
            array_in = df[list(input_labels)].to_numpy()

            if use_torch:
                # Imported lazily so that torch is only needed when requested
                import torch

                with torch.no_grad():
                    tensor_out = fn(torch.from_numpy(array_in))

                # `Tensor.numpy()` refuses tensors that require grad or that live
                # on a non-CPU device, hence the explicit detach/cpu round trip
                array_out = tensor_out.detach().cpu().numpy()
            else:
                array_out = fn(array_in)

            return pd.DataFrame(array_out, columns=list(output_labels), index=df.index)

        return wrapper

    return decorator
32 changes: 22 additions & 10 deletions benchmarks/domains/synthetic_2C1D_1C.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from numpy import pi, sin, sqrt
from pandas import DataFrame

from baybe.campaign import Campaign
from baybe.parameters import NumericalContinuousParameter, NumericalDiscreteParameter
from baybe.recommenders.pure.nonpredictive.sampling import RandomRecommender
from baybe.recommenders import RandomRecommender
from baybe.searchspace import SearchSpace
from baybe.simulation import simulate_scenarios
from baybe.targets import NumericalTarget, TargetMode
from baybe.targets import NumericalTarget
from benchmarks.definition import (
Benchmark,
ConvergenceExperimentSettings,
Expand All @@ -23,8 +24,9 @@
from mpl_toolkits.mplot3d import Axes3D


def _lookup(z: np.ndarray, x: np.ndarray, y: np.ndarray) -> np.ndarray:
"""Lookup that is used internally in the callable for the benchmark."""
def _lookup(arr: np.ndarray, /) -> np.ndarray:
"""Numpy-based lookup callable defining the objective function."""
x, y, z = np.array_split(arr, 3, axis=1)
try:
assert np.all(-2 * pi <= x) and np.all(x <= 2 * pi)
assert np.all(-2 * pi <= y) and np.all(y <= 2 * pi)
Expand All @@ -40,6 +42,13 @@ def _lookup(z: np.ndarray, x: np.ndarray, y: np.ndarray) -> np.ndarray:
)


def lookup(df: pd.DataFrame, /) -> pd.DataFrame:
    """Evaluate the numpy-based ``_lookup`` on a dataframe of parameter settings.

    The ``x``, ``y`` and ``z`` columns of the input are passed (in that order) as a
    single array to ``_lookup``. The result is returned as a dataframe with a single
    ``target`` column that shares the index of the input.
    """
    inputs = df[["x", "y", "z"]].to_numpy()
    values = _lookup(inputs)
    return pd.DataFrame(values, columns=["target"], index=df.index)


def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame:
"""Hybrid synthetic test function.
Expand All @@ -60,24 +69,25 @@ def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame:
NumericalDiscreteParameter("z", (1, 2, 3, 4)),
]

objective = NumericalTarget(name="target", mode=TargetMode.MAX).to_objective()
search_space = SearchSpace.from_product(parameters=parameters)
target = NumericalTarget(name="target", mode="MAX")
searchspace = SearchSpace.from_product(parameters=parameters)
objective = target.to_objective()

scenarios: dict[str, Campaign] = {
"Random Recommender": Campaign(
searchspace=search_space,
searchspace=searchspace,
recommender=RandomRecommender(),
objective=objective,
),
"Default Recommender": Campaign(
searchspace=search_space,
searchspace=searchspace,
objective=objective,
),
}

return simulate_scenarios(
scenarios,
_lookup,
lookup,
batch_size=settings.batch_size,
n_doe_iterations=settings.n_doe_iterations,
n_mc_iterations=settings.n_mc_iterations,
Expand Down Expand Up @@ -116,7 +126,9 @@ def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame:
fig = plt.figure(figsize=(10, 10))
for i, z in enumerate(Z):
ax: Axes3D = fig.add_subplot(2, 2, i + 1, projection="3d")
t_mesh = _lookup(np.asarray(z), x_mesh, y_mesh)
t_mesh = _lookup(
np.c_[x_mesh.ravel(), y_mesh.ravel(), np.repeat(z, x_mesh.size)]
).reshape(x_mesh.shape)
ax.plot_surface(x_mesh, y_mesh, t_mesh)
plt.title(f"{z=}")

Expand Down
Loading

0 comments on commit fd78099

Please sign in to comment.