Skip to content

Commit

Permalink
Merge: Add Explanation of some Utilities to User Guide (#392)
Browse files Browse the repository at this point in the history
Adds explanations for these utilities:
- estimate discrete space memory size
- control reproducibility via random seeds
- adding fake targets and parameter noise

Note: This required some adjustment of the code execution test. It seems
that list comprehensions called via `exec` have some weird scope,
causing `NameError` for `NumericalDiscreteParameter` even though it was
imported in the code block. So I am now providing a custom dict as
globals and locals arg to `exec` which seems to unify the scopes. As far
as I know this should still maintain isolation between separately tested
code blocks.
  • Loading branch information
Scienfitz authored Oct 11, 2024
2 parents 573664b + 4493a34 commit 9dcf27e
Show file tree
Hide file tree
Showing 22 changed files with 212 additions and 52 deletions.
10 changes: 7 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- `n_restarts` and `n_raw_samples` keywords to configure continuous optimization
behavior for `BotorchRecommender`
- User guide for utilities

### Changed
- Utility `add_fake_results` renamed to `add_fake_measurements`
- Utilities `add_fake_measurements` and `add_parameter_noise` now also return the
dataframe they modified in-place

### Fixed
- Leftover attrs-decorated classes are garbage collected before the subclass tree is
traversed, avoiding sporadic serialization problems
traversed, avoiding sporadic serialization problems

## [0.11.1] - 2024-10-01
### Added
Expand All @@ -24,8 +30,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed
- Unsafe name-based matching of columns in `get_comp_rep_parameter_indices`
- Leftover attrs-decorated classes are garbage collected before the subclass tree is
traversed, avoiding sporadic serialization problems

### Deprecations
- `ContinuousLinearEqualityConstraint` and `ContinuousLinearInequalityConstraint`
Expand Down
4 changes: 2 additions & 2 deletions baybe/simulation/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from baybe.simulation._imputation import _impute_lookup
from baybe.targets.base import Target
from baybe.utils.dataframe import add_fake_results
from baybe.utils.dataframe import add_fake_measurements

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -68,7 +68,7 @@ def look_up_targets(
2 3 15.0
"""
if lookup is None:
add_fake_results(queries, targets)
add_fake_measurements(queries, targets)
elif isinstance(lookup, Callable):
_look_up_targets_from_callable(queries, targets, lookup)
elif isinstance(lookup, pd.DataFrame):
Expand Down
24 changes: 17 additions & 7 deletions baybe/utils/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,19 +66,19 @@ def to_tensor(*x: np.ndarray | pd.DataFrame) -> Tensor | tuple[Tensor, ...]:
return out


def add_fake_results(
def add_fake_measurements(
data: pd.DataFrame,
targets: Collection[Target],
good_reference_values: dict[str, list] | None = None,
good_intervals: dict[str, tuple[float, float]] | None = None,
bad_intervals: dict[str, tuple[float, float]] | None = None,
) -> None:
"""Add fake results to a dataframe which was the result of a BayBE recommendation.
) -> pd.DataFrame:
"""Add fake measurements to a dataframe which was the result of a recommendation.
It is possible to specify "good" values, which will be given a better
target value. With this, the algorithm can be driven towards certain optimal values
whilst still being random. Useful for testing. Note that this does not return a
new dataframe and that the dataframe is changed in-place.
whilst still being random. Useful for testing. Note that the dataframe is changed
in-place and also returned.
Args:
data: A dataframe containing parameter configurations in experimental
Expand All @@ -99,6 +99,9 @@ def add_fake_results(
the parameters lie outside the conditions specified through
``good_reference_values``.
Returns:
The modified dataframe.
Raises:
ValueError: If good values for a parameter were specified, but this parameter
is not part of the dataframe.
Expand Down Expand Up @@ -216,19 +219,21 @@ def add_fake_results(
final_mask.sum(),
)

return data


def add_parameter_noise(
data: pd.DataFrame,
parameters: Iterable[Parameter],
noise_type: Literal["absolute", "relative_percent"] = "absolute",
noise_level: float = 1.0,
) -> None:
) -> pd.DataFrame:
"""Apply uniform noise to the parameter values of a recommendation frame.
The noise can be additive or multiplicative.
This can be used to simulate experimental noise or imperfect user input containing
numerical parameter values that differ from the recommendations. Note that the
dataframe is modified in-place, and that no new dataframe is returned.
dataframe is changed in-place and also returned.
Args:
data: Output of the ``recommend`` function of a ``Campaign`` object, see
Expand All @@ -239,6 +244,9 @@ def add_parameter_noise(
for noise type ``absolute`` and as percentage for noise type
``relative_percent``.
Returns:
The modified dataframe.
Raises:
ValueError: If ``noise_type`` is neither ``absolute`` nor
``relative_percent``.
Expand All @@ -265,6 +273,8 @@ def add_parameter_noise(
param.bounds.lower, param.bounds.upper
)

return data


def df_drop_single_value_columns(
df: pd.DataFrame, lst_exclude: list = None
Expand Down
6 changes: 4 additions & 2 deletions docs/userguide/async.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,17 @@ Akin to `measurements` or `recommendations`, `pending_experiments` is a datafram
In the following example, we get a set of recommendations, add results for half of them,
and start the next recommendation, marking the other half pending:
```python
from baybe.utils.dataframe import add_fake_measurements

# Get a set of 10 recommendations
rec = campaign.recommend(batch_size=10)

# Split recommendations into two parts
rec_finished = rec.iloc[:5]
rec_pending = rec.iloc[5:]

# Add target measurements to the finished part. Here we add a random number
rec_finished["Target_max"] = 1337
# Add target measurements to the finished part. Here we add fake results
add_fake_measurements(rec_finished, campaign.targets)
campaign.add_measurements(rec_finished)

# Get the next set of recommendations, incorporating the still unfinished experiments.
Expand Down
1 change: 1 addition & 0 deletions docs/userguide/userguide.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ Simulation <simulation>
Surrogates <surrogates>
Targets <targets>
Transfer Learning <transfer_learning>
Utilities <utils>
```
135 changes: 135 additions & 0 deletions docs/userguide/utils.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Utilities

BayBE comes with a set of useful functions that can make your life easier in certain
scenarios.

## Search Space Memory Estimation
In search spaces that have discrete parts, the memory needed to store the respective
data can become excessively large as the number of points grows with the amount of
possible combinations arising from all discrete parameter values.

The [`SearchSpace.estimate_product_space_size`](baybe.searchspace.core.SearchSpace.estimate_product_space_size)
and [`SubspaceDiscrete.estimate_product_space_size`](baybe.searchspace.discrete.SubspaceDiscrete.estimate_product_space_size)
utilities allow estimating the memory needed to represent the discrete subspace.
They return a [`MemorySize`](baybe.searchspace.discrete.MemorySize) object that
contains some relevant estimates:

```python
import numpy as np

from baybe.parameters import NumericalDiscreteParameter
from baybe.searchspace import SearchSpace

# This creates 10 parameters with 20 values each.
# The resulting space would have 20^10 entries, requiring around 745 TB of memory for
# both experimental and computational representation of the search space.
parameters = [
NumericalDiscreteParameter(name=f"p{k+1}", values=np.linspace(0, 100, 20))
for k in range(10)
]

# Estimate the required memory for such a space
mem_estimate = SearchSpace.estimate_product_space_size(parameters)

# Print quantities of interest
print("Experimental Representation")
print(f"Estimated size: {mem_estimate.exp_rep_human_readable}")
print(f"Estimated size in Bytes: {mem_estimate.exp_rep_bytes}")
print(f"Expected data frame shape: {mem_estimate.exp_rep_shape}")

print("Computational Representation")
print(f"Estimated size: {mem_estimate.comp_rep_human_readable}")
print(f"Estimated size in Bytes: {mem_estimate.comp_rep_bytes}")
print(f"Expected data frame shape: {mem_estimate.comp_rep_shape}")
```

```{admonition} Estimation with Constraints
:class: warning
{meth}`~baybe.searchspace.core.SearchSpace.estimate_product_space_size`
currently does not include the influence of potential constraints in your search space
as it is generally very hard to incorporate the effect of arbitrary constraints without
actually building the entire space. Hence, you should always **treat the number you get
as upper bound** of required memory. This can still be useful – for instance if your
estimate already is several Exabytes, it is unlikely that most computers would be able
to handle the result even if there are constraints present.
```

```{admonition} Memory During Optimization
:class: warning
{meth}`~baybe.searchspace.core.SearchSpace.estimate_product_space_size`
only estimates the memory required to handle the search space. **It does not estimate
the memory required during optimization**, which can be of a similar magnitude, but
generally depends on additional factors.
```

```{admonition} Influence of Continuous Parameters
:class: info
Continuous parameters do not influence the size of the discrete search space part.
Hence, they are ignored by the utility.
```

```{admonition} Efficient Search Space Creation
:class: tip
If you run into issues creating large search spaces, as for instance in mixture
use cases, you should consider resorting to more specialized ways of creation by
invoking alternative search space constructors like
{meth}`~baybe.searchspace.discrete.SubspaceDiscrete.from_dataframe`
or
{meth}`~baybe.searchspace.discrete.SubspaceDiscrete.from_simplex`.
Instead of creating a product space first and then filtering it down
according to constraints, they offer a more direct and thus efficient path to the
desired result, typically requiring substantially less memory.
For example, {meth}`~baybe.searchspace.discrete.SubspaceDiscrete.from_simplex`
includes the mixture constraint already *during* the product creation.
In addition, BayBE can also be installed with its optional `polars` dependency
(`pip install baybe[polars]`) that activates efficient machinery for constraint handling.
```

## Reproducibility
In some scenarios, for instance when testing your code setup, it can be useful to fix
the random seeds for all relevant engines to generate reproducible results. BayBE offers
the [`set_random_seed`](baybe.utils.random.set_random_seed) utility for this purpose:

```python
from baybe.utils.random import set_random_seed

# Set the global random seed for all relevant engines
set_random_seed(1337)

# Assuming we have a prepared campaign
campaign.recommend(5)
```

Setting the global random seed can be undesirable if there are other packages in your
setup. For this, BayBE offers [`temporary_seed`](baybe.utils.random.temporary_seed):

```python
from baybe.utils.random import temporary_seed

# Set the random seed for all relevant engines temporarily within the context
with temporary_seed(1337):
campaign.recommend(5)
```

## Adding Fake Target Measurements and Parameter Noise
When creating test scripts, it is often useful to try the recommendation loop for a few
iterations. However, this requires some arbitrary target measurements to be set. Instead
of coming up with a custom logic every time, you can use the
[`add_fake_measurements`](baybe.utils.dataframe.add_fake_measurements) utility to add fake target
measurements and the [`add_parameter_noise`](baybe.utils.dataframe.add_parameter_noise)
utility to add artificial parameter noise:

```python
from baybe.utils.dataframe import add_fake_measurements, add_parameter_noise

# Get recommendations
recommendations = campaign.recommend(5)

# Add fake target measurements and artificial parameter noise to the recommendations.
# The utilities modify the dataframes inplace.
measurements = recommendations.copy()
add_fake_measurements(measurements, campaign.targets)
add_parameter_noise(measurements, campaign.parameters)

# Now continue the loop, e.g. by adding the measurements...
```
6 changes: 3 additions & 3 deletions examples/Basics/campaign.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from baybe.parameters import NumericalDiscreteParameter, SubstanceParameter
from baybe.searchspace import SearchSpace
from baybe.targets import NumericalTarget
from baybe.utils.dataframe import add_fake_results
from baybe.utils.dataframe import add_fake_measurements

### Setup

Expand Down Expand Up @@ -82,10 +82,10 @@

# Adding target values is done by creating a new column in the `recommendation`
# dataframe named after the target.
# In this example, we use the `add_fake_results()` utility to create fake results.
# In this example, we use the `add_fake_measurements()` utility to create fake results.
# We then update the campaign by adding the measurements.

add_fake_results(recommendation, campaign.targets)
add_fake_measurements(recommendation, campaign.targets)
print("\n\nRecommended experiments with fake measured values: ")
print(recommendation)

Expand Down
4 changes: 2 additions & 2 deletions examples/Basics/recommenders.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from baybe.surrogates.base import Surrogate
from baybe.targets import NumericalTarget
from baybe.utils.basic import get_subclasses
from baybe.utils.dataframe import add_fake_results
from baybe.utils.dataframe import add_fake_measurements

### Available recommenders suitable for initial recommendation

Expand Down Expand Up @@ -179,7 +179,7 @@
print("\n\nRecommended experiments: ")
print(recommendation)

add_fake_results(recommendation, campaign.targets)
add_fake_measurements(recommendation, campaign.targets)
print("\n\nRecommended experiments with fake measured values: ")
print(recommendation)

Expand Down
4 changes: 2 additions & 2 deletions examples/Constraints_Discrete/custom_constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
)
from baybe.searchspace import SearchSpace
from baybe.targets import NumericalTarget
from baybe.utils.dataframe import add_fake_results
from baybe.utils.dataframe import add_fake_measurements

### Experiment setup

Expand Down Expand Up @@ -156,5 +156,5 @@ def custom_function(df: pd.DataFrame) -> pd.Series:
)

rec = campaign.recommend(batch_size=5)
add_fake_results(rec, campaign.targets)
add_fake_measurements(rec, campaign.targets)
campaign.add_measurements(rec)
4 changes: 2 additions & 2 deletions examples/Constraints_Discrete/dependency_constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
)
from baybe.searchspace import SearchSpace
from baybe.targets import NumericalTarget
from baybe.utils.dataframe import add_fake_results
from baybe.utils.dataframe import add_fake_measurements

### Experiment setup

Expand Down Expand Up @@ -113,5 +113,5 @@
)

rec = campaign.recommend(batch_size=5)
add_fake_results(rec, campaign.targets)
add_fake_measurements(rec, campaign.targets)
campaign.add_measurements(rec)
4 changes: 2 additions & 2 deletions examples/Constraints_Discrete/exclusion_constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
)
from baybe.searchspace import SearchSpace
from baybe.targets import NumericalTarget
from baybe.utils.dataframe import add_fake_results
from baybe.utils.dataframe import add_fake_measurements

### Experiment setup

Expand Down Expand Up @@ -144,5 +144,5 @@
)

rec = campaign.recommend(batch_size=5)
add_fake_results(rec, campaign.targets)
add_fake_measurements(rec, campaign.targets)
campaign.add_measurements(rec)
4 changes: 2 additions & 2 deletions examples/Constraints_Discrete/mixture_constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from baybe.parameters import NumericalDiscreteParameter, SubstanceParameter
from baybe.searchspace import SearchSpace
from baybe.targets import NumericalTarget
from baybe.utils.dataframe import add_fake_results
from baybe.utils.dataframe import add_fake_measurements

### Experiment setup

Expand Down Expand Up @@ -175,5 +175,5 @@
)

rec = campaign.recommend(batch_size=5)
add_fake_results(rec, campaign.targets)
add_fake_measurements(rec, campaign.targets)
campaign.add_measurements(rec)
Loading

0 comments on commit 9dcf27e

Please sign in to comment.