diff --git a/.lockfiles/py310-dev.lock b/.lockfiles/py310-dev.lock
index 1d59d53b5..ff4431390 100644
--- a/.lockfiles/py310-dev.lock
+++ b/.lockfiles/py310-dev.lock
@@ -182,7 +182,9 @@ future==1.0.0
 gitdb==4.0.11
     # via gitpython
 gitpython==3.1.43
-    # via streamlit
+    # via
+    #   baybe (pyproject.toml)
+    #   streamlit
 googleapis-common-protos==1.63.2
     # via
     #   opentelemetry-exporter-otlp-proto-grpc
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f4c94b96d..03431427a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 - `allow_missing` and `allow_extra` keyword arguments to `Objective.transform`
 - Example for a traditional mixture
 - `add_noise_to_perturb_degenerate_rows` utility
+- `benchmarks` subpackage for defining and running performance tests
 
 ### Changed
 - `SubstanceParameter` encodings are now computed exclusively with the
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 6f67b86aa..0b1252070 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -28,3 +28,5 @@
   Bernoulli multi-armed bandit and Thompson sampling
 - Karin Hrovatin (Merck KGaA, Darmstadt, Germany):\
   `scikit-fingerprints` support
+- Fabian Liebig (Merck KGaA, Darmstadt, Germany):\
+  Benchmarking structure
diff --git a/README.md b/README.md
index 0f2529c7d..18f179d72 100644
--- a/README.md
+++ b/README.md
@@ -299,6 +299,7 @@ The available groups are:
 - `polars`: Required for optimized search space construction via [Polars](https://docs.pola.rs/)
 - `simulation`: Enabling the [simulation](https://emdgroup.github.io/baybe/stable/_autosummary/baybe.simulation.html) module.
 - `test`: Required for running the tests.
+- `benchmarking`: Required for running the benchmarking module.
 - `dev`: All of the above plus `tox` and `pip-audit`. For code contributors.
 
 ## 📡 Telemetry
diff --git a/baybe/serialization/core.py b/baybe/serialization/core.py
index 2947d4cd0..e3e53f052 100644
--- a/baybe/serialization/core.py
+++ b/baybe/serialization/core.py
@@ -3,6 +3,7 @@
 import base64
 import pickle
 from collections.abc import Callable
+from datetime import datetime, timedelta
 from typing import Any, TypeVar, get_type_hints
 
 import attrs
@@ -163,3 +164,9 @@ def select_constructor_hook(specs: dict, cls: type[_T]) -> _T:
 # Register custom un-/structure hooks
 converter.register_unstructure_hook(pd.DataFrame, _unstructure_dataframe_hook)
 converter.register_structure_hook(pd.DataFrame, _structure_dataframe_hook)
+converter.register_unstructure_hook(datetime, lambda x: x.isoformat())
+converter.register_structure_hook(datetime, lambda x, _: datetime.fromisoformat(x))
+converter.register_unstructure_hook(timedelta, lambda x: f"{x.total_seconds()}s")
+converter.register_structure_hook(
+    timedelta, lambda x, _: timedelta(seconds=float(x.removesuffix("s")))
+)
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 000000000..fb631ee0c
--- /dev/null
+++ b/benchmarks/__init__.py
@@ -0,0 +1,9 @@
+"""Benchmarking module for performance tracking."""
+
+from benchmarks.definition import Benchmark
+from benchmarks.result import Result
+
+__all__ = [
+    "Result",
+    "Benchmark",
+]
diff --git a/benchmarks/__main__.py b/benchmarks/__main__.py
new file mode 100644
index 000000000..11fe8aff6
--- /dev/null
+++ b/benchmarks/__main__.py
@@ -0,0 +1,16 @@
+"""Executes the benchmarking module."""
+# Run this via 'python -m benchmarks' from the root directory.
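+# Each entry in BENCHMARKS is a `Benchmark` object; calling it executes the wrapped
+# benchmark function under the configured random seed and returns a `Result`.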
+
+from benchmarks.domains import BENCHMARKS
+
+
+def main():
+    """Run all benchmarks."""
+    for benchmark in BENCHMARKS:
+        benchmark()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/definition/__init__.py b/benchmarks/definition/__init__.py
new file mode 100644
index 000000000..970a3dacb
--- /dev/null
+++ b/benchmarks/definition/__init__.py
@@ -0,0 +1,13 @@
+"""Benchmark task definitions."""
+
+from benchmarks.definition.config import (
+    Benchmark,
+    BenchmarkSettings,
+    ConvergenceExperimentSettings,
+)
+
+__all__ = [
+    "ConvergenceExperimentSettings",
+    "Benchmark",
+    "BenchmarkSettings",
+]
diff --git a/benchmarks/definition/config.py b/benchmarks/definition/config.py
new file mode 100644
index 000000000..e7bc75dd9
--- /dev/null
+++ b/benchmarks/definition/config.py
@@ -0,0 +1,90 @@
+"""Benchmark configurations."""
+
+import time
+from abc import ABC
+from collections.abc import Callable
+from datetime import datetime, timedelta, timezone
+from typing import Any, Generic, TypeVar
+
+from attrs import define, field
+from attrs.validators import instance_of
+from pandas import DataFrame
+
+from baybe.serialization.mixin import SerialMixin
+from baybe.utils.random import temporary_seed
+from benchmarks.result import Result, ResultMetadata
+
+
+@define(frozen=True)
+class BenchmarkSettings(SerialMixin, ABC):
+    """The base class for benchmark configurations."""
+
+    random_seed: int = field(validator=instance_of(int), kw_only=True, default=1337)
+    """The random seed for reproducibility."""
+
+
+BenchmarkSettingsType = TypeVar("BenchmarkSettingsType", bound=BenchmarkSettings)
+
+
+@define(frozen=True)
+class ConvergenceExperimentSettings(BenchmarkSettings):
+    """Benchmark configuration for recommender convergence analyses."""
+
+    batch_size: int = field(validator=instance_of(int))
+    """The recommendation batch size."""
+
+    n_doe_iterations: int = field(validator=instance_of(int))
+    """The number of Design of Experiments iterations."""
+
+    n_mc_iterations: int = field(validator=instance_of(int))
+    """The number of Monte Carlo iterations."""
+
+
+@define(frozen=True)
+class Benchmark(Generic[BenchmarkSettingsType]):
+    """The base class for a benchmark executable."""
+
+    settings: BenchmarkSettingsType = field()
+    """The benchmark configuration."""
+
+    function: Callable[[BenchmarkSettingsType], DataFrame] = field()
+    """The callable that contains the benchmarking logic."""
+
+    name: str = field(init=False)
+    """The name of the benchmark."""
+
+    best_possible_result: float | None = field(default=None)
+    """The best possible result that can be achieved in the optimization process."""
+
+    optimal_function_inputs: list[dict[str, Any]] | None = field(default=None)
+    """The input points that yield the best possible result."""
+
+    @property
+    def description(self) -> str:
+        """The description of the benchmark function."""
+        if self.function.__doc__ is not None:
+            return self.function.__doc__
+        return "No description available."
+
+    @name.default
+    def _default_name(self):
+        """Return the name of the benchmark function."""
+        return self.function.__name__
+
+    def __call__(self) -> Result:
+        """Execute the benchmark and return the result."""
+        start_datetime = datetime.now(timezone.utc)
+
+        with temporary_seed(self.settings.random_seed):
+            start_sec = time.perf_counter()
+            result = self.function(self.settings)
+            stop_sec = time.perf_counter()
+
+        duration = timedelta(seconds=stop_sec - start_sec)
+
+        metadata = ResultMetadata(
+            start_datetime=start_datetime,
+            duration=duration,
+        )
+
+        return Result(self.name, result, metadata)
diff --git a/benchmarks/domains/__init__.py b/benchmarks/domains/__init__.py
new file mode 100644
index 000000000..4a0e956a8
--- /dev/null
+++ b/benchmarks/domains/__init__.py
@@ -0,0 +1,10 @@
+"""Benchmark domains."""
+
+from benchmarks.definition.config import Benchmark
+from benchmarks.domains.synthetic_2C1D_1C import synthetic_2C1D_1C_benchmark
+
+BENCHMARKS: list[Benchmark] = [
+    synthetic_2C1D_1C_benchmark,
+]
+
+__all__ = ["BENCHMARKS"]
diff --git a/benchmarks/domains/synthetic_2C1D_1C.py b/benchmarks/domains/synthetic_2C1D_1C.py
new file mode 100644
index 000000000..abb8ab176
--- /dev/null
+++ b/benchmarks/domains/synthetic_2C1D_1C.py
@@ -0,0 +1,123 @@
+"""Synthetic function with two continuous and one discrete input."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+from numpy import pi, sin, sqrt
+from pandas import DataFrame
+
+from baybe.campaign import Campaign
+from baybe.parameters import NumericalContinuousParameter, NumericalDiscreteParameter
+from baybe.recommenders.pure.nonpredictive.sampling import RandomRecommender
+from baybe.searchspace import SearchSpace
+from baybe.simulation import simulate_scenarios
+from baybe.targets import NumericalTarget, TargetMode
+from benchmarks.definition import (
+    Benchmark,
+    ConvergenceExperimentSettings,
+)
+
+if TYPE_CHECKING:
+    from mpl_toolkits.mplot3d import Axes3D
+
+
+def _lookup(z: np.ndarray, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    """Look up the synthetic objective values for the given benchmark inputs."""
+    try:
+        assert np.all(-2 * pi <= x) and np.all(x <= 2 * pi)
+        assert np.all(-2 * pi <= y) and np.all(y <= 2 * pi)
+        assert np.all(np.isin(z, [1, 2, 3, 4]))
+    except AssertionError:
+        raise ValueError("Inputs are not in the valid ranges.")
+
+    return (
+        (z == 1) * sin(x) * (1 + sin(y))
+        + (z == 2) * (x * sin(0.9 * x) + sin(x) * sin(y))
+        + (z == 3) * (sqrt(x + 8) * sin(x) + sin(x) * sin(y))
+        + (z == 4) * (x * sin(1.666 * sqrt(x + 8)) + sin(x) * sin(y))
+    )
+
+
+def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame:
+    """Hybrid synthetic test function.
+
+    Inputs:
+        z   discrete   {1,2,3,4}
+        x   continuous [-2*pi, 2*pi]
+        y   continuous [-2*pi, 2*pi]
+    Output: continuous
+    Objective: Maximization
+    Optimal Inputs:
+        {x: 1.610, y: 1.571, z: 3}
+        {x: 1.610, y: -4.712, z: 3}
+    Optimal Output: 4.09685
+    """
+    parameters = [
+        NumericalContinuousParameter("x", (-2 * pi, 2 * pi)),
+        NumericalContinuousParameter("y", (-2 * pi, 2 * pi)),
+        NumericalDiscreteParameter("z", (1, 2, 3, 4)),
+    ]
+
+    objective = NumericalTarget(name="target", mode=TargetMode.MAX).to_objective()
+    search_space = SearchSpace.from_product(parameters=parameters)
+
+    scenarios: dict[str, Campaign] = {
+        "Random Recommender": Campaign(
+            searchspace=search_space,
+            recommender=RandomRecommender(),
+            objective=objective,
+        ),
+        "Default Recommender": Campaign(
+            searchspace=search_space,
+            objective=objective,
+        ),
+    }
+
+    return simulate_scenarios(
+        scenarios,
+        _lookup,
+        batch_size=settings.batch_size,
+        n_doe_iterations=settings.n_doe_iterations,
+        n_mc_iterations=settings.n_mc_iterations,
+        impute_mode="error",
+    )
+
+
+benchmark_config = ConvergenceExperimentSettings(
+    batch_size=5,
+    n_doe_iterations=30,
+    n_mc_iterations=50,
+)
+
+synthetic_2C1D_1C_benchmark = Benchmark(
+    function=synthetic_2C1D_1C,
+    best_possible_result=4.09685,
+    settings=benchmark_config,
+    optimal_function_inputs=[
+        {"x": 1.610, "y": 1.571, "z": 3},
+        {"x": 1.610, "y": -4.712, "z": 3},
+    ],
+)
+
+
+if __name__ == "__main__":
+    # Visualize the domain
+
+    import matplotlib.pyplot as plt
+
+    X = np.linspace(-2 * pi, 2 * pi)
+    Y = np.linspace(-2 * pi, 2 * pi)
+    Z = [1, 2, 3, 4]
+
+    x_mesh, y_mesh = np.meshgrid(X, Y)
+
+    fig = plt.figure(figsize=(10, 10))
+    for i, z in enumerate(Z):
+        ax: Axes3D = fig.add_subplot(2, 2, i + 1, projection="3d")
+        t_mesh = _lookup(np.asarray(z), x_mesh, y_mesh)
+        ax.plot_surface(x_mesh, y_mesh, t_mesh)
+        plt.title(f"{z=}")
+
+    plt.show()
diff --git a/benchmarks/result/__init__.py b/benchmarks/result/__init__.py
new file mode 100644
index 000000000..fb4380da7
--- /dev/null
+++ b/benchmarks/result/__init__.py
@@ -0,0 +1,6 @@
+"""Benchmark results."""
+
+from benchmarks.result.metadata import ResultMetadata
+from benchmarks.result.result import Result
+
+__all__ = ["Result", "ResultMetadata"]
diff --git a/benchmarks/result/metadata.py b/benchmarks/result/metadata.py
new file mode 100644
index 000000000..44bfdc24b
--- /dev/null
+++ b/benchmarks/result/metadata.py
@@ -0,0 +1,51 @@
+"""Benchmark result metadata."""
+
+from datetime import datetime, timedelta
+
+import git
+from attrs import define, field
+from attrs.validators import instance_of
+from cattrs.gen import make_dict_unstructure_fn
+
+from baybe.serialization.core import converter
+from baybe.serialization.mixin import SerialMixin
+
+
+@define(frozen=True)
+class ResultMetadata(SerialMixin):
+    """The metadata of a benchmark result."""
+
+    start_datetime: datetime = field(validator=instance_of(datetime))
+    """The start datetime of the benchmark."""
+
+    duration: timedelta = field(validator=instance_of(timedelta))
+    """The time it took to complete the benchmark."""
+
+    commit_hash: str = field(validator=instance_of(str), init=False)
+    """The commit hash of the used BayBE code."""
+
+    latest_baybe_tag: str = field(validator=instance_of(str), init=False)
+    """The latest BayBE tag reachable in the ancestor commit history."""
+
+    @commit_hash.default
+    def _default_commit_hash(self) -> str:
+        """Extract the git commit hash."""
+        repo = git.Repo(search_parent_directories=True)
+        sha = repo.head.object.hexsha
+        return sha
+
+    @latest_baybe_tag.default
+    def _default_latest_baybe_tag(self) -> str:
+        """Extract the latest reachable BayBE tag."""
+        repo = git.Repo(search_parent_directories=True)
+        latest_tag = repo.git.describe(tags=True, abbrev=0)
+        return latest_tag
+
+
+# Register un-/structure hooks
+converter.register_unstructure_hook(
+    ResultMetadata,
+    make_dict_unstructure_fn(
+        ResultMetadata, converter, _cattrs_include_init_false=True
+    ),
+)
diff --git a/benchmarks/result/result.py b/benchmarks/result/result.py
new file mode 100644
index 000000000..8fbaa0995
--- /dev/null
+++ b/benchmarks/result/result.py
@@ -0,0 +1,24 @@
+"""Basic result classes for benchmarking."""
+
+from __future__ import annotations
+
+from attrs import define, field
+from attrs.validators import instance_of
+from pandas import DataFrame
+
+from baybe.serialization.mixin import SerialMixin
+from benchmarks.result import ResultMetadata
+
+
+@define(frozen=True)
+class Result(SerialMixin):
+    """A single benchmarking result."""
+
+    benchmark_identifier: str = field(validator=instance_of(str))
+    """The identifier of the benchmark that produced the result."""
+
+    data: DataFrame = field(validator=instance_of(DataFrame))
+    """The result of the benchmarked callable."""
+
+    metadata: ResultMetadata = field(validator=instance_of(ResultMetadata))
+    """The metadata associated with the benchmark result."""
diff --git a/mypy.ini b/mypy.ini
index 2eb5718bf..4521667dd 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,5 +1,5 @@
 [mypy]
-packages = baybe
+packages = baybe,benchmarks
 
 ; Avoid false positives for `type[P]` when `P` is abstract.
 ; * https://svcs.hynek.me/en/stable/typing-caveats.html#abstract-classes-and-pep-544
@@ -23,6 +23,9 @@ ignore_missing_imports = True
 [mypy-gpytorch.*]
 ignore_missing_imports = True
 
+[mypy-git.*]
+ignore_missing_imports = True
+
 [mypy-joblib.*]
 ignore_missing_imports = True
 
diff --git a/pyproject.toml b/pyproject.toml
index 69298cc60..fc28a80d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,6 +87,7 @@ dev = [
     "baybe[polars]",
     "baybe[simulation]",
     "baybe[test]",
+    "baybe[benchmarking]",
     "pip-audit>=2.5.5",
     "tox-uv>=1.7.0",
     "uv>=0.3.0",  # `uv lock` (for lockfiles) is stable since 0.3.0: https://github.com/astral-sh/uv/issues/2679#event-13950215962
@@ -140,6 +141,13 @@ simulation = [
     "xyzpy>=1.2.1",
 ]
 
+benchmarking = [
+    "baybe[chem]",
+    "baybe[onnx]",
+    "baybe[simulation]",
+    "GitPython>=3.0.6",  # GitPython>=3.0.6 is necessary since older versions rely on a pinned GitDB release: https://github.com/gitpython-developers/GitPython/issues/983
+]
+
 test = [
     "hypothesis[pandas]>=6.88.4",
     "tenacity>=8.5.0",
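
For reference, a minimal usage sketch of the new `benchmarks` subpackage (not part of the diff above). It assumes the package layout introduced in this change set and BayBE's existing `SerialMixin.to_json` API; running it executes the full convergence simulation, which can take a while:

from benchmarks.domains.synthetic_2C1D_1C import synthetic_2C1D_1C_benchmark

# Executes both campaigns under the configured random seed (default 1337) and
# wraps the simulation DataFrame together with timing and git metadata.
result = synthetic_2C1D_1C_benchmark()

print(result.benchmark_identifier)  # "synthetic_2C1D_1C", taken from the function name
print(result.metadata.duration)     # wall-clock runtime as a timedelta

# Serialization relies on the datetime/timedelta hooks registered in
# baybe/serialization/core.py by this change set.
serialized = result.to_json()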