diff --git a/environment.yml b/environment.yml index a0dbdf5ec49..45898f4fd5b 100644 --- a/environment.yml +++ b/environment.yml @@ -65,3 +65,4 @@ dependencies: - lazy_loader - defusedxml - python-neo + - formulaic diff --git a/mne/datasets/config.py b/mne/datasets/config.py index ccd4babacd9..b09fb87eedf 100644 --- a/mne/datasets/config.py +++ b/mne/datasets/config.py @@ -129,7 +129,7 @@ ) MNE_DATASETS["misc"] = dict( archive_name=f"{MISC_VERSIONED}.tar.gz", # 'mne-misc-data', - hash="md5:e343d3a00cb49f8a2f719d14f4758afe", + hash="md5:201d35531d3c03701cf50e38bb73481f", url=( "https://codeload.github.com/mne-tools/mne-misc-data/tar.gz/" f'{RELEASES["misc"]}' diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 50743c104ef..8f24c0c4a0c 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -4,20 +4,31 @@ # License: BSD-3-Clause # Copyright the MNE-Python contributors. +from __future__ import annotations + +from typing import Literal + +import matplotlib.pyplot as plt import numpy as np +from mpl_toolkits.axes_grid1 import make_axes_locatable from scipy import ndimage, sparse from scipy.sparse.csgraph import connected_components from scipy.stats import f as fstat from scipy.stats import t as tstat +from ..epochs import BaseEpochs, EvokedArray +from ..evoked import Evoked from ..fixes import has_numba, jit from ..parallel import parallel_func from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate from ..source_space import SourceSpaces +from ..time_frequency import BaseTFR from ..utils import ( + GetEpochsMixin, ProgressBar, _check_option, _pl, + _soft_import, _validate_type, check_random_state, logger, @@ -25,8 +36,13 @@ verbose, warn, ) +from ..viz import plot_compare_evokeds from .parametric import f_oneway, ttest_1samp_no_p +# need this at top-level of file due to type hints +pd = _soft_import("pandas", purpose="DataFrame integration", strict=False) +DataFrame = getattr(pd, "DataFrame", None) + def _get_buddies_fallback(r, s, neighbors, indices=None): if indices is None: @@ -931,7 +947,7 @@ def _permutation_cluster_test( sample_shape = X[0].shape[1:] for x in X: if x.shape[1:] != sample_shape: - raise ValueError("All samples mush have the same size") + raise ValueError("All samples must have the same size") # flatten the last dimensions in case the data is high dimensional X = [np.reshape(x, (x.shape[0], -1)) for x in X] @@ -1723,3 +1739,362 @@ def summarize_clusters_stc( data_summary[:, 0] = np.sum(data_summary, axis=1) return klass(data_summary, vertices, tmin, tstep, subject) + + +def _validate_cluster_df(df: DataFrame, dv_name: str, iv_name: str): + """Validate the input DataFrame for cluster tests.""" + # check if all necessary columns are present + missing = ({dv_name} | {iv_name}) - set(df.columns) # should be empty + sep = '", "' + if missing: # if not empty, there are missing columns + raise ValueError( + f"DataFrame must contain a column named for each term in `formula`. " + f"Column{_pl(missing)} missing for term{_pl(missing)} " # _pl = pluralize + f'"{sep.join(missing)}".' 
+        )
+    # check if the data column contains valid (and consistent) instance types
+    inst = df[dv_name].iloc[0]
+    valid_types = (
+        Evoked,
+        BaseEpochs,
+        BaseTFR,
+        np.ndarray,
+    )  # Base covers all Epochs and TFRs
+    _validate_type(inst, valid_types, f"Data in dependent variable column '{dv_name}'")
+    all_types = set(df[dv_name].map(type))
+    all_type_names = ", ".join(t.__name__ for t in all_types)
+    prologue = f"Data in dependent variable column '{dv_name}' must all have "
+    if len(all_types) > 1:
+        raise ValueError(
+            f"{prologue}the same type, but found types {{{all_type_names}}}."
+        )
+    # check if the shape of the data is consistent
+    if isinstance(inst, np.ndarray):
+        all_shapes = set(
+            df[dv_name].map(lambda x: x.shape[1:])
+        )  # first dim may vary (participants or epochs)
+    elif isinstance(inst, (BaseEpochs, BaseTFR)):
+        all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:]))
+    else:
+        all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape))
+    if len(all_shapes) > 1:
+        raise ValueError(
+            f"{prologue}consistent shape, but {len(all_shapes)} different "
+            f"shapes were found: {'; '.join(str(shape) for shape in all_shapes)}."
+        )
+    obj_type = all_types.pop()
+    is_epo = GetEpochsMixin in obj_type.__mro__
+    is_tfr = BaseTFR in obj_type.__mro__
+    is_arr = np.ndarray in obj_type.__mro__
+    return is_epo, is_tfr, is_arr
+
+
+@verbose
+def cluster_test(
+    df: DataFrame,
+    formula: str,
+    *,  # all parameters below are keyword-only
+    within_id: str | None = None,
+    stat_fun: callable | None = None,
+    tail: Literal[-1, 0, 1] = 0,
+    threshold=None,
+    n_permutations: str | int = 1024,
+    adjacency: sparse.spmatrix | None | Literal[False] = None,
+    max_step: int = 1,  # TODO may need to provide `max_step_time` and `max_step_freq`
+    exclude: list | None = None,  # TODO needs rethink because user passes MNE objects
+    step_down_p: float = 0.0,
+    t_power: float = 1.0,
+    check_disjoint: bool = False,
+    out_type: Literal["indices", "mask"] = "indices",
+    seed: None | int | np.random.RandomState = None,
+    buffer_size: int | None = None,
+    n_jobs: int = 1,
+    verbose=None,
+):
+    """Run a cluster permutation test from a DataFrame and a formula.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing one column each for the dependent and independent
+        variables, with one observation per row.
+    formula : str
+        Wilkinson-notation formula for the design matrix. The names of the dependent
+        and independent variables must match column names in ``df``.
+    within_id : None | str
+        Name of column in ``df`` to use in identifying within-group contrasts. If
+        ``None``, will perform a between-group test. Ignored if the number of groups
+        (unique values in the independent variable column of ``df``) is greater than 2.
+    %(stat_fun_clust_both)s
+    %(tail_clust)s
+    %(threshold_clust_both)s
+    %(n_permutations_clust_all)s
+    %(adjacency_clust_both)s
+    max_step : int
+        Maximum distance between samples (time points). Default is 1.
+    exclude : array-like of bool | None
+        Mask to apply to the data to exclude certain points from clustering
+        (e.g., medial wall vertices). Should be the same shape as the channels/vertices
+        dimension of the data objects. If ``None``, no points are excluded.
+    %(step_down_p_clust)s
+    %(t_power_clust)s
+    check_disjoint : bool
+        Whether to check if the ``adjacency`` matrix can be separated into disjoint
+        sets before clustering. This may lead to faster clustering, especially if
+        the "time" and/or "frequency" dimensions are large.
+    %(out_type_clust)s
+    %(seed)s
+    buffer_size : int | None
+        Block size to use when computing test statistics. This can significantly
+        reduce memory usage when ``n_jobs > 1`` and memory sharing between
+        processes is enabled (see :func:`mne.set_cache_dir`), because the data will be
+        shared between processes and each process only needs to allocate space for
+        a small block of locations at a time.
+    %(n_jobs)s
+    %(verbose)s
+
+    Returns
+    -------
+    ClusterResult
+        Object containing the results of the cluster permutation test.
+
+    Notes
+    -----
+    %(threshold_clust_t_or_f_notes)s
+    """
+    # parse formula
+    formulaic = _soft_import("formulaic", purpose="parse formula for clustering")
+    parser = formulaic.parser.DefaultFormulaParser(include_intercept=False)
+    formula = formulaic.Formula(formula, _parser=parser)
+    # extract the dependent and independent variable names
+    dv_name = str(np.array(formula.lhs.root).item())
+    iv_name = str(np.array(formula.rhs.root).item())
+
+    # validate the input dataframe and return the type of the data column entries
+    is_epo, is_tfr, is_arr = _validate_cluster_df(df, dv_name, iv_name)
+
+    # for within-subject designs, check that each subject has 2 observations
+    _validate_type(within_id, (str, None), "within_id")
+    if within_id:
+        df = df.copy(deep=False)  # Don't mutate input dataframe row order!
+        df.sort_values([iv_name, within_id], inplace=True)
+        counts = df[within_id].value_counts()
+        if any(counts != 2):
+            raise ValueError(
+                "For a paired test, each subject must have exactly 2 observations."
+            )
+
+    # extract the data from the dataframe
+    outer_func = np.concatenate if is_epo else np.array
+    axes = (-3, -1) if is_tfr else (-2, -1)
+
+    def func_arr(series):
+        return np.concatenate(series.values)
+
+    def func_mne(series):
+        return outer_func(
+            series.map(lambda inst: inst.get_data().swapaxes(*axes)).to_list()
+        )
+
+    func = func_arr if is_arr else func_mne
+
+    # convert to a list-like X for clustering
+    X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list()
+
+    # determine test type
+    if len(X) == 1:
+        kind = "within"  # data already subtracted
+    elif len(X) > 2:
+        kind = "between"
+    elif len({x.shape for x in X}) > 1:
+        # unequal numbers of observations in each group → unpaired test
+        kind = "between"
+    # by now we know there are exactly 2 elements in X, and their shapes match
+    elif within_id in df:
+        kind = "within"
+        X = X[0] - X[1]
+    else:  # 2 elements in X but no within_id provided → unpaired test
+        kind = "between"
+
+    # define stat function and threshold
+    stat_fun, threshold = _check_fun(
+        X=X, stat_fun=stat_fun, threshold=threshold, tail=tail, kind=kind
+    )
+
+    # _check_fun doesn't work with list input
+    if kind == "within":  # TODO will this create an issue for already subtracted data?
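+        # `_permutation_cluster_test` iterates over a sequence of arrays; for a
+        # paired (within) design, X is at this point a single array of a-minus-b
+        # differences, so wrap it in a list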
+        X = [X]
+
+    # Run the cluster-based permutation test
+    stat_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test(
+        X,
+        n_permutations=n_permutations,
+        threshold=threshold,
+        stat_fun=stat_fun,
+        tail=tail,
+        n_jobs=n_jobs,
+        adjacency=adjacency,
+        max_step=max_step,  # maximum distance between samples (time points)
+        exclude=exclude,  # exclude no time points or channels
+        step_down_p=step_down_p,  # step down in jumps test
+        t_power=t_power,  # weigh each location by its stats score
+        out_type=out_type,
+        check_disjoint=check_disjoint,
+        buffer_size=buffer_size,  # block size for chunking the data
+        seed=seed,
+    )
+
+    return ClusterResult(stat_obs, clusters, cluster_p_values, H0, stat_fun)
+
+
+class ClusterResult:
+    """
+    Object containing the results of the cluster permutation test.
+
+    Parameters
+    ----------
+    stat_obs : np.ndarray
+        The observed test statistic.
+    clusters : list
+        List of clusters.
+    cluster_p_values : np.ndarray
+        P-values for each cluster.
+    H0 : np.ndarray
+        Max cluster level stats observed under permutation.
+    stat_fun : callable
+        The function used to compute the test statistic.
+    """
+
+    def __init__(
+        self,
+        stat_obs: np.typing.NDArray,
+        clusters: list,
+        cluster_p_values: np.typing.NDArray,
+        H0: np.typing.NDArray,
+        stat_fun: callable,
+    ):
+        self.stat_obs = stat_obs
+        self.clusters = clusters
+        self.cluster_p_values = cluster_p_values
+        self.H0 = H0
+        self.stat_fun = stat_fun
+
+        # an unpaired t-test is equivalent to f_oneway with 2 groups
+        if stat_fun is f_oneway:
+            self.stat_name = "F-statistic"
+        elif stat_fun is ttest_1samp_no_p:
+            self.stat_name = "paired T-statistic"
+        else:
+            self.stat_name = "test statistic"
+
+    def plot_cluster_time_sensor(
+        self,
+        condition_labels: dict,
+        colors: list | dict | None = None,
+        linestyles: list | dict | None = None,
+        cmap_evokeds: None | str | tuple = None,
+        cmap_topo: None | str | tuple = None,
+        ci: float | bool | callable | None = None,
+    ):
+        """
+        Plot the cluster with the lowest p-value.
+
+        The cluster is plotted as a topomap (spatial extent) on the left and as
+        evoked signals (temporal extent) on the right. Timepoints that are part
+        of the cluster are shaded in grey on the evoked signals.
+
+        Parameters
+        ----------
+        condition_labels : dict
+            Dictionary with condition labels as keys and lists of evoked objects
+            as values.
+        colors : list | dict | None
+            Colors to use when plotting the ERP lines and confidence bands.
+        linestyles : list | dict | None
+            Styles to use when plotting the ERP lines.
+        cmap_evokeds : None | str | tuple
+            Colormap from which to draw color values when plotting the ERP lines.
+        cmap_topo : None | str | tuple
+            Colormap to use for the topomap.
+        ci : float | bool | callable | None
+            Confidence band around each ERP time series.
+ """ + # extract condition labels from the dictionary + cond_keys = list(condition_labels.keys()) + # extract the evokeds from the dictionary + cond_values = list(condition_labels.values()) + + linestyles = {cond_keys[0]: "-", cond_keys[1]: "--"} + + lowest_p_cluster = np.argmin(self.cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(self.clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for t stat + t_map = self.stat_obs[time_inds, ...].mean(axis=0).astype(int) + + # get signals at the sensors contributing to the cluster + sig_times = cond_values[0][0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap=cmap_topo, + show=False, + colorbar=False, + mask_params=dict(markersize=10), + scalings=1.00, + ) + image = ax_topo.images[0] + + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title( + "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format( + *sig_times[[0, -1]] + ) + ) + + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) + + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) + cbar = plt.colorbar(image, cax=ax_colorbar) + cbar.set_label(self.stat_name) + + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.3) + title = ( + f"Temporal cluster extent:\nSignal averaged over {len(ch_inds)} sensor(s)" + ) + plot_compare_evokeds( + condition_labels, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + linestyles=linestyles, + cmap=cmap_evokeds, + show=False, + split_legend=True, + truncate_yaxis="auto", + truncate_xaxis=False, + ci=ci, + ) + plt.legend(frameon=False, loc="upper left") + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="grey", alpha=0.3 + ) + + plt.show() diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index e319d018328..06a87a07477 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -15,10 +15,19 @@ ) from scipy import linalg, sparse, stats -from mne import MixedSourceEstimate, SourceEstimate, SourceSpaces, VolSourceEstimate +from mne import ( + EpochsArray, + EvokedArray, + MixedSourceEstimate, + SourceEstimate, + SourceSpaces, + VolSourceEstimate, + create_info, +) from mne.fixes import _eye_array from mne.stats import combine_adjacency, ttest_ind_no_p from mne.stats.cluster_level import ( + cluster_test, f_oneway, permutation_cluster_1samp_test, permutation_cluster_test, @@ -27,7 +36,8 @@ summarize_clusters_stc, ttest_1samp_no_p, ) -from mne.utils import _record_warnings, catch_logging +from mne.time_frequency import AverageTFRArray, BaseTFR, EpochsTFRArray +from mne.utils import GetEpochsMixin, _record_warnings, catch_logging n_space = 50 @@ -867,3 +877,109 @@ def test_output_equiv(shape, out_type, adjacency, threshold): assert out_type == "indices" got_mask[np.ix_(*clu)] = n 
     assert_array_equal(got_mask, want_mask)
+
+
+def test_compare_old_and_new_cluster_api():
+    """Test for same results from old and new APIs."""
+    pd = pytest.importorskip("pandas")
+    condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions()
+    df_1d = pd.DataFrame(
+        dict(
+            data=[condition1_1d, condition2_1d],
+            condition=["a", "b"],
+        )
+    )
+    kwargs = dict(n_permutations=100, tail=1, seed=1, buffer_size=None, out_type="mask")
+    F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(
+        [condition1_1d, condition2_1d], **kwargs
+    )
+    formula = "data ~ condition"
+    cluster_result = cluster_test(df_1d, formula, **kwargs)
+    assert_array_equal(cluster_result.H0, H0)
+    assert_array_equal(cluster_result.stat_obs, F_obs)
+    assert_array_equal(cluster_result.cluster_p_values, cluster_pvals)
+    # `==` on lists of arrays is ambiguous, so compare cluster by cluster
+    assert len(cluster_result.clusters) == len(clusters)
+    for clu1, clu2 in zip(cluster_result.clusters, clusters):
+        assert_array_equal(clu1, clu2)
+
+
+@pytest.mark.parametrize(
+    "Inst", (EpochsArray, EvokedArray, EpochsTFRArray, AverageTFRArray)
+)
+@pytest.mark.filterwarnings('ignore:Ignoring argument "tail":RuntimeWarning')
+def test_new_cluster_api(Inst):
+    """Test handling different MNE objects in the cluster API."""
+    pd = pytest.importorskip("pandas")
+
+    rng = np.random.default_rng(seed=8675309)
+    is_epo = GetEpochsMixin in Inst.__mro__
+    is_tfr = BaseTFR in Inst.__mro__
+
+    n_epo, n_chan, n_freq, n_times = 6, 3, 4, 5
+
+    # prepare the dimensions of the simulated data, then simulate
+    size = (n_chan,)
+    if is_epo:
+        size = (n_epo, *size)
+    if is_tfr:
+        size = (*size, n_freq)
+    size = (*size, n_times)
+    data = rng.normal(size=size)
+
+    # construct the instance
+    info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg")
+    kw = dict(times=np.arange(n_times), freqs=np.arange(n_freq)) if is_tfr else dict()
+    cond_a = Inst(data=data, info=info, **kw)
+    cond_b = cond_a.copy()
+    # introduce a significant difference in a specific region, time, and frequency
+    ch_start, ch_end = 0, 2  # 2 channels
+    t_start, t_end = 2, 4  # 2 times
+    f_start, f_end = 2, 4  # 2 freqs
+    if is_tfr:
+        cond_b._data[..., ch_start:ch_end, f_start:f_end, t_start:t_end] += 2
+    else:
+        cond_b._data[..., ch_start:ch_end, t_start:t_end] += 2
+    # for Evokeds/AverageTFRs, we create fake "subjects" as our observations within
+    # each condition, adding a bit of noise as we do so
+    if not is_epo:
+        insts = list()
+        for cond in cond_a, cond_b:
+            for _n in range(n_epo):
+                if not _n:
+                    insts.append(cond)
+                    continue
+                _cond = cond.copy()
+                _cond.data += rng.normal(scale=0.1, size=_cond.data.shape)
+                insts.append(_cond)
+        conds = np.repeat(["a", "b"], n_epo).tolist()
+    else:
+        # For Epochs(TFR)Array, each epoch is an observation and they're already
+        # noisy/non-identical, so no duplication / noise-addition is necessary.
+ insts = [cond_a, cond_b] + conds = ["a", "b"] + + # run new clustering API + df = pd.DataFrame(dict(data=insts, condition=conds)) + kwargs = dict( + n_permutations=100, seed=42, tail=1, buffer_size=None, out_type="mask" + ) + result_new_api = cluster_test(df, "data~condition", **kwargs) + + # make sure channels are last dimension for old API + if is_epo: + axes = (0, 3, 2, 1) if is_tfr else (0, 2, 1) + X = [cond_a.get_data().transpose(*axes), cond_b.get_data().transpose(*axes)] + else: + axes = (2, 1, 0) if is_tfr else (1, 0) + Xa = list() + Xb = list() + for inst, cond in zip(insts, conds): + container = Xa if cond == "a" else Xb + container.append(inst.get_data().transpose(*axes)) + X = [np.stack(Xa), np.stack(Xb)] + + F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(X, **kwargs) + assert_array_almost_equal(result_new_api.H0, H0) + assert_array_almost_equal(result_new_api.stat_obs, F_obs) + assert_array_almost_equal(result_new_api.cluster_p_values, cluster_pvals) + assert len(result_new_api.clusters) == len(clusters) + for clu1, clu2 in zip(result_new_api.clusters, clusters): + assert_array_equal(clu1, clu2) diff --git a/mne/utils/docs.py b/mne/utils/docs.py index 3e59d751d93..2c84f95f0e0 100644 --- a/mne/utils/docs.py +++ b/mne/utils/docs.py @@ -144,61 +144,54 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): formatting. This can add overhead so is meant only for debugging. """ -docdict["adjacency_clust"] = """ -adjacency : scipy.sparse.spmatrix | None | False +_adjacency_clust_template = """ +adjacency : scipy.sparse.spmatrix | {param_none}False Defines adjacency between locations in the data, where "locations" can be spatial vertices, frequency bins, time points, etc. For spatial vertices (i.e. sensor space data), see :func:`mne.channels.find_ch_adjacency` or :func:`mne.spatial_inter_hemi_adjacency`. For source space data, see - :func:`mne.spatial_src_adjacency` or - :func:`mne.spatio_temporal_src_adjacency`. If ``False``, assumes - no adjacency (each location is treated as independent and unconnected). - If ``None``, a regular lattice adjacency is assumed, connecting - each {sp} location to its neighbor(s) along the last dimension - of {{eachgrp}} ``{{x}}``{lastdim}. + :func:`mne.spatial_src_adjacency` or :func:`mne.spatio_temporal_src_adjacency`. + If ``False``, assumes no adjacency (each location is treated as independent and + unconnected).{if_none} If ``adjacency`` is a matrix, it is assumed to be symmetric (only the upper triangular half is used) and must be square with dimension equal to - ``{{x}}.shape[-1]`` {parone} or ``{{x}}.shape[-1] * {{x}}.shape[-2]`` - {partwo} or (optionally) - ``{{x}}.shape[-1] * {{x}}.shape[-2] * {{x}}.shape[-3]`` - {parthree}.{memory} + the product of the last 1, 2, or 3 data dimensions (e.g., for time-frequency data: + n_channels, n_channels * n_freqs, or n_channels * n_freqs * n_times).{memory} +""" +_if_none = """ If ``None``, a regular lattice adjacency is assumed, connecting + each {spatial}location to its neighbor(s) along the last dimension + of {the_data}. """ - -mem = ( - " If spatial adjacency is uniform in time, it is recommended to use " - "a square matrix with dimension ``{x}.shape[-1]`` (n_vertices) to save " - "memory and computation, and to use ``max_step`` to define the extent " - "of temporal adjacency to consider when clustering." -) -comb = " The function `mne.stats.combine_adjacency` may be useful for 4D data." 
st = dict( - sp="spatial", - lastdim="", - parone="(n_vertices)", - partwo="(n_times * n_vertices)", - parthree="(n_times * n_freqs * n_vertices)", - memory=mem, + param_none="None | ", + if_none=_if_none.format(spatial="spatial ", the_data="{eachgrp} ``{x}``"), + memory=""" + If spatial adjacency is uniform in time, it is recommended to use a square matrix + with dimension ``{x}.shape[-1]`` (n_vertices) to save memory and computation, + and to use ``max_step`` to define the extent of temporal adjacency to consider when + clustering. +""", ) tf = dict( - sp="", - lastdim=" (or the last two dimensions if ``{x}`` is 2D)", - parone="(for 2D data)", - partwo="(for 3D data)", - parthree="(for 4D data)", - memory=comb, + param_none="None | ", + if_none=_if_none.format( + spatial="", + the_data="{eachgrp} ``{x}`` (or the last two dimensions if ``{x}`` is 2D)", + ), + memory=""" + The function `mne.stats.combine_adjacency` may be useful for 4D data. +""", ) -nogroups = dict(eachgrp="", x="X") +nogrps = dict(eachgrp="", x="X") groups = dict(eachgrp="each group ", x="X[k]") -docdict["adjacency_clust_1"] = ( - docdict["adjacency_clust"].format(**tf).format(**nogroups) -) -docdict["adjacency_clust_n"] = docdict["adjacency_clust"].format(**tf).format(**groups) -docdict["adjacency_clust_st1"] = ( - docdict["adjacency_clust"].format(**st).format(**nogroups) -) -docdict["adjacency_clust_stn"] = ( - docdict["adjacency_clust"].format(**st).format(**groups) + +docdict["adjacency_clust_1"] = _adjacency_clust_template.format(**tf).format(**nogrps) +docdict["adjacency_clust_both"] = _adjacency_clust_template.format( + param_none="", if_none="", memory="" ) +docdict["adjacency_clust_n"] = _adjacency_clust_template.format(**tf).format(**groups) +docdict["adjacency_clust_st1"] = _adjacency_clust_template.format(**st).format(**nogrps) +docdict["adjacency_clust_stn"] = _adjacency_clust_template.format(**st).format(**groups) docdict["adjust_dig_chpi"] = """ adjust_dig : bool @@ -708,7 +701,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): docdict["check_disjoint_clust"] = """ check_disjoint : bool - Whether to check if the connectivity matrix can be separated into disjoint + Whether to check if the ``adjacency`` matrix can be separated into disjoint sets before clustering. This may lead to faster clustering, especially if the second dimension of ``X`` (usually the "time" dimension) is large. """ @@ -1416,7 +1409,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): """ docdict["exclude_clust"] = """ -exclude : bool array or None +exclude : array-like of bool | None Mask to apply to the data to exclude certain points from clustering (e.g., medial wall vertices). Should be the same shape as ``X``. If ``None``, no points are excluded. @@ -3962,7 +3955,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): seed : None | int | instance of ~numpy.random.RandomState A seed for the NumPy random number generator (RNG). If ``None`` (default), the seed will be obtained from the operating system - (see :class:`~numpy.random.RandomState` for details), meaning it will most + (see :class:`~numpy.random.RandomState` for details), meaning it will most likely produce different output every time this function or method is run. To achieve reproducible results, pass a value here to explicitly initialize the RNG with a defined state. 
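The two-stage formatting above works because ``str.format`` does not re-scan
substituted values: the first ``.format()`` call fills ``param_none``, ``if_none``,
and ``memory``, and any ``{eachgrp}``/``{x}`` placeholders arriving *inside those
values* survive to be filled by the second ``.format()`` call. A minimal runnable
sketch of the pattern (the names here are abbreviated illustrations, not the
actual docdict entries):

    template = "adjacency : spmatrix | {param_none}False ...{if_none}"
    if_none = " If None, a lattice adjacency is assumed for {the_data}."
    stage1 = template.format(
        param_none="None | ",
        # the substituted value still contains {eachgrp} and {x} afterwards
        if_none=if_none.format(the_data="{eachgrp}``{x}``"),
    )
    final = stage1.format(eachgrp="each group ", x="X[k]")  # second pass fills them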
@@ -4266,16 +4259,23 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): channel names in the file will be used when possible. """ -_stat_fun_clust_base = """ +_stat_fun_template = """ stat_fun : callable | None Function called to calculate the test statistic. Must accept 1D-array as - input and return a 1D array. If ``None`` (the default), uses - `mne.stats.{}`. + input and return a 1D array. If ``None`` (the default), uses {}. """ -docdict["stat_fun_clust_f"] = _stat_fun_clust_base.format("f_oneway") +docdict["stat_fun_clust_both"] = _stat_fun_template.format( + """:func:`mne.stats.ttest_1samp_no_p` + for paired tests and :func:`mne.stats.f_oneway` for unpaired tests or tests of + more than 2 groups.""" +) + +docdict["stat_fun_clust_f"] = _stat_fun_template.format(":func:`mne.stats.f_oneway`") -docdict["stat_fun_clust_t"] = _stat_fun_clust_base.format("ttest_1samp_no_p") +docdict["stat_fun_clust_t"] = _stat_fun_template.format( + ":func:`mne.stats.ttest_1samp_no_p`" +) docdict["static"] = """ static : instance of SpatialImage @@ -4486,10 +4486,10 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): threshold : float | dict | None The so-called "cluster forming threshold" in the form of a test statistic (note: this is not an alpha level / "p-value"). - If numeric, vertices with data values more extreme than ``threshold`` will - be used to form clusters. If ``None``, {} will be chosen + If numeric, vertices with stat values more extreme than ``threshold`` will + be used to form clusters. If ``None``, {which_thresh} will be chosen automatically that corresponds to a p-value of 0.05 for the given number of - observations (only valid when using {}). If ``threshold`` is a + observations (only valid when using {which_stat}). If ``threshold`` is a :class:`dict` (with keys ``'start'`` and ``'step'``) then threshold-free cluster enhancement (TFCE) will be used (see the :ref:`TFCE example ` and :footcite:`SmithNichols2009`). @@ -4497,8 +4497,14 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): a particular p-value for one-tailed or two-tailed tests. """ -f_test = ("an F-threshold", "an F-statistic") -docdict["threshold_clust_f"] = _threshold_clust_base.format(*f_test) +docdict["threshold_clust_both"] = _threshold_clust_base.format( + which_thresh="a t- or F-threshold", + which_stat="``stat_fun=None``, i.e., a paired t-test or one-way F-test", +) + +docdict["threshold_clust_f"] = _threshold_clust_base.format( + which_thresh="an F-threshold", which_stat="an F-statistic" +) docdict["threshold_clust_f_notes"] = """ For computing a ``threshold`` based on a p-value, use the conversion @@ -4510,8 +4516,9 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): thresh = scipy.stats.f.ppf(1 - pval, dfn=dfn, dfd=dfd) # F distribution """ -t_test = ("a t-threshold", "a t-statistic") -docdict["threshold_clust_t"] = _threshold_clust_base.format(*t_test) +docdict["threshold_clust_t"] = _threshold_clust_base.format( + which_thresh="a t-threshold", which_stat="a t-statistic" +) docdict["threshold_clust_t_notes"] = """ For computing a ``threshold`` based on a p-value, use the conversion @@ -4525,6 +4532,23 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): For testing the lower tail (``tail=-1``), don't subtract ``pval`` from 1. 
""" +docdict["threshold_clust_t_or_f_notes"] = """ +For computing a ``threshold`` based on a p-value, use the conversion +from :meth:`scipy.stats.rv_continuous.ppf`:: + + pval = 0.001 # arbitrary + # for t-statistic + df = n_observations - 1 # degrees of freedom for the t-test + thresh = scipy.stats.t.ppf(1 - pval / 2, df) # two-tailed, t distribution + # for f-statistic + dfn = n_conditions - 1 # degrees of freedom numerator + dfd = n_observations - n_conditions # degrees of freedom denominator + thresh = scipy.stats.f.ppf(1 - pval, dfn=dfn, dfd=dfd) # F distribution + +For a one-tailed test (``tail=1``), don't divide the p-value by 2. +For testing the lower tail (``tail=-1``), don't subtract ``pval`` from 1. +""" + docdict["time_bandwidth_tfr"] = """ time_bandwidth : float ``≥ 2.0`` Product between the temporal window length (in seconds) and the *full* diff --git a/pyproject.toml b/pyproject.toml index e82012b8d6d..0c36566021d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,7 +106,8 @@ full-no-qt = [ "snirf", "defusedxml", "neo", - "antio>=0.4.0", + "formulaic", + "antio>=0.4.0" ] full = ["mne[full-no-qt]", "PyQt6!=6.6.0", "PyQt6-Qt6!=6.6.0,!=6.7.0"] full-pyqt6 = ["mne[full]"] @@ -157,6 +158,7 @@ doc = [ "sphinxcontrib-towncrier", "memory_profiler", "neo", + "formulaic", "seaborn!=0.11.2", "sphinx_copybutton", "sphinx-design", diff --git a/tools/vulture_allowlist.py b/tools/vulture_allowlist.py index ce5e95124d4..3515c04917a 100644 --- a/tools/vulture_allowlist.py +++ b/tools/vulture_allowlist.py @@ -154,3 +154,6 @@ _qt_raise_window _qt_disable_paint _qt_get_stylesheet + +# used in tutorial, not sure why shows up +plot_cluster_time_sensor diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py new file mode 100644 index 00000000000..0e5aee91432 --- /dev/null +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -0,0 +1,164 @@ +""" +.. _tut-new-cluster-test-api: + +=============================================================== +New cluster test API that allows for Wilkinson style formulas +=============================================================== + +This tutorial shows how to use the new API for cluster testing. +The new API allows for Wilkinson style formulas and allows for more flexibility in +the design of the test. Here we will demonstrate how to use the new API for +a standard paired t-test on evoked data from multiple subjects. +It uses a non-parametric statistical procedure based on permutations and +cluster level statistics. + +The procedure consists of: + + - loading evoked data from multiple subjects + - construct a dataframe that contains the difference between conditions + - run the new cluster test function with formula in Wilkinson notation + - plot the results with the new ClusterResults API + +Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). + +For more information on cluster-based permutation testing in MNE-Python, +see also: :ref:`tut-cluster-one-samp-tfr`. +""" +# Authors: Carina Forster +# +# License: BSD-3-Clause +# Copyright the MNE-Python contributors. 
+
+# %% Load the required packages
+
+from pathlib import Path
+
+import pandas as pd
+
+import mne
+
+# %% Load the P3 dataset
+
+# Set parameters
+# --------------
+# Define the path to the P3 dataset
+path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3"
+
+# Define the range of participant IDs (we only have 5 participants in the dataset)
+participant_ids = range(15, 20)  # This will cover participants 15 to 19
+
+# store the evoked data of all subjects
+evokeds_allsubs = []
+
+# Loop over each participant ID and generate the corresponding filename
+# to load the evoked data
+for pid in participant_ids:
+    # Create the filename using an f-string; the ID is zero-padded to 3 digits
+    filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif"
+
+    # Create the full path to the file
+    p3_file_path = Path(path_to_p3) / filename_p3
+
+    # load the evoked data
+    evokeds = mne.read_evokeds(p3_file_path)
+
+    # add the single subject's evoked data to the list
+    evokeds_allsubs.append(evokeds)
+
+# the P3b dataset is part of the freely available ERP CORE dataset
+# participants were presented with a visual oddball task
+# and the P3b component was analyzed
+# the conditions of interest are the target (rare visual stimuli)
+# and non-target stimuli (frequent visual stimuli)
+
+# %% visually inspect the evoked data for each condition
+
+# let's extract the target and non-target evokeds
+target_only = [evoked[0] for evoked in evokeds_allsubs]
+non_target_only = [evoked[1] for evoked in evokeds_allsubs]
+
+# let's first have a look at the data
+
+# create the contrast target - non-target
+diff_evoked = [
+    mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1])
+    for evokeds_a, evokeds_b in zip(target_only, non_target_only)
+]
+
+# plot the grand average of the difference signal
+mne.grand_average(diff_evoked).plot()
+# plot the topography of the difference signal
+mne.grand_average(diff_evoked).plot_topomap()
+
+# we can see that the strongest difference is around 400 ms in
+# central-parietal channels, with a stronger evoked signal for target stimuli
+
+# %% Prepare the dataframe for the new cluster test API
+
+# the dataframe should contain the evoked data for both conditions and the
+# subject index; each row in the dataframe represents one observation
+# (one evoked object)
+
+# save the evoked data for both conditions in one list
+evokeds_conditions = target_only + non_target_only
+
+# create a list that defines the condition for each evoked object;
+# this will become the condition column in the dataframe
+conditions = ["target"] * len(target_only) + ["non-target"] * len(non_target_only)
+
+# finally, add a column that defines the subject index;
+# we repeat the participant IDs once per condition
+subject_index = list(participant_ids) * 2
+
+# create the dataframe containing the evoked data, the condition and the subject index
+df = pd.DataFrame(
+    {
+        "evoked": evokeds_conditions,
+        "condition": conditions,
+        "subject_index": subject_index,
+    }
+)
+
+# %% run the cluster test function with formulaic input
+
+# we will use the new API that accepts Wilkinson-style formulas;
+# the formula should be a string in Wilkinson notation
+
+# we want to test whether there is a significant difference between
+# target and non-target stimuli in the post-stimulus window,
+# using a cluster-based permutation paired t-test
+
+# let's define the formula in Wilkinson notation:
+# we want to predict the evoked
+# signal from the condition (target vs. non-target);
+# for the paired test, the within_id column identifies which observations
+# belong to the same subject, and the test permutes condition labels
+# within each subject
+formula = "evoked ~ condition"
+
+# run the new cluster test API, which returns a ClusterResult object
+cluster_result = mne.stats.cluster_level.cluster_test(
+    df=df, formula=formula, within_id="subject_index"
+)
+# TODO: add n_permutations to cluster_result
+
+# print the lowest cluster p-value
+print(f"The lowest cluster p-value is: {cluster_result.cluster_p_values.min()}")
+
+# note that an exact test was run due to the small sample size
+# (with only 5 subjects there are few possible permutations)
+
+# %% plot the results
+
+# set up the conditions dictionary for the cluster plots;
+# this is needed for plotting the evoked data and the cluster result on top
+conditions_dict = {"target": target_only, "non-target": non_target_only}
+
+# finally, let's plot the results using the ClusterResult class
+
+# we plot the cluster with the lowest p-value
+cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict, ci=True)
+# we can see that there is something going on around 400 ms,
+# with a stronger signal for target trials in right central-parietal channels
+
+# however, the cluster is not significant, which is unsurprising
+# given the small sample size (only 5 subjects)
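+
+# %% Optional: cross-check against the classic API
+
+# a minimal sketch (mirroring the unit tests in this PR) showing that the same
+# paired contrast can be run through the long-standing one-sample cluster test
+# on the subject-wise difference waves; exact agreement depends on using the
+# same tail, threshold, and permutation settings in both calls
+import numpy as np
+
+# observations x times x channels, since the classic API expects channels last
+X = np.stack(
+    [
+        (a.get_data() - b.get_data()).T
+        for a, b in zip(target_only, non_target_only)
+    ]
+)
+t_obs, clusters, cluster_pv, H0 = mne.stats.permutation_cluster_1samp_test(X, seed=1)
+print(f"lowest cluster p-value from the classic API: {cluster_pv.min()}")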