From 73765d4ba00142c5de7a83d4942ba85e152a7a99 Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Thu, 17 Aug 2023 19:09:25 +0200 Subject: [PATCH] Vendor model_builder from pymc-experimental (#339) * Vendor model_builder from pymc-experimental * Remove pymc-experimental dependency * Add `pymc_marketing.vendored` to setuptools packages * Fix setuptools package reference * Exclude vendored from codecov * Fix codecov paths * Move model_builder directly inside pymc-marketing * Remove vendored.pymc_experimental as setuptools package * Start cleaning up mypy errors * fixing mypy errors * implementing abstract methods * Add model_builder tests * Satisfy linter for test_model_builder.py --------- Co-authored-by: Michal Raczycki --- codecov.yml | 2 +- mypy.ini | 3 - pymc_marketing/clv/models/basic.py | 29 +- pymc_marketing/clv/models/beta_geo.py | 2 +- pymc_marketing/clv/models/gamma_gamma.py | 3 +- pymc_marketing/clv/models/pareto_nbd.py | 3 +- pymc_marketing/clv/models/shifted_beta_geo.py | 2 +- pymc_marketing/mmm/base.py | 8 +- pymc_marketing/mmm/delayed_saturated_mmm.py | 2 +- pymc_marketing/model_builder.py | 770 ++++++++++++++++++ pyproject.toml | 3 +- tests/mmm/test_base.py | 4 + tests/model_builder/test_model_builder.py | 246 ++++++ 13 files changed, 1056 insertions(+), 21 deletions(-) create mode 100644 pymc_marketing/model_builder.py create mode 100644 tests/model_builder/test_model_builder.py diff --git a/codecov.yml b/codecov.yml index 2eda28a2d..b16612ff3 100644 --- a/codecov.yml +++ b/codecov.yml @@ -7,7 +7,7 @@ coverage: threshold: 2% base: auto paths: - - "pymc-marketing/" + - "pymc_marketing" # advanced settings branches: - main diff --git a/mypy.ini b/mypy.ini index 7b73073ab..76d940a36 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,6 +17,3 @@ ignore_missing_imports = True [mypy-scipy.*] ignore_missing_imports = True - -[mypy-pymc_experimental.*] -ignore_missing_imports = True diff --git a/pymc_marketing/clv/models/basic.py b/pymc_marketing/clv/models/basic.py index 5b35d1869..9173d6fc4 100644 --- a/pymc_marketing/clv/models/basic.py +++ b/pymc_marketing/clv/models/basic.py @@ -2,17 +2,20 @@ import types import warnings from pathlib import Path -from typing import Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple, Union import arviz as az +import numpy as np +import pandas as pd import pymc as pm from pymc import str_for_dist from pymc.backends import NDArray from pymc.backends.base import MultiTrace -from pymc_experimental.model_builder import ModelBuilder from pytensor.tensor import TensorVariable from xarray import Dataset +from pymc_marketing.model_builder import ModelBuilder + class CLVModel(ModelBuilder): _model_type = "" @@ -27,7 +30,7 @@ def __init__( def __repr__(self): return f"{self._model_type}\n{self.model.str_repr()}" - def fit( + def fit( # type: ignore self, fit_method: str = "mcmc", **kwargs, @@ -44,7 +47,7 @@ def fit( Other keyword arguments passed to the underlying PyMC routines """ - self.build_model() + self.build_model() # type: ignore if fit_method == "mcmc": self._fit_mcmc(**kwargs) @@ -179,7 +182,7 @@ def load(cls, fname: str): ) model.idata = idata - model.build_model() + model.build_model() # type: ignore if model.id != idata.attrs["id"]: raise ValueError( @@ -225,7 +228,7 @@ def default_sampler_config(self) -> Dict: def _serializable_model_config(self) -> Dict: return self.model_config - def sample_prior_predictive( + def sample_prior_predictive( # type: ignore self, samples: int = 1000, extend_idata: bool = True, @@ -285,3 +288,17 @@ def 
fit_summary(self, **kwargs): return res["mean"].rename("value") else: return az.summary(self.fit_result, **kwargs) + + @property + def output_var(self): + pass + + def generate_and_preprocess_model_data( + self, + X: Union[pd.DataFrame, pd.Series], + y: Union[pd.Series, np.ndarray[Any, Any]], + ) -> None: + pass + + def _data_setter(self): + pass diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py index a193ac65f..09920abe7 100644 --- a/pymc_marketing/clv/models/beta_geo.py +++ b/pymc_marketing/clv/models/beta_geo.py @@ -155,7 +155,7 @@ def default_model_config(self) -> Dict[str, Dict]: "r_prior": {"dist": "HalfFlat", "kwargs": {}}, } - def build_model( + def build_model( # type: ignore self, ) -> None: with pm.Model(coords=self.coords) as self.model: diff --git a/pymc_marketing/clv/models/gamma_gamma.py b/pymc_marketing/clv/models/gamma_gamma.py index 6f9f00e3b..b1c535100 100644 --- a/pymc_marketing/clv/models/gamma_gamma.py +++ b/pymc_marketing/clv/models/gamma_gamma.py @@ -71,7 +71,7 @@ def expected_customer_spend( mean_transaction_value, frequency = to_xarray( customer_id, mean_transaction_value, frequency ) - + assert self.idata is not None, "Model must be fitted first" p = self.idata.posterior["p"] q = self.idata.posterior["q"] v = self.idata.posterior["v"] @@ -104,6 +104,7 @@ def distribution_new_customer_spend( def expected_new_customer_spend(self) -> xarray.DataArray: """Expected transaction value for a new customer""" + assert self.idata is not None, "Model must be fitted first" p_mean = self.idata.posterior["p"] q_mean = self.idata.posterior["q"] v_mean = self.idata.posterior["v"] diff --git a/pymc_marketing/clv/models/pareto_nbd.py b/pymc_marketing/clv/models/pareto_nbd.py index b75a3fadd..40c4f9b80 100644 --- a/pymc_marketing/clv/models/pareto_nbd.py +++ b/pymc_marketing/clv/models/pareto_nbd.py @@ -218,7 +218,7 @@ def default_model_config(self) -> Dict[str, Dict]: "beta_prior": {"dist": "Weibull", "kwargs": {"alpha": 2, "beta": 10}}, } - def build_model( + def build_model( # type: ignore self, ) -> None: with pm.Model(coords=self.coords) as self.model: @@ -245,6 +245,7 @@ def _unload_params( self, ) -> Tuple[Any, ...]: """Utility function retrieving posterior parameters for predictive methods""" + assert self.idata is not None, "Model must be fit first." return tuple([self.idata.posterior[param] for param in self._params]) # TODO: Convert to list comprehension to support covariates? 
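Aside on the no-op stubs added to `CLVModel` in basic.py above: they exist because the vendored `ModelBuilder` declares `output_var`, `generate_and_preprocess_model_data`, and `_data_setter` as abstract, and Python refuses to instantiate a subclass until every abstract member has *some* implementation. A minimal sketch of that constraint follows; the `Builder`/`CLVLike` names are illustrative stand-ins, not part of this patch.

from abc import ABC, abstractmethod


class Builder(ABC):
    """Stand-in for the vendored ModelBuilder ABC."""

    @property
    @abstractmethod
    def output_var(self): ...

    @abstractmethod
    def _data_setter(self, X, y=None): ...


class CLVLike(Builder):
    """CLV-style subclass: fitting goes through build_model()/fit() directly,
    so the sklearn-style hooks can be inert placeholders."""

    @property
    def output_var(self):
        pass

    def _data_setter(self, X, y=None):
        pass


CLVLike()  # instantiable only because both abstract members are overridden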
diff --git a/pymc_marketing/clv/models/shifted_beta_geo.py b/pymc_marketing/clv/models/shifted_beta_geo.py index ff5dee52f..e5a548bfb 100644 --- a/pymc_marketing/clv/models/shifted_beta_geo.py +++ b/pymc_marketing/clv/models/shifted_beta_geo.py @@ -126,7 +126,7 @@ def default_model_config(self) -> Dict: "beta_prior": {"dist": "HalfFlat", "kwargs": {}}, } - def build_model( + def build_model( # type: ignore self, ) -> None: with pm.Model(coords=self.coords) as self.model: diff --git a/pymc_marketing/mmm/base.py b/pymc_marketing/mmm/base.py index d0583f8c2..49f5f3268 100644 --- a/pymc_marketing/mmm/base.py +++ b/pymc_marketing/mmm/base.py @@ -14,7 +14,6 @@ import pandas as pd import pymc as pm import seaborn as sns -from pymc_experimental.model_builder import ModelBuilder from sklearn.pipeline import Pipeline from sklearn.preprocessing import FunctionTransformer from xarray import DataArray, Dataset @@ -24,6 +23,7 @@ ValidateDateColumn, ValidateTargetColumn, ) +from pymc_marketing.model_builder import ModelBuilder __all__ = ("BaseMMM", "MMM") @@ -271,7 +271,7 @@ def plot_prior_predictive( ax.plot( np.asarray(self.X[self.date_column]), - np.asarray(self.preprocessed_data["y"]), + np.asarray(self.preprocessed_data["y"]), # type: ignore color="black", ) ax.set( @@ -323,7 +323,7 @@ def plot_posterior_predictive( ) target_to_plot: np.ndarray = np.asarray( - self.y if original_scale else self.preprocessed_data["y"] + self.y if original_scale else self.preprocessed_data["y"] # type: ignore ) ax.plot( np.asarray(self.X[self.date_column]), @@ -423,7 +423,7 @@ def plot_components_contributions(self, **plt_kwargs: Any) -> plt.Figure: ) ax.plot( np.asarray(self.X[self.date_column]), - np.asarray(self.preprocessed_data["y"]), + np.asarray(self.preprocessed_data["y"]), # type: ignore color="black", ) ax.legend(title="components", loc="center left", bbox_to_anchor=(1, 0.5)) diff --git a/pymc_marketing/mmm/delayed_saturated_mmm.py b/pymc_marketing/mmm/delayed_saturated_mmm.py index 7333d6174..4c476a28a 100644 --- a/pymc_marketing/mmm/delayed_saturated_mmm.py +++ b/pymc_marketing/mmm/delayed_saturated_mmm.py @@ -83,7 +83,7 @@ def default_sampler_config(self) -> Dict: def output_var(self): return "y" - def generate_and_preprocess_model_data( + def generate_and_preprocess_model_data( # type: ignore self, X: Union[pd.DataFrame, pd.Series], y: pd.Series ) -> None: """ diff --git a/pymc_marketing/model_builder.py b/pymc_marketing/model_builder.py new file mode 100644 index 000000000..1e3654b06 --- /dev/null +++ b/pymc_marketing/model_builder.py @@ -0,0 +1,770 @@ +# Copyright 2023 The PyMC Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import hashlib +import json +import warnings +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import arviz as az +import numpy as np +import pandas as pd +import pymc as pm +import xarray as xr +from pymc.util import RandomState + +# If scikit-learn is available, use its data validator +try: + from sklearn.utils.validation import check_array, check_X_y +# If scikit-learn is not available, return the data unchanged +except ImportError: + + def check_X_y(X, y, **kwargs): + return X, y + + def check_array(X, **kwargs): + return X + + +class ModelBuilder(ABC): + """ + ModelBuilder can be used to provide an easy-to-use API (similar to scikit-learn) for models + and help with deployment. + """ + + _model_type = "BaseClass" + version = "None" + + X: Optional[pd.DataFrame] = None + y: Optional[pd.Series] = None + + def __init__( + self, + model_config: Optional[Dict] = None, + sampler_config: Optional[Dict] = None, + ): + """ + Initializes model configuration and sampler configuration for the model + + Parameters + ---------- + data : Dictionary, optional + It is the data we need to train the model on. + model_config : Dictionary, optional + dictionary of parameters that initialise model configuration. Class-default defined by the user default_model_config method. + sampler_config : Dictionary, optional + dictionary of parameters that initialise sampler configuration. Class-default defined by the user default_sampler_config method. + Examples + -------- + >>> class MyModel(ModelBuilder): + >>> ... + >>> model = MyModel(model_config, sampler_config) + """ + if sampler_config is None: + sampler_config = self.default_sampler_config + if model_config is None: + model_config = self.default_model_config + self.sampler_config = sampler_config + self.model_config = model_config # parameters for priors etc. + self.model: Optional[pm.Model] = None # Set by build_model + self.idata: Optional[ + az.InferenceData + ] = None # idata is generated during fitting + self.is_fitted_ = False + + def _validate_data(self, X, y=None): + if y is not None: + return check_X_y( + X, y, accept_sparse=False, y_numeric=True, multi_output=False + ) + else: + return check_array(X, accept_sparse=False) + + @abstractmethod + def _data_setter( + self, + X: Union[np.ndarray, pd.DataFrame], + y: Optional[Union[np.ndarray, pd.Series]] = None, + ) -> None: + """ + Sets new data in the model. + + Parameters + ---------- + X : array, shape (n_obs, n_features) + The training input samples. + y : array, shape (n_obs,) + The target values (real numbers). + + Returns: + ---------- + None + + Examples + -------- + >>> def _data_setter(self, data : pd.DataFrame): + >>> with self.model: + >>> pm.set_data({'x': X['x'].values}) + >>> try: # if y values in new data + >>> pm.set_data({'y_data': y.values}) + >>> except: # dummies otherwise + >>> pm.set_data({'y_data': np.zeros(len(data))}) + + """ + + raise NotImplementedError + + @property + @abstractmethod + def output_var(self): + """ + Returns the name of the output variable of the model. + + Returns + ------- + output_var : str + Name of the output variable of the model. 
+ """ + raise NotImplementedError + + @property + @abstractmethod + def default_model_config(self) -> Dict: + """ + Returns a class default config dict for model builder if no model_config is provided on class initialization + Useful for understanding structure of required model_config to allow its customization by users + Examples + -------- + >>> @classmethod + >>> def default_model_config(self): + >>> Return { + >>> 'a' : { + >>> 'loc': 7, + >>> 'scale' : 3 + >>> }, + >>> 'b' : { + >>> 'loc': 3, + >>> 'scale': 5 + >>> } + >>> 'obs_error': 2 + >>> } + + Returns + ------- + model_config : dict + A set of default parameters for predictor distributions that allow to save and recreate the model. + """ + raise NotImplementedError + + @property + @abstractmethod + def default_sampler_config(self) -> Dict: + """ + Returns a class default sampler dict for model builder if no sampler_config is provided on class initialization + Useful for understanding structure of required sampler_config to allow its customization by users + Examples + -------- + >>> @classmethod + >>> def default_sampler_config(self): + >>> Return { + >>> 'draws': 1_000, + >>> 'tune': 1_000, + >>> 'chains': 1, + >>> 'target_accept': 0.95, + >>> } + + Returns + ------- + sampler_config : dict + A set of default settings for used by model in fit process. + """ + raise NotImplementedError + + @abstractmethod + def generate_and_preprocess_model_data( + self, + X: Union[pd.DataFrame, pd.Series], + y: Union[pd.Series, np.ndarray[Any, Any]], + ) -> None: + """ + Applies preprocessing to the data before fitting the model. + if validate is True, it will check if the data is valid for the model. + sets self.model_coords based on provided dataset + + Parameters: + X : array, shape (n_obs, n_features) + y : array, shape (n_obs,) + + Examples + -------- + >>> @classmethod + >>> def generate_and_preprocess_model_data(self, X, y): + >>> x = np.linspace(start=1, stop=50, num=100) + >>> y = 5 * x + 3 + np.random.normal(0, 1, len(x)) * np.random.rand(100)*10 + np.random.rand(100)*6.4 + >>> X = pd.DataFrame(x, columns=['x']) + >>> y = pd.Series(y, name='y') + >>> self.X = X + >>> self.y = y + + Returns + ------- + None + + """ + raise NotImplementedError + + @abstractmethod + def build_model( + self, + X: pd.DataFrame, + y: pd.Series, + **kwargs, + ) -> None: + """ + Creates an instance of pm.Model based on provided data and model_config, and + attaches it to self. + + Parameters + ---------- + X : pd.DataFrame + The input data that is going to be used in the model. This should be a DataFrame + containing the features (predictors) for the model. For efficiency reasons, it should + only contain the necessary data columns, not the entire available dataset, as this + will be encoded into the data used to recreate the model. + + y : pd.Series + The target data for the model. This should be a Series representing the output + or dependent variable for the model. + + kwargs : dict + Additional keyword arguments that may be used for model configuration. + + See Also + -------- + default_model_config : returns default model config + + Returns + ------- + None + + Raises + ------ + NotImplementedError + This is an abstract method and must be implemented in a subclass. + """ + raise NotImplementedError + + def sample_model(self, **kwargs): + """ + Sample from the PyMC model. + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments to pass to the PyMC sampler. + + Returns + ------- + xarray.Dataset + The PyMC samples dataset. 
+
+        Raises
+        ------
+        RuntimeError
+            If the PyMC model hasn't been built yet.
+
+        Examples
+        --------
+        >>> self.build_model()
+        >>> idata = self.sample_model(draws=100, tune=10)
+        >>> assert isinstance(idata, az.InferenceData)
+        >>> assert "posterior" in idata
+        >>> assert "prior" in idata
+        >>> assert "observed_data" in idata
+        >>> assert "log_likelihood" in idata
+        """
+        if self.model is None:
+            raise RuntimeError(
+                "The model hasn't been built yet, call .build_model() first or call .fit() instead."
+            )
+
+        with self.model:
+            sampler_args = {**self.sampler_config, **kwargs}
+            idata = pm.sample(**sampler_args)
+            idata.extend(pm.sample_prior_predictive())
+            idata.extend(pm.sample_posterior_predictive(idata))
+
+        idata = self.set_idata_attrs(idata)
+        return idata
+
+    def set_idata_attrs(self, idata=None):
+        """
+        Set attributes on an InferenceData object.
+
+        Parameters
+        ----------
+        idata : arviz.InferenceData, optional
+            The InferenceData object to set attributes on. Defaults to self.idata.
+
+        Raises
+        ------
+        RuntimeError
+            If no InferenceData object is provided and the model has none attached.
+
+        Returns
+        -------
+        idata : arviz.InferenceData
+            The provided InferenceData, with attributes set.
+
+        Examples
+        --------
+        >>> model = MyModel()
+        >>> idata = az.InferenceData(your_dataset)
+        >>> model.set_idata_attrs(idata=idata)
+        >>> assert "id" in idata.attrs
+        >>> assert "model_type" in idata.attrs
+        >>> assert "version" in idata.attrs
+        >>> assert "sampler_config" in idata.attrs
+        >>> assert "model_config" in idata.attrs
+        """
+        if idata is None:
+            idata = self.idata
+        if idata is None:
+            raise RuntimeError("No idata provided to set attrs on.")
+        idata.attrs["id"] = self.id
+        idata.attrs["model_type"] = self._model_type
+        idata.attrs["version"] = self.version
+        idata.attrs["sampler_config"] = json.dumps(self.sampler_config)
+        idata.attrs["model_config"] = json.dumps(self._serializable_model_config)
+        # Only classes with non-dataset parameters will implement save_input_params
+        if hasattr(self, "_save_input_params"):
+            self._save_input_params(idata)
+        return idata
+
+    def save(self, fname: str) -> None:
+        """
+        Save the model's inference data to a file.
+
+        Parameters
+        ----------
+        fname : str
+            The name and path of the file to save the inference data with model parameters.
+
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        RuntimeError
+            If the model hasn't been fit yet (no inference data available).
+
+        Examples
+        --------
+        >>> class MyModel(ModelBuilder):
+        >>>     def __init__(self):
+        >>>         super().__init__()
+        >>> model = MyModel()
+        >>> model.fit(data)
+        >>> model.save('model_results.nc')
+        """
+        if self.idata is not None and "posterior" in self.idata:
+            file = Path(str(fname))
+            self.idata.to_netcdf(str(file))
+        else:
+            raise RuntimeError("The model hasn't been fit yet, call .fit() first")
+
+    @classmethod
+    def _model_config_formatting(cls, model_config: Dict) -> Dict:
+        """
+        Because of JSON serialization, model_config values that were originally tuples or numpy arrays are encoded as lists.
+        This function converts them back to tuples and numpy arrays to ensure correct id encoding.
+ """ + for key in model_config: + if isinstance(model_config[key], dict): + for sub_key in model_config[key]: + if isinstance(model_config[key][sub_key], list): + # Check if "dims" key to convert it to tuple + if sub_key == "dims": + model_config[key][sub_key] = tuple( + model_config[key][sub_key] + ) + # Convert all other lists to numpy arrays + else: + model_config[key][sub_key] = np.array( + model_config[key][sub_key] + ) + return model_config + + @classmethod + def load(cls, fname: str): + """ + Creates a ModelBuilder instance from a file, + Loads inference data for the model. + + Parameters + ---------- + fname : string + This denotes the name with path from where idata should be loaded from. + + Returns + ------- + Returns an instance of ModelBuilder. + + Raises + ------ + ValueError + If the inference data that is loaded doesn't match with the model. + Examples + -------- + >>> class MyModel(ModelBuilder): + >>> ... + >>> name = './mymodel.nc' + >>> imported_model = MyModel.load(name) + """ + filepath = Path(str(fname)) + idata = az.from_netcdf(filepath) + # needs to be converted, because json.loads was changing tuple to list + model_config = cls._model_config_formatting( + json.loads(idata.attrs["model_config"]) + ) + model = cls( + model_config=model_config, + sampler_config=json.loads(idata.attrs["sampler_config"]), + ) + model.idata = idata + dataset = idata.fit_data.to_dataframe() + X = dataset.drop(columns=[model.output_var]) + y = dataset[model.output_var] + model.build_model(X, y) + # All previously used data is in idata. + + if model.id != idata.attrs["id"]: + raise ValueError( + f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'" + ) + + return model + + def fit( + self, + X: pd.DataFrame, + y: Optional[Union[pd.Series, np.ndarray]] = None, + progressbar: bool = True, + predictor_names: Optional[List[str]] = None, + random_seed: RandomState = None, + **kwargs: Any, + ) -> az.InferenceData: + """ + Fit a model using the data passed as a parameter. + Sets attrs to inference data of the model. + + + Parameters + ---------- + X : array-like if sklearn is available, otherwise array, shape (n_obs, n_features) + The training input samples. + y : array-like if sklearn is available, otherwise array, shape (n_obs,) + The target values (real numbers). + progressbar : bool + Specifies whether the fit progressbar should be displayed + predictor_names: List[str] = None, + Allows for custom naming of predictors given in a form of 2dArray + allows for naming of predictors when given in a form of np.ndarray, if not provided the predictors will be named like predictor1, predictor2... + random_seed : RandomState + Provides sampler with initial random seed for obtaining reproducible samples + **kwargs : Any + Custom sampler settings can be provided in form of keyword arguments. + + Returns + ------- + self : az.InferenceData + returns inference data of the fitted model. + Examples + -------- + >>> model = MyModel() + >>> idata = model.fit(data) + Auto-assigning NUTS sampler... + Initializing NUTS using jitter+adapt_diag... 
+ """ + if predictor_names is None: + predictor_names = [] + if y is None: + y = np.zeros(X.shape[0]) + y_df = pd.DataFrame({self.output_var: y}) + self.generate_and_preprocess_model_data(X, y_df.values.flatten()) + if self.X is None or self.y is None: + raise ValueError("X and y must be set before calling build_model!") + self.build_model(self.X, self.y) + + sampler_config = self.sampler_config.copy() + sampler_config["progressbar"] = progressbar + sampler_config["random_seed"] = random_seed + sampler_config.update(**kwargs) + self.idata = self.sample_model(**sampler_config) + + X_df = pd.DataFrame(X, columns=X.columns) + combined_data = pd.concat([X_df, y_df], axis=1) + assert all(combined_data.columns), "All columns must have non-empty names" + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=UserWarning, + message="The group fit_data is not defined in the InferenceData scheme", + ) + self.idata.add_groups(fit_data=combined_data.to_xarray()) # type: ignore + + return self.idata # type: ignore + + def predict( + self, + X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], + extend_idata: bool = True, + **kwargs, + ) -> np.ndarray: + """ + Uses model to predict on unseen data and return point prediction of all the samples. The point prediction + for each input row is the expected output value, computed as the mean of MCMC samples. + + Parameters + --------- + X_pred : array-like if sklearn is available, otherwise array, shape (n_pred, n_features) + The input data used for prediction. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to True. + **kwargs: Additional arguments to pass to pymc.sample_posterior_predictive + + Returns + ------- + y_pred : ndarray, shape (n_pred,) + Predicted output corresponding to input X_pred. + + Examples + -------- + >>> model = MyModel() + >>> idata = model.fit(data) + >>> x_pred = [] + >>> prediction_data = pd.DataFrame({'input':x_pred}) + >>> pred_mean = model.predict(prediction_data) + """ + + posterior_predictive_samples = self.sample_posterior_predictive( + X_pred, extend_idata, combined=False, **kwargs + ) + + if self.output_var not in posterior_predictive_samples: + raise KeyError( + f"Output variable {self.output_var} not found in posterior predictive samples." + ) + + posterior_means = posterior_predictive_samples[self.output_var].mean( + dim=["chain", "draw"], keep_attrs=True + ) + return posterior_means.data + + def sample_prior_predictive( + self, + X_pred, + y_pred=None, + samples: Optional[int] = None, + extend_idata: bool = False, + combined: bool = True, + **kwargs, + ): + """ + Sample from the model's prior predictive distribution. + + Parameters + --------- + X_pred : array, shape (n_pred, n_features) + The input data used for prediction using prior distribution. + samples : int + Number of samples from the prior parameter distributions to generate. + If not set, uses sampler_config['draws'] if that is available, otherwise defaults to 500. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to False. + combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists. + Defaults to True. 
+ **kwargs: Additional arguments to pass to pymc.sample_prior_predictive + + Returns + ------- + prior_predictive_samples : DataArray, shape (n_pred, samples) + Prior predictive samples for each input X_pred + """ + if y_pred is None: + y_pred = np.zeros(len(X_pred)) + if samples is None: + samples = self.sampler_config.get("draws", 500) + + if self.model is None: + self.build_model(X_pred, y_pred) + + self._data_setter(X_pred, y_pred) + if self.model is not None: + with self.model: # sample with new input data + prior_pred: az.InferenceData = pm.sample_prior_predictive( + samples, **kwargs + ) + self.set_idata_attrs(prior_pred) + if extend_idata: + if self.idata is not None: + self.idata.extend(prior_pred) + else: + self.idata = prior_pred + + prior_predictive_samples = az.extract( + prior_pred, "prior_predictive", combined=combined + ) + + return prior_predictive_samples + + def sample_posterior_predictive(self, X_pred, extend_idata, combined, **kwargs): + """ + Sample from the model's posterior predictive distribution. + + Parameters + --------- + X_pred : array, shape (n_pred, n_features) + The input data used for prediction using prior distribution.. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to False. + combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists. + Defaults to True. + **kwargs: Additional arguments to pass to pymc.sample_posterior_predictive + + Returns + ------- + posterior_predictive_samples : DataArray, shape (n_pred, samples) + Posterior predictive samples for each input X_pred + """ + self._data_setter(X_pred) + + with self.model: # sample with new input data + post_pred = pm.sample_posterior_predictive(self.idata, **kwargs) + if extend_idata: + self.idata.extend(post_pred) + + posterior_predictive_samples = az.extract( + post_pred, "posterior_predictive", combined=combined + ) + + return posterior_predictive_samples + + def get_params(self, deep=True): + """ + Get all the model parameters needed to instantiate a copy of the model, not including training data. + """ + return { + "model_config": self.model_config, + "sampler_config": self.sampler_config, + } + + def set_params(self, **params): + """ + Set all the model parameters needed to instantiate the model, not including training data. + """ + self.model_config = params["model_config"] + self.sampler_config = params["sampler_config"] + + @property + @abstractmethod + def _serializable_model_config(self) -> Dict[str, Union[int, float, Dict]]: + """ + Converts non-serializable values from model_config to their serializable reversable equivalent. + Data types like pandas DataFrame, Series or datetime aren't JSON serializable, + so in order to save the model they need to be formatted. + + Returns + ------- + model_config: dict + """ + + def predict_proba( + self, + X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], + extend_idata: bool = True, + combined: bool = False, + **kwargs, + ) -> xr.DataArray: + """Alias for `predict_posterior`, for consistency with scikit-learn probabilistic estimators.""" + return self.predict_posterior(X_pred, extend_idata, combined, **kwargs) + + def predict_posterior( + self, + X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], + extend_idata: bool = True, + combined: bool = True, + **kwargs, + ) -> xr.DataArray: + """ + Generate posterior predictive samples on unseen data. 
+ + Parameters + --------- + X_pred : array-like if sklearn is available, otherwise array, shape (n_pred, n_features) + The input data used for prediction. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to True. + combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists. + Defaults to True. + **kwargs: Additional arguments to pass to pymc.sample_posterior_predictive + + Returns + ------- + y_pred : DataArray, shape (n_pred, chains * draws) if combined is True, otherwise (chains, draws, n_pred) + Posterior predictive samples for each input X_pred + """ + + X_pred = self._validate_data(X_pred) + posterior_predictive_samples = self.sample_posterior_predictive( + X_pred, extend_idata, combined, **kwargs + ) + + if self.output_var not in posterior_predictive_samples: + raise KeyError( + f"Output variable {self.output_var} not found in posterior predictive samples." + ) + + return posterior_predictive_samples[self.output_var] + + @property + def id(self) -> str: + """ + Generate a unique hash value for the model. + + The hash value is created using the last 16 characters of the SHA256 hash encoding, based on the model configuration, + version, and model type. + + Returns + ------- + str + A string of length 16 characters containing a unique hash of the model. + + Examples + -------- + >>> model = MyModel() + >>> model.id + '0123456789abcdef' + """ + hasher = hashlib.sha256() + hasher.update(str(self.model_config.values()).encode()) + hasher.update(self.version.encode()) + hasher.update(self._model_type.encode()) + return hasher.hexdigest()[:16] diff --git a/pyproject.toml b/pyproject.toml index 183a8292d..d78d61327 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,6 @@ dependencies = [ "seaborn>=0.12.2", "xarray", "xarray-einstats>=0.5.1", - "pymc-experimental==0.0.9", ] [project.optional-dependencies] @@ -58,7 +57,7 @@ packages = [ "pymc_marketing", "pymc_marketing.mmm", "pymc_marketing.clv", - "pymc_marketing.clv.models" + "pymc_marketing.clv.models", ] [tool.setuptools.dynamic] diff --git a/tests/mmm/test_base.py b/tests/mmm/test_base.py index e1f18370c..55a643b2a 100644 --- a/tests/mmm/test_base.py +++ b/tests/mmm/test_base.py @@ -74,6 +74,10 @@ def default_model_config(self): def default_sampler_config(self): pass + @property + def output_var(self): + pass + def _data_setter(self, X, y=None): pass diff --git a/tests/model_builder/test_model_builder.py b/tests/model_builder/test_model_builder.py new file mode 100644 index 000000000..34b08c905 --- /dev/null +++ b/tests/model_builder/test_model_builder.py @@ -0,0 +1,246 @@ +# Copyright 2023 The PyMC Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import hashlib
+import json
+import sys
+import tempfile
+from typing import Dict, Optional
+
+import numpy as np
+import pandas as pd
+import pymc as pm
+import pytest
+
+from pymc_marketing.model_builder import ModelBuilder
+
+
+@pytest.fixture(scope="module")
+def toy_X():
+    x = np.linspace(start=0, stop=1, num=100)
+    X = pd.DataFrame({"input": x})
+    return X
+
+
+@pytest.fixture(scope="module")
+def toy_y(toy_X):
+    y = 5 * toy_X["input"] + 3
+    y = y + np.random.normal(0, 1, size=len(toy_X))
+    y = pd.Series(y, name="output")
+    return y
+
+
+@pytest.fixture(scope="module")
+def fitted_model_instance(toy_X, toy_y):
+    sampler_config = {
+        "draws": 100,
+        "tune": 100,
+        "chains": 2,
+        "target_accept": 0.95,
+    }
+    model_config = {
+        "a": {"loc": 0, "scale": 10, "dims": ("numbers",)},
+        "b": {"loc": 0, "scale": 10},
+        "obs_error": 2,
+    }
+    model = test_ModelBuilder(
+        model_config=model_config,
+        sampler_config=sampler_config,
+        test_parameter="test_parameter",
+    )
+    model.fit(toy_X, toy_y)
+    return model
+
+
+class test_ModelBuilder(ModelBuilder):
+    def __init__(self, model_config=None, sampler_config=None, test_parameter=None):
+        self.test_parameter = test_parameter
+        super().__init__(model_config=model_config, sampler_config=sampler_config)
+
+    _model_type = "test_model"
+    version = "0.1"
+
+    def build_model(self, X: pd.DataFrame, y: pd.Series, model_config=None):
+        coords = {"numbers": np.arange(len(X))}
+        self.generate_and_preprocess_model_data(X, y)
+        with pm.Model(coords=coords) as self.model:
+            if model_config is None:
+                model_config = self.default_model_config
+            x = pm.MutableData("x", self.X["input"].values)
+            y_data = pm.MutableData("y_data", self.y)
+
+            # prior parameters
+            a_loc = model_config["a"]["loc"]
+            a_scale = model_config["a"]["scale"]
+            b_loc = model_config["b"]["loc"]
+            b_scale = model_config["b"]["scale"]
+            obs_error = model_config["obs_error"]
+
+            # priors
+            a = pm.Normal("a", a_loc, sigma=a_scale, dims=model_config["a"]["dims"])
+            b = pm.Normal("b", b_loc, sigma=b_scale)
+            obs_error = pm.HalfNormal("σ_model_fmc", obs_error)
+
+            # observed data
+            pm.Normal("output", a + b * x, obs_error, shape=x.shape, observed=y_data)
+
+    def _save_input_params(self, idata):
+        idata.attrs["test_parameter"] = json.dumps(self.test_parameter)
+
+    @property
+    def output_var(self):
+        return "output"
+
+    def _data_setter(self, x: pd.Series, y: Optional[pd.Series] = None):
+        with self.model:
+            pm.set_data({"x": x.values})
+            if y is not None:
+                pm.set_data({"y_data": y.values})
+
+    @property
+    def _serializable_model_config(self):
+        return self.model_config
+
+    def generate_and_preprocess_model_data(self, X: pd.DataFrame, y: pd.Series):
+        self.X = X
+        self.y = y
+
+    @property
+    def default_model_config(self) -> Dict:
+        return {
+            "a": {"loc": 0, "scale": 10, "dims": ("numbers",)},
+            "b": {"loc": 0, "scale": 10},
+            "obs_error": 2,
+        }
+
+    @property
+    def default_sampler_config(self) -> Dict:
+        return {
+            "draws": 1_000,
+            "tune": 1_000,
+            "chains": 3,
+            "target_accept": 0.95,
+        }
+
+
+def test_save_input_params(fitted_model_instance):
+    assert fitted_model_instance.idata.attrs["test_parameter"] == '"test_parameter"'
+
+
+def test_save_load(fitted_model_instance):
+    temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False)
+    fitted_model_instance.save(temp.name)
+    test_builder2 = test_ModelBuilder.load(temp.name)
+    assert fitted_model_instance.idata.groups() == test_builder2.idata.groups()
+    assert fitted_model_instance.id == test_builder2.id
+    x_pred = np.random.uniform(low=0, high=1, size=100)
+    prediction_data = pd.DataFrame({"input": x_pred})
+    pred1 = fitted_model_instance.predict(prediction_data["input"])
+    pred2 = test_builder2.predict(prediction_data["input"])
+    assert pred1.shape == pred2.shape
+    temp.close()
+
+
+def test_initial_build_and_fit(fitted_model_instance, check_idata=True) -> None:
+    if check_idata:
+        assert fitted_model_instance.idata is not None
+        assert "posterior" in fitted_model_instance.idata.groups()
+
+
+def test_save_without_fit_raises_runtime_error():
+    model_builder = test_ModelBuilder()
+    with pytest.raises(RuntimeError):
+        model_builder.save("saved_model")
+
+
+def test_empty_sampler_config_fit(toy_X, toy_y):
+    sampler_config = {}
+    model_builder = test_ModelBuilder(sampler_config=sampler_config)
+    model_builder.idata = model_builder.fit(X=toy_X, y=toy_y)
+    assert model_builder.idata is not None
+    assert "posterior" in model_builder.idata.groups()
+
+
+def test_fit(fitted_model_instance):
+    prediction_data = pd.DataFrame(
+        {"input": np.random.uniform(low=0, high=1, size=100)}
+    )
+    fitted_model_instance.predict(prediction_data["input"])
+    post_pred = fitted_model_instance.sample_posterior_predictive(
+        prediction_data["input"], extend_idata=True, combined=True
+    )
+    assert (
+        post_pred[fitted_model_instance.output_var].shape[0]
+        == prediction_data.input.shape[0]
+    )
+
+
+def test_fit_no_y(toy_X):
+    model_builder = test_ModelBuilder()
+    model_builder.idata = model_builder.fit(X=toy_X)
+    assert model_builder.model is not None
+    assert model_builder.idata is not None
+    assert "posterior" in model_builder.idata.groups()
+
+
+@pytest.mark.skipif(
+    sys.platform == "win32",
+    reason="Permissions for temp files not granted on windows CI.",
+)
+def test_predict(fitted_model_instance):
+    x_pred = np.random.uniform(low=0, high=1, size=100)
+    prediction_data = pd.DataFrame({"input": x_pred})
+    pred = fitted_model_instance.predict(prediction_data["input"])
+    # Point predictions should come back as a non-empty numpy array
+    assert isinstance(pred, np.ndarray)
+    assert len(pred) > 0
+
+
+@pytest.mark.parametrize("combined", [True, False])
+def test_sample_posterior_predictive(fitted_model_instance, combined):
+    n_pred = 100
+    x_pred = np.random.uniform(low=0, high=1, size=n_pred)
+    prediction_data = pd.DataFrame({"input": x_pred})
+    pred = fitted_model_instance.sample_posterior_predictive(
+        prediction_data["input"], combined=combined, extend_idata=True
+    )
+    chains = fitted_model_instance.idata.sample_stats.dims["chain"]
+    draws = fitted_model_instance.idata.sample_stats.dims["draw"]
+    expected_shape = (n_pred, chains * draws) if combined else (chains, draws, n_pred)
+    assert pred[fitted_model_instance.output_var].shape == expected_shape
+    assert np.issubdtype(pred[fitted_model_instance.output_var].dtype, np.floating)
+
+
+def test_model_config_formatting():
+    model_config = {
+        "a": {
+            "loc": [0, 0],
+            "scale": 10,
+            "dims": [
+                "x",
+            ],
+        },
+    }
+    model_builder = test_ModelBuilder()
+    converted_model_config = model_builder._model_config_formatting(model_config)
+    np.testing.assert_equal(converted_model_config["a"]["dims"], ("x",))
+    np.testing.assert_equal(converted_model_config["a"]["loc"], np.array([0, 0]))
+
+
+def test_id():
+    model_builder = test_ModelBuilder()
+    expected_id = hashlib.sha256(
+        str(model_builder.model_config.values()).encode()
+        + model_builder.version.encode()
+        + model_builder._model_type.encode()
+    ).hexdigest()[:16]
+
+    assert model_builder.id == expected_id
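For reference, a usage sketch of the vendored ModelBuilder API, mirroring the test_ModelBuilder subclass defined in the tests above. The file name and reduced sampler settings are illustrative only, not part of this patch; PyMC and its sampling backend are assumed to be installed.

import numpy as np
import pandas as pd

# Toy data matching the shapes used by the test fixtures above
X = pd.DataFrame({"input": np.linspace(0, 1, 100)})
y = pd.Series(5 * X["input"] + 3, name="output")

model = test_ModelBuilder(sampler_config={"draws": 100, "tune": 100, "chains": 2})
idata = model.fit(X, y)           # builds the model, samples, sets attrs on idata
pred = model.predict(X["input"])  # posterior-mean point predictions, shape (100,)

model.save("linear_model.nc")     # requires a "posterior" group in idata
reloaded = test_ModelBuilder.load("linear_model.nc")
assert reloaded.id == model.id    # id hashes model_config, version, and _model_type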