From 73765d4ba00142c5de7a83d4942ba85e152a7a99 Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Thu, 17 Aug 2023 19:09:25 +0200 Subject: [PATCH] Vendor model_builder from pymc-experimental (#339) * Vendor model_builder from pymc-experimental * Remove pymc-experimental dependency * Add `pymc_marketing.vendored` to setuptools packages * Fix setuptools package reference * Exclude vendored from codecov * Fix codecov paths * Move model_builder directly inside pymc-marketing * Remove vendored.pymc_experimental as setuptools package * Start cleaning up mypy errors * fixing mypy errors * implementing abstract methods * Add model_builder tests * Satisfy linter for test_model_builder.py --------- Co-authored-by: Michal Raczycki --- codecov.yml | 2 +- mypy.ini | 3 - pymc_marketing/clv/models/basic.py | 29 +- pymc_marketing/clv/models/beta_geo.py | 2 +- pymc_marketing/clv/models/gamma_gamma.py | 3 +- pymc_marketing/clv/models/pareto_nbd.py | 3 +- pymc_marketing/clv/models/shifted_beta_geo.py | 2 +- pymc_marketing/mmm/base.py | 8 +- pymc_marketing/mmm/delayed_saturated_mmm.py | 2 +- pymc_marketing/model_builder.py | 770 ++++++++++++++++++ pyproject.toml | 3 +- tests/mmm/test_base.py | 4 + tests/model_builder/test_model_builder.py | 246 ++++++ 13 files changed, 1056 insertions(+), 21 deletions(-) create mode 100644 pymc_marketing/model_builder.py create mode 100644 tests/model_builder/test_model_builder.py diff --git a/codecov.yml b/codecov.yml index 2eda28a2d..b16612ff3 100644 --- a/codecov.yml +++ b/codecov.yml @@ -7,7 +7,7 @@ coverage: threshold: 2% base: auto paths: - - "pymc-marketing/" + - "pymc_marketing" # advanced settings branches: - main diff --git a/mypy.ini b/mypy.ini index 7b73073ab..76d940a36 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,6 +17,3 @@ ignore_missing_imports = True [mypy-scipy.*] ignore_missing_imports = True - -[mypy-pymc_experimental.*] -ignore_missing_imports = True diff --git a/pymc_marketing/clv/models/basic.py b/pymc_marketing/clv/models/basic.py index 5b35d1869..9173d6fc4 100644 --- a/pymc_marketing/clv/models/basic.py +++ b/pymc_marketing/clv/models/basic.py @@ -2,17 +2,20 @@ import types import warnings from pathlib import Path -from typing import Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple, Union import arviz as az +import numpy as np +import pandas as pd import pymc as pm from pymc import str_for_dist from pymc.backends import NDArray from pymc.backends.base import MultiTrace -from pymc_experimental.model_builder import ModelBuilder from pytensor.tensor import TensorVariable from xarray import Dataset +from pymc_marketing.model_builder import ModelBuilder + class CLVModel(ModelBuilder): _model_type = "" @@ -27,7 +30,7 @@ def __init__( def __repr__(self): return f"{self._model_type}\n{self.model.str_repr()}" - def fit( + def fit( # type: ignore self, fit_method: str = "mcmc", **kwargs, @@ -44,7 +47,7 @@ def fit( Other keyword arguments passed to the underlying PyMC routines """ - self.build_model() + self.build_model() # type: ignore if fit_method == "mcmc": self._fit_mcmc(**kwargs) @@ -179,7 +182,7 @@ def load(cls, fname: str): ) model.idata = idata - model.build_model() + model.build_model() # type: ignore if model.id != idata.attrs["id"]: raise ValueError( @@ -225,7 +228,7 @@ def default_sampler_config(self) -> Dict: def _serializable_model_config(self) -> Dict: return self.model_config - def sample_prior_predictive( + def sample_prior_predictive( # type: ignore self, samples: int = 1000, extend_idata: bool = True, @@ -285,3 +288,17 @@ def 
fit_summary(self, **kwargs): return res["mean"].rename("value") else: return az.summary(self.fit_result, **kwargs) + + @property + def output_var(self): + pass + + def generate_and_preprocess_model_data( + self, + X: Union[pd.DataFrame, pd.Series], + y: Union[pd.Series, np.ndarray[Any, Any]], + ) -> None: + pass + + def _data_setter(self): + pass diff --git a/pymc_marketing/clv/models/beta_geo.py b/pymc_marketing/clv/models/beta_geo.py index a193ac65f..09920abe7 100644 --- a/pymc_marketing/clv/models/beta_geo.py +++ b/pymc_marketing/clv/models/beta_geo.py @@ -155,7 +155,7 @@ def default_model_config(self) -> Dict[str, Dict]: "r_prior": {"dist": "HalfFlat", "kwargs": {}}, } - def build_model( + def build_model( # type: ignore self, ) -> None: with pm.Model(coords=self.coords) as self.model: diff --git a/pymc_marketing/clv/models/gamma_gamma.py b/pymc_marketing/clv/models/gamma_gamma.py index 6f9f00e3b..b1c535100 100644 --- a/pymc_marketing/clv/models/gamma_gamma.py +++ b/pymc_marketing/clv/models/gamma_gamma.py @@ -71,7 +71,7 @@ def expected_customer_spend( mean_transaction_value, frequency = to_xarray( customer_id, mean_transaction_value, frequency ) - + assert self.idata is not None, "Model must be fitted first" p = self.idata.posterior["p"] q = self.idata.posterior["q"] v = self.idata.posterior["v"] @@ -104,6 +104,7 @@ def distribution_new_customer_spend( def expected_new_customer_spend(self) -> xarray.DataArray: """Expected transaction value for a new customer""" + assert self.idata is not None, "Model must be fitted first" p_mean = self.idata.posterior["p"] q_mean = self.idata.posterior["q"] v_mean = self.idata.posterior["v"] diff --git a/pymc_marketing/clv/models/pareto_nbd.py b/pymc_marketing/clv/models/pareto_nbd.py index b75a3fadd..40c4f9b80 100644 --- a/pymc_marketing/clv/models/pareto_nbd.py +++ b/pymc_marketing/clv/models/pareto_nbd.py @@ -218,7 +218,7 @@ def default_model_config(self) -> Dict[str, Dict]: "beta_prior": {"dist": "Weibull", "kwargs": {"alpha": 2, "beta": 10}}, } - def build_model( + def build_model( # type: ignore self, ) -> None: with pm.Model(coords=self.coords) as self.model: @@ -245,6 +245,7 @@ def _unload_params( self, ) -> Tuple[Any, ...]: """Utility function retrieving posterior parameters for predictive methods""" + assert self.idata is not None, "Model must be fit first." return tuple([self.idata.posterior[param] for param in self._params]) # TODO: Convert to list comprehension to support covariates? 
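Aside on the no-op stubs added to `CLVModel` in basic.py above: they exist because the vendored `ModelBuilder` declares `output_var`, `generate_and_preprocess_model_data`, and `_data_setter` as abstract, and Python refuses to instantiate a subclass until every abstract member has *some* implementation. A minimal sketch of that constraint follows; the `Builder`/`CLVLike` names are illustrative stand-ins, not part of this patch.

from abc import ABC, abstractmethod


class Builder(ABC):
    """Stand-in for the vendored ModelBuilder ABC."""

    @property
    @abstractmethod
    def output_var(self): ...

    @abstractmethod
    def _data_setter(self, X, y=None): ...


class CLVLike(Builder):
    """CLV-style subclass: fitting goes through build_model()/fit() directly,
    so the sklearn-style hooks can be inert placeholders."""

    @property
    def output_var(self):
        pass

    def _data_setter(self, X, y=None):
        pass


CLVLike()  # instantiable only because both abstract members are overridden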
diff --git a/pymc_marketing/clv/models/shifted_beta_geo.py b/pymc_marketing/clv/models/shifted_beta_geo.py index ff5dee52f..e5a548bfb 100644 --- a/pymc_marketing/clv/models/shifted_beta_geo.py +++ b/pymc_marketing/clv/models/shifted_beta_geo.py @@ -126,7 +126,7 @@ def default_model_config(self) -> Dict: "beta_prior": {"dist": "HalfFlat", "kwargs": {}}, } - def build_model( + def build_model( # type: ignore self, ) -> None: with pm.Model(coords=self.coords) as self.model: diff --git a/pymc_marketing/mmm/base.py b/pymc_marketing/mmm/base.py index d0583f8c2..49f5f3268 100644 --- a/pymc_marketing/mmm/base.py +++ b/pymc_marketing/mmm/base.py @@ -14,7 +14,6 @@ import pandas as pd import pymc as pm import seaborn as sns -from pymc_experimental.model_builder import ModelBuilder from sklearn.pipeline import Pipeline from sklearn.preprocessing import FunctionTransformer from xarray import DataArray, Dataset @@ -24,6 +23,7 @@ ValidateDateColumn, ValidateTargetColumn, ) +from pymc_marketing.model_builder import ModelBuilder __all__ = ("BaseMMM", "MMM") @@ -271,7 +271,7 @@ def plot_prior_predictive( ax.plot( np.asarray(self.X[self.date_column]), - np.asarray(self.preprocessed_data["y"]), + np.asarray(self.preprocessed_data["y"]), # type: ignore color="black", ) ax.set( @@ -323,7 +323,7 @@ def plot_posterior_predictive( ) target_to_plot: np.ndarray = np.asarray( - self.y if original_scale else self.preprocessed_data["y"] + self.y if original_scale else self.preprocessed_data["y"] # type: ignore ) ax.plot( np.asarray(self.X[self.date_column]), @@ -423,7 +423,7 @@ def plot_components_contributions(self, **plt_kwargs: Any) -> plt.Figure: ) ax.plot( np.asarray(self.X[self.date_column]), - np.asarray(self.preprocessed_data["y"]), + np.asarray(self.preprocessed_data["y"]), # type: ignore color="black", ) ax.legend(title="components", loc="center left", bbox_to_anchor=(1, 0.5)) diff --git a/pymc_marketing/mmm/delayed_saturated_mmm.py b/pymc_marketing/mmm/delayed_saturated_mmm.py index 7333d6174..4c476a28a 100644 --- a/pymc_marketing/mmm/delayed_saturated_mmm.py +++ b/pymc_marketing/mmm/delayed_saturated_mmm.py @@ -83,7 +83,7 @@ def default_sampler_config(self) -> Dict: def output_var(self): return "y" - def generate_and_preprocess_model_data( + def generate_and_preprocess_model_data( # type: ignore self, X: Union[pd.DataFrame, pd.Series], y: pd.Series ) -> None: """ diff --git a/pymc_marketing/model_builder.py b/pymc_marketing/model_builder.py new file mode 100644 index 000000000..1e3654b06 --- /dev/null +++ b/pymc_marketing/model_builder.py @@ -0,0 +1,770 @@ +# Copyright 2023 The PyMC Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import hashlib +import json +import warnings +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import arviz as az +import numpy as np +import pandas as pd +import pymc as pm +import xarray as xr +from pymc.util import RandomState + +# If scikit-learn is available, use its data validator +try: + from sklearn.utils.validation import check_array, check_X_y +# If scikit-learn is not available, return the data unchanged +except ImportError: + + def check_X_y(X, y, **kwargs): + return X, y + + def check_array(X, **kwargs): + return X + + +class ModelBuilder(ABC): + """ + ModelBuilder can be used to provide an easy-to-use API (similar to scikit-learn) for models + and help with deployment. + """ + + _model_type = "BaseClass" + version = "None" + + X: Optional[pd.DataFrame] = None + y: Optional[pd.Series] = None + + def __init__( + self, + model_config: Optional[Dict] = None, + sampler_config: Optional[Dict] = None, + ): + """ + Initializes model configuration and sampler configuration for the model + + Parameters + ---------- + data : Dictionary, optional + It is the data we need to train the model on. + model_config : Dictionary, optional + dictionary of parameters that initialise model configuration. Class-default defined by the user default_model_config method. + sampler_config : Dictionary, optional + dictionary of parameters that initialise sampler configuration. Class-default defined by the user default_sampler_config method. + Examples + -------- + >>> class MyModel(ModelBuilder): + >>> ... + >>> model = MyModel(model_config, sampler_config) + """ + if sampler_config is None: + sampler_config = self.default_sampler_config + if model_config is None: + model_config = self.default_model_config + self.sampler_config = sampler_config + self.model_config = model_config # parameters for priors etc. + self.model: Optional[pm.Model] = None # Set by build_model + self.idata: Optional[ + az.InferenceData + ] = None # idata is generated during fitting + self.is_fitted_ = False + + def _validate_data(self, X, y=None): + if y is not None: + return check_X_y( + X, y, accept_sparse=False, y_numeric=True, multi_output=False + ) + else: + return check_array(X, accept_sparse=False) + + @abstractmethod + def _data_setter( + self, + X: Union[np.ndarray, pd.DataFrame], + y: Optional[Union[np.ndarray, pd.Series]] = None, + ) -> None: + """ + Sets new data in the model. + + Parameters + ---------- + X : array, shape (n_obs, n_features) + The training input samples. + y : array, shape (n_obs,) + The target values (real numbers). + + Returns: + ---------- + None + + Examples + -------- + >>> def _data_setter(self, data : pd.DataFrame): + >>> with self.model: + >>> pm.set_data({'x': X['x'].values}) + >>> try: # if y values in new data + >>> pm.set_data({'y_data': y.values}) + >>> except: # dummies otherwise + >>> pm.set_data({'y_data': np.zeros(len(data))}) + + """ + + raise NotImplementedError + + @property + @abstractmethod + def output_var(self): + """ + Returns the name of the output variable of the model. + + Returns + ------- + output_var : str + Name of the output variable of the model. 
+ """ + raise NotImplementedError + + @property + @abstractmethod + def default_model_config(self) -> Dict: + """ + Returns a class default config dict for model builder if no model_config is provided on class initialization + Useful for understanding structure of required model_config to allow its customization by users + Examples + -------- + >>> @classmethod + >>> def default_model_config(self): + >>> Return { + >>> 'a' : { + >>> 'loc': 7, + >>> 'scale' : 3 + >>> }, + >>> 'b' : { + >>> 'loc': 3, + >>> 'scale': 5 + >>> } + >>> 'obs_error': 2 + >>> } + + Returns + ------- + model_config : dict + A set of default parameters for predictor distributions that allow to save and recreate the model. + """ + raise NotImplementedError + + @property + @abstractmethod + def default_sampler_config(self) -> Dict: + """ + Returns a class default sampler dict for model builder if no sampler_config is provided on class initialization + Useful for understanding structure of required sampler_config to allow its customization by users + Examples + -------- + >>> @classmethod + >>> def default_sampler_config(self): + >>> Return { + >>> 'draws': 1_000, + >>> 'tune': 1_000, + >>> 'chains': 1, + >>> 'target_accept': 0.95, + >>> } + + Returns + ------- + sampler_config : dict + A set of default settings for used by model in fit process. + """ + raise NotImplementedError + + @abstractmethod + def generate_and_preprocess_model_data( + self, + X: Union[pd.DataFrame, pd.Series], + y: Union[pd.Series, np.ndarray[Any, Any]], + ) -> None: + """ + Applies preprocessing to the data before fitting the model. + if validate is True, it will check if the data is valid for the model. + sets self.model_coords based on provided dataset + + Parameters: + X : array, shape (n_obs, n_features) + y : array, shape (n_obs,) + + Examples + -------- + >>> @classmethod + >>> def generate_and_preprocess_model_data(self, X, y): + >>> x = np.linspace(start=1, stop=50, num=100) + >>> y = 5 * x + 3 + np.random.normal(0, 1, len(x)) * np.random.rand(100)*10 + np.random.rand(100)*6.4 + >>> X = pd.DataFrame(x, columns=['x']) + >>> y = pd.Series(y, name='y') + >>> self.X = X + >>> self.y = y + + Returns + ------- + None + + """ + raise NotImplementedError + + @abstractmethod + def build_model( + self, + X: pd.DataFrame, + y: pd.Series, + **kwargs, + ) -> None: + """ + Creates an instance of pm.Model based on provided data and model_config, and + attaches it to self. + + Parameters + ---------- + X : pd.DataFrame + The input data that is going to be used in the model. This should be a DataFrame + containing the features (predictors) for the model. For efficiency reasons, it should + only contain the necessary data columns, not the entire available dataset, as this + will be encoded into the data used to recreate the model. + + y : pd.Series + The target data for the model. This should be a Series representing the output + or dependent variable for the model. + + kwargs : dict + Additional keyword arguments that may be used for model configuration. + + See Also + -------- + default_model_config : returns default model config + + Returns + ------- + None + + Raises + ------ + NotImplementedError + This is an abstract method and must be implemented in a subclass. + """ + raise NotImplementedError + + def sample_model(self, **kwargs): + """ + Sample from the PyMC model. + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments to pass to the PyMC sampler. + + Returns + ------- + xarray.Dataset + The PyMC samples dataset. 
+
+        Raises
+        ------
+        RuntimeError
+            If the PyMC model hasn't been built yet.
+
+        Examples
+        --------
+        >>> self.build_model()
+        >>> idata = self.sample_model(draws=100, tune=10)
+        >>> assert isinstance(idata, az.InferenceData)
+        >>> assert "posterior" in idata
+        >>> assert "prior" in idata
+        >>> assert "observed_data" in idata
+        >>> assert "log_likelihood" in idata
+        """
+        if self.model is None:
+            raise RuntimeError(
+                "The model hasn't been built yet, call .build_model() first or call .fit() instead."
+            )
+
+        with self.model:
+            sampler_args = {**self.sampler_config, **kwargs}
+            idata = pm.sample(**sampler_args)
+            idata.extend(pm.sample_prior_predictive())
+            idata.extend(pm.sample_posterior_predictive(idata))
+
+        idata = self.set_idata_attrs(idata)
+        return idata
+
+    def set_idata_attrs(self, idata=None):
+        """
+        Set attributes on an InferenceData object.
+
+        Parameters
+        ----------
+        idata : arviz.InferenceData, optional
+            The InferenceData object to set attributes on. Defaults to self.idata.
+
+        Raises
+        ------
+        RuntimeError
+            If no InferenceData object is provided and the model has none attached.
+
+        Returns
+        -------
+        idata : arviz.InferenceData
+            The provided InferenceData, with attributes set.
+
+        Examples
+        --------
+        >>> model = MyModel()
+        >>> idata = az.InferenceData(your_dataset)
+        >>> model.set_idata_attrs(idata=idata)
+        >>> assert "id" in idata.attrs
+        >>> assert "model_type" in idata.attrs
+        >>> assert "version" in idata.attrs
+        >>> assert "sampler_config" in idata.attrs
+        >>> assert "model_config" in idata.attrs
+        """
+        if idata is None:
+            idata = self.idata
+        if idata is None:
+            raise RuntimeError("No idata provided to set attrs on.")
+        idata.attrs["id"] = self.id
+        idata.attrs["model_type"] = self._model_type
+        idata.attrs["version"] = self.version
+        idata.attrs["sampler_config"] = json.dumps(self.sampler_config)
+        idata.attrs["model_config"] = json.dumps(self._serializable_model_config)
+        # Only classes with non-dataset parameters will implement save_input_params
+        if hasattr(self, "_save_input_params"):
+            self._save_input_params(idata)
+        return idata
+
+    def save(self, fname: str) -> None:
+        """
+        Save the model's inference data to a file.
+
+        Parameters
+        ----------
+        fname : str
+            The name and path of the file to save the inference data with model parameters.
+
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        RuntimeError
+            If the model hasn't been fit yet (no inference data available).
+
+        Examples
+        --------
+        >>> class MyModel(ModelBuilder):
+        >>>     def __init__(self):
+        >>>         super().__init__()
+        >>> model = MyModel()
+        >>> model.fit(data)
+        >>> model.save('model_results.nc')
+        """
+        if self.idata is not None and "posterior" in self.idata:
+            file = Path(str(fname))
+            self.idata.to_netcdf(str(file))
+        else:
+            raise RuntimeError("The model hasn't been fit yet, call .fit() first")
+
+    @classmethod
+    def _model_config_formatting(cls, model_config: Dict) -> Dict:
+        """
+        Because of JSON serialization, model_config values that were originally tuples or numpy arrays are encoded as lists.
+        This function converts them back to tuples and numpy arrays to ensure correct id encoding.
+ """ + for key in model_config: + if isinstance(model_config[key], dict): + for sub_key in model_config[key]: + if isinstance(model_config[key][sub_key], list): + # Check if "dims" key to convert it to tuple + if sub_key == "dims": + model_config[key][sub_key] = tuple( + model_config[key][sub_key] + ) + # Convert all other lists to numpy arrays + else: + model_config[key][sub_key] = np.array( + model_config[key][sub_key] + ) + return model_config + + @classmethod + def load(cls, fname: str): + """ + Creates a ModelBuilder instance from a file, + Loads inference data for the model. + + Parameters + ---------- + fname : string + This denotes the name with path from where idata should be loaded from. + + Returns + ------- + Returns an instance of ModelBuilder. + + Raises + ------ + ValueError + If the inference data that is loaded doesn't match with the model. + Examples + -------- + >>> class MyModel(ModelBuilder): + >>> ... + >>> name = './mymodel.nc' + >>> imported_model = MyModel.load(name) + """ + filepath = Path(str(fname)) + idata = az.from_netcdf(filepath) + # needs to be converted, because json.loads was changing tuple to list + model_config = cls._model_config_formatting( + json.loads(idata.attrs["model_config"]) + ) + model = cls( + model_config=model_config, + sampler_config=json.loads(idata.attrs["sampler_config"]), + ) + model.idata = idata + dataset = idata.fit_data.to_dataframe() + X = dataset.drop(columns=[model.output_var]) + y = dataset[model.output_var] + model.build_model(X, y) + # All previously used data is in idata. + + if model.id != idata.attrs["id"]: + raise ValueError( + f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'" + ) + + return model + + def fit( + self, + X: pd.DataFrame, + y: Optional[Union[pd.Series, np.ndarray]] = None, + progressbar: bool = True, + predictor_names: Optional[List[str]] = None, + random_seed: RandomState = None, + **kwargs: Any, + ) -> az.InferenceData: + """ + Fit a model using the data passed as a parameter. + Sets attrs to inference data of the model. + + + Parameters + ---------- + X : array-like if sklearn is available, otherwise array, shape (n_obs, n_features) + The training input samples. + y : array-like if sklearn is available, otherwise array, shape (n_obs,) + The target values (real numbers). + progressbar : bool + Specifies whether the fit progressbar should be displayed + predictor_names: List[str] = None, + Allows for custom naming of predictors given in a form of 2dArray + allows for naming of predictors when given in a form of np.ndarray, if not provided the predictors will be named like predictor1, predictor2... + random_seed : RandomState + Provides sampler with initial random seed for obtaining reproducible samples + **kwargs : Any + Custom sampler settings can be provided in form of keyword arguments. + + Returns + ------- + self : az.InferenceData + returns inference data of the fitted model. + Examples + -------- + >>> model = MyModel() + >>> idata = model.fit(data) + Auto-assigning NUTS sampler... + Initializing NUTS using jitter+adapt_diag... 
+ """ + if predictor_names is None: + predictor_names = [] + if y is None: + y = np.zeros(X.shape[0]) + y_df = pd.DataFrame({self.output_var: y}) + self.generate_and_preprocess_model_data(X, y_df.values.flatten()) + if self.X is None or self.y is None: + raise ValueError("X and y must be set before calling build_model!") + self.build_model(self.X, self.y) + + sampler_config = self.sampler_config.copy() + sampler_config["progressbar"] = progressbar + sampler_config["random_seed"] = random_seed + sampler_config.update(**kwargs) + self.idata = self.sample_model(**sampler_config) + + X_df = pd.DataFrame(X, columns=X.columns) + combined_data = pd.concat([X_df, y_df], axis=1) + assert all(combined_data.columns), "All columns must have non-empty names" + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=UserWarning, + message="The group fit_data is not defined in the InferenceData scheme", + ) + self.idata.add_groups(fit_data=combined_data.to_xarray()) # type: ignore + + return self.idata # type: ignore + + def predict( + self, + X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], + extend_idata: bool = True, + **kwargs, + ) -> np.ndarray: + """ + Uses model to predict on unseen data and return point prediction of all the samples. The point prediction + for each input row is the expected output value, computed as the mean of MCMC samples. + + Parameters + --------- + X_pred : array-like if sklearn is available, otherwise array, shape (n_pred, n_features) + The input data used for prediction. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to True. + **kwargs: Additional arguments to pass to pymc.sample_posterior_predictive + + Returns + ------- + y_pred : ndarray, shape (n_pred,) + Predicted output corresponding to input X_pred. + + Examples + -------- + >>> model = MyModel() + >>> idata = model.fit(data) + >>> x_pred = [] + >>> prediction_data = pd.DataFrame({'input':x_pred}) + >>> pred_mean = model.predict(prediction_data) + """ + + posterior_predictive_samples = self.sample_posterior_predictive( + X_pred, extend_idata, combined=False, **kwargs + ) + + if self.output_var not in posterior_predictive_samples: + raise KeyError( + f"Output variable {self.output_var} not found in posterior predictive samples." + ) + + posterior_means = posterior_predictive_samples[self.output_var].mean( + dim=["chain", "draw"], keep_attrs=True + ) + return posterior_means.data + + def sample_prior_predictive( + self, + X_pred, + y_pred=None, + samples: Optional[int] = None, + extend_idata: bool = False, + combined: bool = True, + **kwargs, + ): + """ + Sample from the model's prior predictive distribution. + + Parameters + --------- + X_pred : array, shape (n_pred, n_features) + The input data used for prediction using prior distribution. + samples : int + Number of samples from the prior parameter distributions to generate. + If not set, uses sampler_config['draws'] if that is available, otherwise defaults to 500. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to False. + combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists. + Defaults to True. 
+ **kwargs: Additional arguments to pass to pymc.sample_prior_predictive + + Returns + ------- + prior_predictive_samples : DataArray, shape (n_pred, samples) + Prior predictive samples for each input X_pred + """ + if y_pred is None: + y_pred = np.zeros(len(X_pred)) + if samples is None: + samples = self.sampler_config.get("draws", 500) + + if self.model is None: + self.build_model(X_pred, y_pred) + + self._data_setter(X_pred, y_pred) + if self.model is not None: + with self.model: # sample with new input data + prior_pred: az.InferenceData = pm.sample_prior_predictive( + samples, **kwargs + ) + self.set_idata_attrs(prior_pred) + if extend_idata: + if self.idata is not None: + self.idata.extend(prior_pred) + else: + self.idata = prior_pred + + prior_predictive_samples = az.extract( + prior_pred, "prior_predictive", combined=combined + ) + + return prior_predictive_samples + + def sample_posterior_predictive(self, X_pred, extend_idata, combined, **kwargs): + """ + Sample from the model's posterior predictive distribution. + + Parameters + --------- + X_pred : array, shape (n_pred, n_features) + The input data used for prediction using prior distribution.. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to False. + combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists. + Defaults to True. + **kwargs: Additional arguments to pass to pymc.sample_posterior_predictive + + Returns + ------- + posterior_predictive_samples : DataArray, shape (n_pred, samples) + Posterior predictive samples for each input X_pred + """ + self._data_setter(X_pred) + + with self.model: # sample with new input data + post_pred = pm.sample_posterior_predictive(self.idata, **kwargs) + if extend_idata: + self.idata.extend(post_pred) + + posterior_predictive_samples = az.extract( + post_pred, "posterior_predictive", combined=combined + ) + + return posterior_predictive_samples + + def get_params(self, deep=True): + """ + Get all the model parameters needed to instantiate a copy of the model, not including training data. + """ + return { + "model_config": self.model_config, + "sampler_config": self.sampler_config, + } + + def set_params(self, **params): + """ + Set all the model parameters needed to instantiate the model, not including training data. + """ + self.model_config = params["model_config"] + self.sampler_config = params["sampler_config"] + + @property + @abstractmethod + def _serializable_model_config(self) -> Dict[str, Union[int, float, Dict]]: + """ + Converts non-serializable values from model_config to their serializable reversable equivalent. + Data types like pandas DataFrame, Series or datetime aren't JSON serializable, + so in order to save the model they need to be formatted. + + Returns + ------- + model_config: dict + """ + + def predict_proba( + self, + X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], + extend_idata: bool = True, + combined: bool = False, + **kwargs, + ) -> xr.DataArray: + """Alias for `predict_posterior`, for consistency with scikit-learn probabilistic estimators.""" + return self.predict_posterior(X_pred, extend_idata, combined, **kwargs) + + def predict_posterior( + self, + X_pred: Union[np.ndarray, pd.DataFrame, pd.Series], + extend_idata: bool = True, + combined: bool = True, + **kwargs, + ) -> xr.DataArray: + """ + Generate posterior predictive samples on unseen data. 
+ + Parameters + --------- + X_pred : array-like if sklearn is available, otherwise array, shape (n_pred, n_features) + The input data used for prediction. + extend_idata : Boolean determining whether the predictions should be added to inference data object. + Defaults to True. + combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists. + Defaults to True. + **kwargs: Additional arguments to pass to pymc.sample_posterior_predictive + + Returns + ------- + y_pred : DataArray, shape (n_pred, chains * draws) if combined is True, otherwise (chains, draws, n_pred) + Posterior predictive samples for each input X_pred + """ + + X_pred = self._validate_data(X_pred) + posterior_predictive_samples = self.sample_posterior_predictive( + X_pred, extend_idata, combined, **kwargs + ) + + if self.output_var not in posterior_predictive_samples: + raise KeyError( + f"Output variable {self.output_var} not found in posterior predictive samples." + ) + + return posterior_predictive_samples[self.output_var] + + @property + def id(self) -> str: + """ + Generate a unique hash value for the model. + + The hash value is created using the last 16 characters of the SHA256 hash encoding, based on the model configuration, + version, and model type. + + Returns + ------- + str + A string of length 16 characters containing a unique hash of the model. + + Examples + -------- + >>> model = MyModel() + >>> model.id + '0123456789abcdef' + """ + hasher = hashlib.sha256() + hasher.update(str(self.model_config.values()).encode()) + hasher.update(self.version.encode()) + hasher.update(self._model_type.encode()) + return hasher.hexdigest()[:16] diff --git a/pyproject.toml b/pyproject.toml index 183a8292d..d78d61327 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,6 @@ dependencies = [ "seaborn>=0.12.2", "xarray", "xarray-einstats>=0.5.1", - "pymc-experimental==0.0.9", ] [project.optional-dependencies] @@ -58,7 +57,7 @@ packages = [ "pymc_marketing", "pymc_marketing.mmm", "pymc_marketing.clv", - "pymc_marketing.clv.models" + "pymc_marketing.clv.models", ] [tool.setuptools.dynamic] diff --git a/tests/mmm/test_base.py b/tests/mmm/test_base.py index e1f18370c..55a643b2a 100644 --- a/tests/mmm/test_base.py +++ b/tests/mmm/test_base.py @@ -74,6 +74,10 @@ def default_model_config(self): def default_sampler_config(self): pass + @property + def output_var(self): + pass + def _data_setter(self, X, y=None): pass diff --git a/tests/model_builder/test_model_builder.py b/tests/model_builder/test_model_builder.py new file mode 100644 index 000000000..34b08c905 --- /dev/null +++ b/tests/model_builder/test_model_builder.py @@ -0,0 +1,246 @@ +# Copyright 2023 The PyMC Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import hashlib
+import json
+import sys
+import tempfile
+from typing import Dict, Optional
+
+import numpy as np
+import pandas as pd
+import pymc as pm
+import pytest
+
+from pymc_marketing.model_builder import ModelBuilder
+
+
+@pytest.fixture(scope="module")
+def toy_X():
+    x = np.linspace(start=0, stop=1, num=100)
+    X = pd.DataFrame({"input": x})
+    return X
+
+
+@pytest.fixture(scope="module")
+def toy_y(toy_X):
+    y = 5 * toy_X["input"] + 3
+    y = y + np.random.normal(0, 1, size=len(toy_X))
+    y = pd.Series(y, name="output")
+    return y
+
+
+@pytest.fixture(scope="module")
+def fitted_model_instance(toy_X, toy_y):
+    sampler_config = {
+        "draws": 100,
+        "tune": 100,
+        "chains": 2,
+        "target_accept": 0.95,
+    }
+    model_config = {
+        "a": {"loc": 0, "scale": 10, "dims": ("numbers",)},
+        "b": {"loc": 0, "scale": 10},
+        "obs_error": 2,
+    }
+    model = test_ModelBuilder(
+        model_config=model_config,
+        sampler_config=sampler_config,
+        test_parameter="test_parameter",
+    )
+    model.fit(toy_X, toy_y)
+    return model
+
+
+class test_ModelBuilder(ModelBuilder):
+    def __init__(self, model_config=None, sampler_config=None, test_parameter=None):
+        self.test_parameter = test_parameter
+        super().__init__(model_config=model_config, sampler_config=sampler_config)
+
+    _model_type = "test_model"
+    version = "0.1"
+
+    def build_model(self, X: pd.DataFrame, y: pd.Series, model_config=None):
+        coords = {"numbers": np.arange(len(X))}
+        self.generate_and_preprocess_model_data(X, y)
+        with pm.Model(coords=coords) as self.model:
+            if model_config is None:
+                model_config = self.default_model_config
+            x = pm.MutableData("x", self.X["input"].values)
+            y_data = pm.MutableData("y_data", self.y)
+
+            # prior parameters
+            a_loc = model_config["a"]["loc"]
+            a_scale = model_config["a"]["scale"]
+            b_loc = model_config["b"]["loc"]
+            b_scale = model_config["b"]["scale"]
+            obs_error = model_config["obs_error"]
+
+            # priors
+            a = pm.Normal("a", a_loc, sigma=a_scale, dims=model_config["a"]["dims"])
+            b = pm.Normal("b", b_loc, sigma=b_scale)
+            obs_error = pm.HalfNormal("σ_model_fmc", obs_error)
+
+            # observed data
+            pm.Normal("output", a + b * x, obs_error, shape=x.shape, observed=y_data)
+
+    def _save_input_params(self, idata):
+        idata.attrs["test_parameter"] = json.dumps(self.test_parameter)
+
+    @property
+    def output_var(self):
+        return "output"
+
+    def _data_setter(self, x: pd.Series, y: Optional[pd.Series] = None):
+        with self.model:
+            pm.set_data({"x": x.values})
+            if y is not None:
+                pm.set_data({"y_data": y.values})
+
+    @property
+    def _serializable_model_config(self):
+        return self.model_config
+
+    def generate_and_preprocess_model_data(self, X: pd.DataFrame, y: pd.Series):
+        self.X = X
+        self.y = y
+
+    @property
+    def default_model_config(self) -> Dict:
+        return {
+            "a": {"loc": 0, "scale": 10, "dims": ("numbers",)},
+            "b": {"loc": 0, "scale": 10},
+            "obs_error": 2,
+        }
+
+    @property
+    def default_sampler_config(self) -> Dict:
+        return {
+            "draws": 1_000,
+            "tune": 1_000,
+            "chains": 3,
+            "target_accept": 0.95,
+        }
+
+
+def test_save_input_params(fitted_model_instance):
+    assert fitted_model_instance.idata.attrs["test_parameter"] == '"test_parameter"'
+
+
+def test_save_load(fitted_model_instance):
+    temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False)
+    fitted_model_instance.save(temp.name)
+    test_builder2 = test_ModelBuilder.load(temp.name)
+    assert fitted_model_instance.idata.groups() == test_builder2.idata.groups()
+    assert fitted_model_instance.id == test_builder2.id
+    x_pred = np.random.uniform(low=0, high=1, size=100)
+    prediction_data = pd.DataFrame({"input": x_pred})
+    pred1 = fitted_model_instance.predict(prediction_data["input"])
+    pred2 = test_builder2.predict(prediction_data["input"])
+    assert pred1.shape == pred2.shape
+    temp.close()
+
+
+def test_initial_build_and_fit(fitted_model_instance, check_idata=True) -> None:
+    if check_idata:
+        assert fitted_model_instance.idata is not None
+        assert "posterior" in fitted_model_instance.idata.groups()
+
+
+def test_save_without_fit_raises_runtime_error():
+    model_builder = test_ModelBuilder()
+    with pytest.raises(RuntimeError):
+        model_builder.save("saved_model")
+
+
+def test_empty_sampler_config_fit(toy_X, toy_y):
+    sampler_config = {}
+    model_builder = test_ModelBuilder(sampler_config=sampler_config)
+    model_builder.idata = model_builder.fit(X=toy_X, y=toy_y)
+    assert model_builder.idata is not None
+    assert "posterior" in model_builder.idata.groups()
+
+
+def test_fit(fitted_model_instance):
+    prediction_data = pd.DataFrame(
+        {"input": np.random.uniform(low=0, high=1, size=100)}
+    )
+    fitted_model_instance.predict(prediction_data["input"])
+    post_pred = fitted_model_instance.sample_posterior_predictive(
+        prediction_data["input"], extend_idata=True, combined=True
+    )
+    assert (
+        post_pred[fitted_model_instance.output_var].shape[0]
+        == prediction_data.input.shape[0]
+    )
+
+
+def test_fit_no_y(toy_X):
+    model_builder = test_ModelBuilder()
+    model_builder.idata = model_builder.fit(X=toy_X)
+    assert model_builder.model is not None
+    assert model_builder.idata is not None
+    assert "posterior" in model_builder.idata.groups()
+
+
+@pytest.mark.skipif(
+    sys.platform == "win32",
+    reason="Permissions for temp files not granted on windows CI.",
+)
+def test_predict(fitted_model_instance):
+    x_pred = np.random.uniform(low=0, high=1, size=100)
+    prediction_data = pd.DataFrame({"input": x_pred})
+    pred = fitted_model_instance.predict(prediction_data["input"])
+    # Point predictions should come back as a non-empty numpy array
+    assert isinstance(pred, np.ndarray)
+    assert len(pred) > 0
+
+
+@pytest.mark.parametrize("combined", [True, False])
+def test_sample_posterior_predictive(fitted_model_instance, combined):
+    n_pred = 100
+    x_pred = np.random.uniform(low=0, high=1, size=n_pred)
+    prediction_data = pd.DataFrame({"input": x_pred})
+    pred = fitted_model_instance.sample_posterior_predictive(
+        prediction_data["input"], combined=combined, extend_idata=True
+    )
+    chains = fitted_model_instance.idata.sample_stats.dims["chain"]
+    draws = fitted_model_instance.idata.sample_stats.dims["draw"]
+    expected_shape = (n_pred, chains * draws) if combined else (chains, draws, n_pred)
+    assert pred[fitted_model_instance.output_var].shape == expected_shape
+    assert np.issubdtype(pred[fitted_model_instance.output_var].dtype, np.floating)
+
+
+def test_model_config_formatting():
+    model_config = {
+        "a": {
+            "loc": [0, 0],
+            "scale": 10,
+            "dims": [
+                "x",
+            ],
+        },
+    }
+    model_builder = test_ModelBuilder()
+    converted_model_config = model_builder._model_config_formatting(model_config)
+    np.testing.assert_equal(converted_model_config["a"]["dims"], ("x",))
+    np.testing.assert_equal(converted_model_config["a"]["loc"], np.array([0, 0]))
+
+
+def test_id():
+    model_builder = test_ModelBuilder()
+    expected_id = hashlib.sha256(
+        str(model_builder.model_config.values()).encode()
+        + model_builder.version.encode()
+        + model_builder._model_type.encode()
+    ).hexdigest()[:16]
+
+    assert model_builder.id == expected_id
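For reference, a usage sketch of the vendored ModelBuilder API, mirroring the test_ModelBuilder subclass defined in the tests above. The file name and reduced sampler settings are illustrative only, not part of this patch; PyMC and its sampling backend are assumed to be installed.

import numpy as np
import pandas as pd

# Toy data matching the shapes used by the test fixtures above
X = pd.DataFrame({"input": np.linspace(0, 1, 100)})
y = pd.Series(5 * X["input"] + 3, name="output")

model = test_ModelBuilder(sampler_config={"draws": 100, "tune": 100, "chains": 2})
idata = model.fit(X, y)           # builds the model, samples, sets attrs on idata
pred = model.predict(X["input"])  # posterior-mean point predictions, shape (100,)

model.save("linear_model.nc")     # requires a "posterior" group in idata
reloaded = test_ModelBuilder.load("linear_model.nc")
assert reloaded.id == model.id    # id hashes model_config, version, and _model_type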