From 6255ada67cd37c501bc40bc8dd2940d95be1fd70 Mon Sep 17 00:00:00 2001
From: Thomas S <thomas@probabl.ai>
Date: Mon, 13 Jan 2025 17:06:18 +0100
Subject: [PATCH] CrossValidationReporterItem's factory now pickles the
 reporter

---
 .../item/cross_validation_reporter_item.py    | 370 +++++++-----------
 .../sklearn/test_cross_validate.py            |   4 +-
 skore/tests/integration/ui/test_ui.py         |   5 +-
 ...=> test_cross_validation_reporter_item.py} |  58 ++-
 4 files changed, 173 insertions(+), 264 deletions(-)
 rename skore/tests/unit/item/{test_cross_validation_item.py => test_cross_validation_reporter_item.py} (80%)

diff --git a/skore/src/skore/persistence/item/cross_validation_reporter_item.py b/skore/src/skore/persistence/item/cross_validation_reporter_item.py
index 23c81dab9..801cd39b7 100644
--- a/skore/src/skore/persistence/item/cross_validation_reporter_item.py
+++ b/skore/src/skore/persistence/item/cross_validation_reporter_item.py
@@ -1,20 +1,21 @@
-"""CrossValidationItem class.
+"""CrossValidationReporterItem.
 
-This class represents the output of a cross-validation workflow.
+This module defines the CrossValidationReporterItem class, which is used to persist
+reporters of cross-validation.
 """
 
 from __future__ import annotations
 
 import contextlib
-import copy
 import dataclasses
 import hashlib
 import importlib
 import json
+import pickle
 import re
 import statistics
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Literal, TypedDict, Union
+from typing import TYPE_CHECKING, Literal, Optional, TypedDict
 
 import numpy
 import plotly.graph_objects
@@ -27,8 +28,6 @@
 if TYPE_CHECKING:
     import sklearn.base
 
-    CVSplitter = Any
-
     class EstimatorParamInfo(TypedDict):
         """Information about an estimator parameter."""
 
@@ -43,6 +42,12 @@ class EstimatorInfo(TypedDict):
         params: dict[str, EstimatorParamInfo]
 
 
+HUMANIZED_PLOT_NAMES = {
+    "scores": "Scores",
+    "timing": "Timings",
+}
+
+
 def _hash_numpy(arr: numpy.ndarray) -> str:
     """Compute a hash string from a numpy array.
 
@@ -117,86 +122,123 @@ def _params_to_str(estimator_info) -> str:
     return "\n".join(params_list)
 
 
-# Data used for training, passed as input to scikit-learn
-Data = Any
-# Target used for training, passed as input to scikit-learn
-Target = Any
+@staticmethod
+def _estimator_info(estimator: sklearn.base.BaseEstimator) -> EstimatorInfo:
+    estimator_params = (
+        estimator.get_params() if hasattr(estimator, "get_params") else {}
+    )
+
+    name = estimator.__class__.__name__
+    module = estimator.__module__
+
+    # Figure out the default parameters of the estimator,
+    # so that we can highlight the non-default ones in the UI
+
+    # This is done by instantiating the class with no arguments and
+    # computing the diff between the default and ours
+    try:
+        estimator_module = importlib.import_module(module)
+        EstimatorClass = getattr(estimator_module, name)
+        default_estimator_params = EstimatorClass().get_params()
+    except Exception:
+        default_estimator_params = {}
+
+    final_estimator_params: dict[str, EstimatorParamInfo] = {}
+    for k, v in estimator_params.items():
+        param_is_default: bool = (
+            k in default_estimator_params and default_estimator_params[k] == v
+        )
+        final_estimator_params[str(k)] = {
+            "value": repr(v),
+            "default": param_is_default,
+        }
 
+    return {
+        "name": name,
+        "module": module,
+        "params": final_estimator_params,
+    }
 
-class CrossValidationItem(Item):
-    """
-    A class to represent the output of a cross-validation workflow.
 
-    This class encapsulates the output of the
-    :func:`sklearn.model_selection.cross_validate` function along with its creation and
-    update timestamps.
-    """
+class CrossValidationReporterItem(Item):
+    """Class to persist the reporter of cross-validation."""
 
     def __init__(
         self,
-        cv_results_serialized: dict,
-        estimator_info: EstimatorInfo,
-        X_info: dict,
-        y_info: Union[dict, None],
-        plots_bytes: dict[str, bytes],
-        cv_info: dict,
-        created_at: Union[str, None] = None,
-        updated_at: Union[str, None] = None,
+        reporter_bytes: bytes,
+        created_at: Optional[str] = None,
+        updated_at: Optional[str] = None,
     ):
         """
-        Initialize a CrossValidationItem.
+        Initialize a CrossValidationReporterItem.
 
         Parameters
         ----------
-        cv_results_serialized : dict
-            The dict output of the :func:`sklearn.model_selection.cross_validate`
-            function, in a form suitable for serialization.
-        estimator_info : dict
-            The estimator that was cross-validated.
-        X_info : dict
-            A summary of the data, input of the
-            :func:`sklearn.model_selection.cross_validate` function.
-        y_info : dict
-            A summary of the target, input of the
-            :func:`sklearn.model_selection.cross_validate` function.
-        plots_bytes : dict[str, bytes]
-            A collection of plots of the cross-validation results, in the form of bytes.
-        cv_info: dict
-            A dict containing cross validation splitting strategy params.
-        created_at : str
+        reporter_bytes : bytes
+            The raw bytes of the reporter pickle representation.
+        created_at : str, optional
             The creation timestamp in ISO format.
-        updated_at : str
+        updated_at : str, optional
             The last update timestamp in ISO format.
         """
         super().__init__(created_at, updated_at)
 
-        self.cv_results_serialized = cv_results_serialized
-        self.estimator_info = estimator_info
-        self.X_info = X_info
-        self.y_info = y_info
-        self.plots_bytes = plots_bytes
-        self.cv_info = cv_info
+        self.reporter_bytes = reporter_bytes
 
-    def as_serializable_dict(self):
-        """Get a serializable dict from the item.
+    @classmethod
+    def factory(cls, reporter: CrossValidationReporter) -> CrossValidationReporterItem:
+        """
+        Create a CrossValidationReporterItem instance from a CrossValidationReporter.
 
-        Derived class must call their super implementation
-        and merge the result with their output.
+        Parameters
+        ----------
+        reporter : CrossValidationReporter
+
+        Returns
+        -------
+        CrossValidationReporterItem
+            A new CrossValidationReporterItem instance.
         """
+        if not isinstance(reporter, CrossValidationReporter):
+            raise ItemTypeError(f"Type '{reporter.__class__}' is not supported.")
+
+        instance = cls(pickle.dumps(reporter))
+
+        # add reporter as cached property
+        instance.reporter = reporter
+
+        return instance
+
+    @cached_property
+    def reporter(self) -> CrossValidationReporter:
+        """The CrossValidationReporter from the persistence."""
+        return pickle.loads(self.reporter_bytes)
+
+    def as_serializable_dict(self):
+        """Get a serializable dict from the item."""
         # Get tabular results (the cv results in a dataframe-like structure)
-        cv_results = copy.deepcopy(self.cv_results_serialized)
-        cv_results.pop("indices", None)
-
-        metrics_names = list(cv_results.keys())
-        tabular_results = {
-            "name": "Cross validation results",
-            "columns": metrics_names,
-            "data": list(zip(*cv_results.values())),
-            "favorability": [_metric_favorability(m) for m in metrics_names],
+        cv_results = {
+            key: value.tolist()
+            for key, value in self.reporter.cv_results.items()
+            if (
+                key != "estimator"
+                and key != "indices"
+                and isinstance(value, numpy.ndarray)
+            )
         }
 
+        metrics_names = list(cv_results)
+        tabular_results = [
+            {
+                "name": "Cross validation results",
+                "columns": metrics_names,
+                "data": list(zip(*cv_results.values())),
+                "favorability": [_metric_favorability(m) for m in metrics_names],
+            }
+        ]
+
         # Get scalar results (summary statistics of the cv results)
-        mean_cv_results = [
+        scalar_results = [
             {
                 "name": _metric_title(k),
                 "value": statistics.mean(v),
@@ -206,163 +248,22 @@ def as_serializable_dict(self):
             for k, v in cv_results.items()
         ]
 
-        scalar_results = mean_cv_results
-
-        params_as_str = _params_to_str(self.estimator_info)
-
         # If the estimator is from sklearn, make the class name a hyperlink
         # to the relevant docs
-        name = self.estimator_info["name"]
-        module = re.sub(r"\.\_.+", "", self.estimator_info["module"])
+        estimator_info = _estimator_info(self.reporter.estimator)
+        name = estimator_info["name"]
+        module = re.sub(r"\.\_.+", "", estimator_info["module"])
         if module.startswith("sklearn"):
             doc_url = f"https://scikit-learn.org/stable/modules/generated/{module}.{name}.html"
             doc_link = f'<a href="{doc_url}" target="_blank"><code>{name}</code></a>'
         else:
             doc_link = f"`{name}`"
 
+        params_as_str = _params_to_str(estimator_info)
         estimator_params_as_str = f"{doc_link}\n{params_as_str}"
 
-        # Get cross-validation details
-        cv_params_as_str = ", ".join(f"{k}: *{v}*" for k, v in self.cv_info.items())
-
-        r = super().as_serializable_dict()
-        sections = [
-            {
-                "title": "Model",
-                "icon": "icon-square-cursor",
-                "items": [
-                    {
-                        "name": "Estimator parameters",
-                        "description": "Core model configuration used for training",
-                        "value": estimator_params_as_str,
-                    },
-                    {
-                        "name": "Cross-validation parameters",
-                        "description": "Controls how data is split and validated",
-                        "value": cv_params_as_str,
-                    },
-                ],
-            }
-        ]
-        value = {
-            "scalar_results": scalar_results,
-            "tabular_results": [tabular_results],
-            "plots": [
-                {
-                    "name": plot_name,
-                    "value": json.loads(plot_bytes.decode("utf-8")),
-                }
-                for plot_name, plot_bytes in self.plots_bytes.items()
-            ],
-            "sections": sections,
-        }
-        r.update(
-            {
-                "media_type": "application/vnd.skore.cross_validation+json",
-                "value": value,
-            }
-        )
-        return r
-
-    @staticmethod
-    def _estimator_info(estimator: sklearn.base.BaseEstimator) -> EstimatorInfo:
-        estimator_params = (
-            estimator.get_params() if hasattr(estimator, "get_params") else {}
-        )
-
-        name = estimator.__class__.__name__
-        module = estimator.__module__
-
-        # Figure out the default parameters of the estimator,
-        # so that we can highlight the non-default ones in the UI
-
-        # This is done by instantiating the class with no arguments and
-        # computing the diff between the default and ours
-        try:
-            estimator_module = importlib.import_module(module)
-            EstimatorClass = getattr(estimator_module, name)
-            default_estimator_params = EstimatorClass().get_params()
-        except Exception:
-            default_estimator_params = {}
-
-        final_estimator_params: dict[str, EstimatorParamInfo] = {}
-        for k, v in estimator_params.items():
-            param_is_default: bool = (
-                k in default_estimator_params and default_estimator_params[k] == v
-            )
-            final_estimator_params[str(k)] = {
-                "value": repr(v),
-                "default": param_is_default,
-            }
-
-        return {
-            "name": name,
-            "module": module,
-            "params": final_estimator_params,
-        }
-
-    @classmethod
-    def factory(cls, reporter: CrossValidationReporter) -> CrossValidationItem:
-        """
-        Create a new CrossValidationItem instance from a CrossValidationReporter.
-
-        Parameters
-        ----------
-        reporter : CrossValidationReporter
-
-        Returns
-        -------
-        CrossValidationItem
-            A new CrossValidationItem instance.
-        """
-        if not isinstance(reporter, CrossValidationReporter):
-            raise ItemTypeError(
-                f"Type '{reporter.__class__}' is not supported, "
-                f"only '{CrossValidationReporter.__name__}' is."
-            )
-
-        cv_results = reporter._cv_results
-        estimator = reporter.estimator
-        X = reporter.X
-        y = reporter.y
-        plots = reporter.plots
-        cv = reporter.cv
-
-        cv_results_serialized = {}
-        for k, v in cv_results.items():
-            if k == "estimator":
-                continue
-            if k == "indices":
-                cv_results_serialized["indices"] = {
-                    "train": tuple(arr.tolist() for arr in v["train"]),
-                    "test": tuple(arr.tolist() for arr in v["test"]),
-                }
-            if isinstance(v, numpy.ndarray):
-                cv_results_serialized[k] = v.tolist()
-
-        estimator_info = CrossValidationItem._estimator_info(estimator)
-
-        y_array = y if isinstance(y, numpy.ndarray) else numpy.array(y)
-        y_info = None if y is None else {"hash": _hash_numpy(y_array)}
-
-        X_array = X if isinstance(X, numpy.ndarray) else numpy.array(X)
-        X_info = {
-            "nb_rows": X_array.shape[0],
-            "nb_cols": X_array.shape[1],
-            "hash": _hash_numpy(X_array),
-        }
-
-        humanized_plot_names = {
-            "scores": "Scores",
-            "timing": "Timings",
-        }
-        plots_bytes = {
-            humanized_plot_names[plot_name]: (
-                plotly.io.to_json(plot, engine="json").encode("utf-8")
-            )
-            for plot_name, plot in dataclasses.asdict(plots).items()
-        }
-
+        #
+        cv = self.reporter.cv
         cv_info: dict[str, str] = {}
         if isinstance(cv, int):
             cv_info["n_splits"] = repr(cv)
@@ -376,19 +277,40 @@ def factory(cls, reporter: CrossValidationReporter) -> CrossValidationItem:
                     attr = getattr(cv, attr_name)
                     cv_info[attr_name] = repr(attr)
 
-        return cls(
-            cv_results_serialized=cv_results_serialized,
-            estimator_info=estimator_info,
-            X_info=X_info,
-            y_info=y_info,
-            plots_bytes=plots_bytes,
-            cv_info=cv_info,
-        )
+        cv_params_as_str = ", ".join(f"{k}: *{v}*" for k, v in cv_info.items())
 
-    @cached_property
-    def plots(self) -> dict:
-        """Various plots of the cross-validation results."""
-        return {
-            name: plotly.io.from_json(plot_bytes.decode("utf-8"))
-            for name, plot_bytes in self.plots_bytes.items()
+        #
+        value = {
+            "scalar_results": scalar_results,
+            "tabular_results": tabular_results,
+            "plots": [
+                {
+                    "name": HUMANIZED_PLOT_NAMES[plot_name],
+                    "value": json.loads(plotly.io.to_json(plot, engine="json")),
+                }
+                for plot_name, plot in dataclasses.asdict(self.reporter.plots).items()
+            ],
+            "sections": [
+                {
+                    "title": "Model",
+                    "icon": "icon-square-cursor",
+                    "items": [
+                        {
+                            "name": "Estimator parameters",
+                            "description": "Core model configuration used for training",
+                            "value": estimator_params_as_str,
+                        },
+                        {
+                            "name": "Cross-validation parameters",
+                            "description": "Controls how data is split and validated",
+                            "value": cv_params_as_str,
+                        },
+                    ],
+                }
+            ],
+        }
+
+        return super().as_serializable_dict() | {
+            "media_type": "application/vnd.skore.cross_validation+json",
+            "value": value,
         }
diff --git a/skore/tests/integration/sklearn/test_cross_validate.py b/skore/tests/integration/sklearn/test_cross_validate.py
index e39f6ba58..00758f751 100644
--- a/skore/tests/integration/sklearn/test_cross_validate.py
+++ b/skore/tests/integration/sklearn/test_cross_validate.py
@@ -10,7 +10,7 @@
 from sklearn.multiclass import OneVsOneClassifier
 from sklearn.svm import SVC
 from skore import CrossValidationReporter
-from skore.persistence.item.cross_validation_item import CrossValidationItem
+from skore.persistence.item import CrossValidationReporterItem
 from skore.sklearn.cross_validation.cross_validation_helpers import _get_scorers_to_add
 
 
@@ -201,7 +201,7 @@ def test_cross_validation_reporter(in_memory_project, fixture_name, request):
     in_memory_project.put("cross-validation", reporter)
 
     retrieved_item = in_memory_project.item_repository.get_item("cross-validation")
-    assert isinstance(retrieved_item, CrossValidationItem)
+    assert isinstance(retrieved_item, CrossValidationReporterItem)
 
 
 @pytest.mark.parametrize(
diff --git a/skore/tests/integration/ui/test_ui.py b/skore/tests/integration/ui/test_ui.py
index 360761a6f..44b15d7a3 100644
--- a/skore/tests/integration/ui/test_ui.py
+++ b/skore/tests/integration/ui/test_ui.py
@@ -167,7 +167,7 @@ def _fake_cross_validate(*args, **kwargs):
     monkeypatch.setattr("sklearn.model_selection.cross_validate", _fake_cross_validate)
 
 
-def test_serialize_cross_validation_item(
+def test_serialize_cross_validation_reporter_item(
     client,
     in_memory_project,
     monkeypatch,
@@ -176,9 +176,6 @@ def test_serialize_cross_validation_item(
     fake_cross_validate,
 ):
     monkeypatch.setattr("skore.persistence.item.item.datetime", MockDatetime)
-    monkeypatch.setattr(
-        "skore.persistence.item.cross_validation_item.CrossValidationItem.plots", {}
-    )
     monkeypatch.setattr(
         "skore.sklearn.cross_validation.cross_validation_reporter.plot_cross_validation_compare_scores",
         lambda _: {},
diff --git a/skore/tests/unit/item/test_cross_validation_item.py b/skore/tests/unit/item/test_cross_validation_reporter_item.py
similarity index 80%
rename from skore/tests/unit/item/test_cross_validation_item.py
rename to skore/tests/unit/item/test_cross_validation_reporter_item.py
index 909eff872..973fc933d 100644
--- a/skore/tests/unit/item/test_cross_validation_item.py
+++ b/skore/tests/unit/item/test_cross_validation_reporter_item.py
@@ -1,13 +1,13 @@
 from dataclasses import dataclass
+from pickle import dumps
 
 import numpy
 import plotly.graph_objects
 import pytest
 from sklearn.model_selection import StratifiedKFold
 from skore.persistence.item import ItemTypeError
-from skore.persistence.item.cross_validation_item import (
-    CrossValidationItem,
-    _hash_numpy,
+from skore.persistence.item.cross_validation_reporter_item import (
+    CrossValidationReporterItem,
     _metric_favorability,
 )
 from skore.sklearn.cross_validation import CrossValidationReporter
@@ -27,7 +27,7 @@ class FakeEstimatorNoGetParams:
 
 @dataclass
 class FakeCrossValidationReporter(CrossValidationReporter):
-    _cv_results = {
+    cv_results = {
         "test_score": numpy.array([1, 2, 3]),
         "estimator": [FakeEstimator(), FakeEstimator(), FakeEstimator()],
         "fit_time": [1, 2, 3],
@@ -44,7 +44,7 @@ class FakeCrossValidationReporter(CrossValidationReporter):
 
 @dataclass
 class FakeCrossValidationReporterNoGetParams(CrossValidationReporter):
-    _cv_results = {
+    cv_results = {
         "test_score": numpy.array([1, 2, 3]),
         "estimator": [
             FakeEstimatorNoGetParams(),
@@ -63,14 +63,14 @@ class FakeCrossValidationReporterNoGetParams(CrossValidationReporter):
     cv = StratifiedKFold(n_splits=5)
 
 
-class TestCrossValidationItem:
+class TestCrossValidationReporterItem:
     @pytest.fixture(autouse=True)
     def monkeypatch_datetime(self, monkeypatch, MockDatetime):
         monkeypatch.setattr("skore.persistence.item.item.datetime", MockDatetime)
 
     def test_factory_exception(self):
         with pytest.raises(ItemTypeError):
-            CrossValidationItem.factory(None)
+            CrossValidationReporterItem.factory(None)
 
     @pytest.mark.parametrize(
         "reporter",
@@ -82,42 +82,32 @@ def test_factory_exception(self):
         ],
     )
     def test_factory(self, mock_nowstr, reporter):
-        item = CrossValidationItem.factory(reporter)
-
-        assert item.cv_results_serialized == {"test_score": [1, 2, 3]}
-        assert item.estimator_info == {
-            "name": reporter.estimator.__class__.__name__,
-            "params": (
-                {}
-                if isinstance(reporter.estimator, FakeEstimatorNoGetParams)
-                else {"alpha": {"value": "3", "default": True}}
-            ),
-            "module": "tests.unit.item.test_cross_validation_item",
-        }
-        assert item.X_info == {
-            "nb_cols": 1,
-            "nb_rows": 1,
-            "hash": _hash_numpy(FakeCrossValidationReporter.X),
-        }
-        assert item.y_info == {"hash": _hash_numpy(FakeCrossValidationReporter.y)}
-        assert item.cv_info == {
-            "n_splits": "5",
-            "random_state": "None",
-            "shuffle": "False",
-        }
-        assert isinstance(item.plots_bytes, dict)
-        assert isinstance(item.plots, dict)
+        item = CrossValidationReporterItem.factory(reporter)
+
+        assert item.reporter_bytes == dumps(reporter)
         assert item.created_at == mock_nowstr
         assert item.updated_at == mock_nowstr
 
+    def test_reporter(self, mock_nowstr):
+        reporter = FakeCrossValidationReporter()
+        item1 = CrossValidationReporterItem.factory(reporter)
+        item2 = CrossValidationReporterItem(
+            reporter_bytes=dumps(reporter),
+            created_at=mock_nowstr,
+            updated_at=mock_nowstr,
+        )
+
+        assert item1.reporter == reporter
+        assert item2.reporter == reporter
+
     def test_get_serializable_dict(self, monkeypatch, mock_nowstr):
         monkeypatch.setattr(
-            "skore.persistence.item.cross_validation_item.CrossValidationReporter",
+            "skore.persistence.item.cross_validation_reporter_item.CrossValidationReporter",
             FakeCrossValidationReporter,
         )
 
         reporter = FakeCrossValidationReporter()
-        item = CrossValidationItem.factory(reporter)
+        item = CrossValidationReporterItem.factory(reporter)
         serializable = item.as_serializable_dict()
 
         assert serializable["updated_at"] == mock_nowstr