From f7892b9f90daea5d6b82f621d5be4f74504af923 Mon Sep 17 00:00:00 2001 From: MarieS-WiMLDS <79304610+MarieS-WiMLDS@users.noreply.github.com> Date: Wed, 22 Jan 2025 15:52:05 +0100 Subject: [PATCH 1/3] docs: Remove tempfiles from docs (#1193) fixes https://github.com/probabl-ai/skore/issues/1178 --------- Co-authored-by: Sylvain Combettes <48064216+sylvaincom@users.noreply.github.com> --- examples/getting_started/plot_quick_start.py | 11 ++++++- .../plot_skore_getting_started.py | 33 +++---------------- .../getting_started/plot_tracking_items.py | 19 +++-------- .../plot_working_with_projects.py | 19 +++-------- .../model_evaluation/plot_cross_validate.py | 19 +++-------- .../model_evaluation/plot_train_test_split.py | 20 +++-------- skore/src/skore/project/project.py | 11 +++++++ skore/tests/unit/project/test_project.py | 14 ++++++++ sphinx/Makefile | 2 +- sphinx/conf.py | 1 + 10 files changed, 57 insertions(+), 92 deletions(-) diff --git a/examples/getting_started/plot_quick_start.py b/examples/getting_started/plot_quick_start.py index 5739bace0..ed8456469 100644 --- a/examples/getting_started/plot_quick_start.py +++ b/examples/getting_started/plot_quick_start.py @@ -12,7 +12,7 @@ # %% import skore -my_project = skore.open("quick_start", overwrite=True) +my_project = skore.open("my_project", create=True) # %% # This will create a skore project directory named ``quick_start.skore`` in your @@ -69,3 +69,12 @@ # .. admonition:: What's next? # # For a more in-depth guide, see our :ref:`example_skore_getting_started` page! + +# %% +# Cleanup the project +# ------------------- +# +# Let's clear the skore project (to avoid any conflict with other documentation examples). 
+ +# %% +my_project.clear() diff --git a/examples/getting_started/plot_skore_getting_started.py b/examples/getting_started/plot_skore_getting_started.py index c107fa6a7..16893d8c6 100644 --- a/examples/getting_started/plot_skore_getting_started.py +++ b/examples/getting_started/plot_skore_getting_started.py @@ -34,38 +34,13 @@ # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # %% -# .. note:: -# -# If we do not wish for our skore project to be stored in a *temporary* folder, we -# can simply create and load the project in the current directory with: -# -# .. code-block:: python -# -# import skore -# -# my_project = skore.open("my_project") -# -# This would create a skore project directory named ``my_project.skore`` in our +# Let's start by creating a skore project directory named ``my_project.skore`` in our # current directory. -# %% -# Here, we start by creating a temporary directory to store our project so that we can -# easily clean it after executing this example: - -# %% -import tempfile -from pathlib import Path - -temp_dir = tempfile.TemporaryDirectory(prefix="skore_example_") -temp_dir_path = Path(temp_dir.name) - -# %% -# Then, we create and load the skore project from this temporary directory: - # %% import skore -my_project = skore.open(temp_dir_path / "my_project") +my_project = skore.open("my_project", create=True) # %% # Now that the project exists, we can write some Python code (in the same @@ -347,7 +322,7 @@ # Cleanup the project # ------------------- # -# Removing the temporary directory: +# Let's clear the skore project (to avoid conflict with other documentation examples). 
# %% -temp_dir.cleanup() +my_project.clear() diff --git a/examples/getting_started/plot_tracking_items.py b/examples/getting_started/plot_tracking_items.py index ec24e45c1..3eb82b0d1 100644 --- a/examples/getting_started/plot_tracking_items.py +++ b/examples/getting_started/plot_tracking_items.py @@ -14,23 +14,12 @@ # ====================================== # %% -# We start by creating a temporary directory to store our project so that we can -# easily clean it after executing this example: - -# %% -import tempfile -from pathlib import Path - -temp_dir = tempfile.TemporaryDirectory(prefix="skore_example_") -temp_dir_path = Path(temp_dir.name) - -# %% -# We create and load the skore project from this temporary directory: +# We create and load the skore project in the current directory: # %% import skore -my_project = skore.open(temp_dir_path / "my_project") +my_project = skore.open("my_project", create=True) # %% # Tracking an integer @@ -130,7 +119,7 @@ # Cleanup the project # ------------------- # -# Removing the temporary directory: +# Let's clear the skore project (to avoid any conflict with other documentation examples). 
# %% -temp_dir.cleanup() +my_project.clear() diff --git a/examples/getting_started/plot_working_with_projects.py b/examples/getting_started/plot_working_with_projects.py index 8863d5429..5492d127b 100644 --- a/examples/getting_started/plot_working_with_projects.py +++ b/examples/getting_started/plot_working_with_projects.py @@ -14,23 +14,12 @@ # ====================================== # %% -# We start by creating a temporary directory to store our project so that we can -# easily clean it after executing this example: - -# %% -import tempfile -from pathlib import Path - -temp_dir = tempfile.TemporaryDirectory(prefix="skore_example_") -temp_dir_path = Path(temp_dir.name) - -# %% -# We create and load the skore project from this temporary directory: +# We create and load the skore project from the current directory: # %% import skore -my_project = skore.open(temp_dir_path / "my_project") +my_project = skore.open("my_project", create=True) # %% # Storing integers @@ -369,7 +358,7 @@ def my_func(x): # Cleanup the project # ------------------- # -# Removing the temporary directory: +# Let's clean the skore project to avoid conflict with other examples. 
# %% -temp_dir.cleanup() +my_project.clear() diff --git a/examples/model_evaluation/plot_cross_validate.py b/examples/model_evaluation/plot_cross_validate.py index f198e72e9..27689b283 100644 --- a/examples/model_evaluation/plot_cross_validate.py +++ b/examples/model_evaluation/plot_cross_validate.py @@ -19,23 +19,12 @@ # ====================================== # %% -# We start by creating a temporary directory to store our project so that we can -# easily clean it after executing this example: - -# %% -import tempfile -from pathlib import Path - -temp_dir = tempfile.TemporaryDirectory(prefix="skore_example_") -temp_dir_path = Path(temp_dir.name) - -# %% -# We create and load the skore project from this temporary directory: +# We create and load the skore project from the current directory: # %% import skore -my_project = skore.open(temp_dir_path / "my_project") +my_project = skore.open("my_project", create=True) # %% # Cross-validation in scikit-learn @@ -170,7 +159,7 @@ # Cleanup the project # ------------------- # -# Removing the temporary directory: +# Let's clear the skore project (to avoid any conflict with other documentation examples). 
# %% -temp_dir.cleanup() +my_project.clear() diff --git a/examples/model_evaluation/plot_train_test_split.py b/examples/model_evaluation/plot_train_test_split.py index 7a973e036..91fc0235d 100644 --- a/examples/model_evaluation/plot_train_test_split.py +++ b/examples/model_evaluation/plot_train_test_split.py @@ -12,25 +12,13 @@ # %% # Creating and loading the skore project # ====================================== - -# %% -# We start by creating a temporary directory to store our project so that we can -# easily clean it after executing this example: - -# %% -import tempfile -from pathlib import Path - -temp_dir = tempfile.TemporaryDirectory(prefix="skore_example_") -temp_dir_path = Path(temp_dir.name) - # %% -# We create and load the skore project from this temporary directory: +# We create and load the skore project from the current directory: # %% import skore -my_project = skore.open(temp_dir_path / "my_project") +my_project = skore.open("my_project", create=True) # %% # Train-test split in scikit-learn @@ -258,7 +246,7 @@ # Cleanup the project # ------------------- # -# Removing the temporary directory: +# Let's clear the skore project (to avoid any conflict with other documentation examples). 
# %% -temp_dir.cleanup() +my_project.clear() diff --git a/skore/src/skore/project/project.py b/skore/src/skore/project/project.py index c91dce954..fe0edcd4c 100644 --- a/skore/src/skore/project/project.py +++ b/skore/src/skore/project/project.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional, Union from skore.persistence.item import item_to_object, object_to_item +from skore.persistence.view.view import View if TYPE_CHECKING: from skore.persistence import ( @@ -272,3 +273,13 @@ def delete_note(self, key: str, *, version=-1): >>> project.delete_note("key", version=0) # doctest: +SKIP """ return self.item_repository.delete_item_note(key=key, version=version) + + def clear(self): + """Delete all the contents of the project.""" + # delete all the items + for item_key in self.keys(): + self.delete(item_key) + for view_key in self.view_repository.keys(): # noqa: SIM118 + self.view_repository.delete_view(view_key) + # recreate default view + self.view_repository.put_view("default", View(layout=[])) diff --git a/skore/tests/unit/project/test_project.py b/skore/tests/unit/project/test_project.py index eacd24d95..1c0b8bbb6 100644 --- a/skore/tests/unit/project/test_project.py +++ b/skore/tests/unit/project/test_project.py @@ -15,6 +15,7 @@ InvalidProjectNameError, ProjectCreationError, ) +from skore.persistence.view.view import View from skore.project.create import _create, _validate_project_name @@ -266,6 +267,19 @@ def test_put_wrong_key_and_value_raise(in_memory_project): in_memory_project.put(0, (lambda: "unsupported object")) +def test_clear(in_memory_project): + in_memory_project.put("key1", 1) + in_memory_project.put("key1", 2) + in_memory_project.put("a str", "some text here to have fun") + in_memory_project.view_repository.put_view( + "default_test_", View(layout=["key1", "key2"]) + ) + in_memory_project.clear() + assert len(in_memory_project.keys()) == 0 + assert len(in_memory_project.view_repository.keys()) == 1 + assert 
in_memory_project.view_repository.keys()[0] == "default" + + test_cases = [ ( "a" * 250, diff --git a/sphinx/Makefile b/sphinx/Makefile index af7ea0f5f..c4f49cdf1 100644 --- a/sphinx/Makefile +++ b/sphinx/Makefile @@ -21,4 +21,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/sphinx/conf.py b/sphinx/conf.py index 7cf6afebc..a160b3a72 100644 --- a/sphinx/conf.py +++ b/sphinx/conf.py @@ -93,6 +93,7 @@ def reset_mpl(gallery_conf, fname): "doc_module": "skore", "default_thumb_file": "./_static/images/Logo_Skore_Light@2x.svg", "reset_modules": (reset_mpl, "seaborn"), + "abort_on_example_error": True, } # intersphinx configuration From 501ad110acc360b0892cd6448b2481330cdc6cfb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 22 Jan 2025 16:14:52 +0100 Subject: [PATCH 2/3] docs: Add example to showcase EstimatorReport and CrossValidationReport (#1156) Build upon https://github.com/probabl-ai/skore/pull/1091 We should merge https://github.com/probabl-ai/skore/pull/1091 to make it easy to review. This PR revisits the documentation for the `EstimatorReport` and `CrossValidationReport`. The idea is to provide different examples for different purposes: - technical section: it is a section to explain more in details some internal for user that might be interested. I think that we should move those inside this specific section because we should have more a pure data science section to only show the added values of our tools on data science - use-case section: it is indeed the section to show case our tools on real data. Here we want to show the added value on the data science side. 
--------- Co-authored-by: Sylvain Combettes <48064216+sylvaincom@users.noreply.github.com> Co-authored-by: Auguste Baum <52001167+augustebaum@users.noreply.github.com> Co-authored-by: Matt J. Co-authored-by: Thomas S. Co-authored-by: MarieS-WiMLDS <79304610+MarieS-WiMLDS@users.noreply.github.com> --- .github/workflows/sphinx.yml | 2 +- examples/technical_details/README.txt | 5 + .../technical_details/plot_cache_mechanism.py | 247 +++++++++++++++ examples/use_cases/README.txt | 9 + examples/use_cases/plot_employee_salaries.py | 291 ++++++++++++++++++ skore/pyproject.toml | 1 + sphinx/conf.py | 2 + 7 files changed, 556 insertions(+), 1 deletion(-) create mode 100644 examples/technical_details/README.txt create mode 100644 examples/technical_details/plot_cache_mechanism.py create mode 100644 examples/use_cases/README.txt create mode 100644 examples/use_cases/plot_employee_salaries.py diff --git a/.github/workflows/sphinx.yml b/.github/workflows/sphinx.yml index 2cbf58e02..46f6373c8 100644 --- a/.github/workflows/sphinx.yml +++ b/.github/workflows/sphinx.yml @@ -112,7 +112,7 @@ jobs: SKRUB_DATA_URL: https://skore.probabl.ai/f355443be646d49eab1aa76e29dfd0a8/skrub-data.tar.gz HOME: ${{ github.workspace }} - uses: ./.github/actions/sphinx/build - timeout-minutes: 10 + timeout-minutes: 60 with: SPHINX_VERSION: ${{ needs.sphinx-version.outputs.SPHINX_VERSION }} SPHINX_RELEASE: ${{ needs.sphinx-version.outputs.SPHINX_RELEASE }} diff --git a/examples/technical_details/README.txt b/examples/technical_details/README.txt new file mode 100644 index 000000000..b4c3abe1e --- /dev/null +++ b/examples/technical_details/README.txt @@ -0,0 +1,5 @@ +Technical details +----------------- + +These examples show some technical details at the core of `skore` to better understand +some of the mechanics under the hood.
diff --git a/examples/technical_details/plot_cache_mechanism.py b/examples/technical_details/plot_cache_mechanism.py new file mode 100644 index 000000000..79ea944c9 --- /dev/null +++ b/examples/technical_details/plot_cache_mechanism.py @@ -0,0 +1,247 @@ +""" +=============== +Cache mechanism +=============== + +This example shows how :class:`~skore.EstimatorReport` and +:class:`~skore.CrossValidationReport` use caching to speed up computations. +""" + +# %% +# +# We set some environment variables to avoid some spurious warnings related to +# parallelism. +import os + +os.environ["POLARS_ALLOW_FORKING_THREAD"] = "1" + +# %% +# +# First, we load a dataset from `skrub`. Our goal is to predict if a company paid a +# physician. The ultimate goal is to detect potential conflict of interest when it comes +# to the actual problem that we want to solve. +from skrub.datasets import fetch_open_payments + +dataset = fetch_open_payments() +df = dataset.X +y = dataset.y + +# %% +from skrub import TableReport + +TableReport(df) + +# %% +# +# The dataset has over 70,000 records with only categorical features. Some categories +# are not well-defined. We use `skrub` to create a simple predictive model that handles +# this. +from skrub import tabular_learner + +model = tabular_learner("classifier") +model + +# %% +# +# This model handles all types of data: numbers, categories, dates, and missing values. +# Let's train it on part of our dataset. +from skore import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42) + +# %% +# +# Let's explore how :class:`~skore.EstimatorReport` uses caching to speed up +# predictions. 
We start by training the model: +from skore import EstimatorReport + +report = EstimatorReport( + model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test +) +report.help() + +# %% +# +# Let's compute the accuracy on our test set and measure how long it takes: +import time + +start = time.time() +result = report.metrics.accuracy() +end = time.time() +result + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# +# For comparison, here's how scikit-learn computes the same accuracy score: +from sklearn.metrics import accuracy_score + +start = time.time() +result = accuracy_score(report.y_test, report.estimator_.predict(report.X_test)) +end = time.time() +result + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# +# Both approaches take similar time. Now watch what happens when we compute accuracy +# again: +start = time.time() +result = report.metrics.accuracy() +end = time.time() +result + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# +# The second calculation is instant! This happens because the report saves previous +# calculations in its cache. Let's look inside the cache: +report._cache + +# %% +# +# The cache stores predictions by type and data source. This means metrics that use +# the same type of predictions will be faster. Let's try the precision metric: +start = time.time() +result = report.metrics.precision() +end = time.time() +result + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# We observe that it takes only a few milliseconds to compute the precision because we +# don't need to re-compute the predictions and only have to compute the precision +# metric itself. Since the predictions are the bottleneck in terms of time, we observe +# an interesting speedup. +# +# We can pre-compute all predictions at once using parallel processing: +report.cache_predictions(n_jobs=2) + +# %% +# +# Now all possible predictions are stored. 
Any metric calculation will be much faster, +# even on different data (like the training set): +start = time.time() +result = report.metrics.log_loss(data_source="train") +end = time.time() +result + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# +# The report can also work with external data. We use `data_source="X_y"` to indicate +# that we want to pass those external data. +start = time.time() +result = report.metrics.log_loss(data_source="X_y", X=X_test, y=y_test) +end = time.time() +result + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# +# The first calculation is slower than when using the internal train or test sets +# because it needs to compute a hash of the new data for later retrieval. Let's +# calculate it again: +start = time.time() +result = report.metrics.log_loss(data_source="X_y", X=X_test, y=y_test) +end = time.time() +result + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# +# Much faster! The remaining time is related to the hash computation. Let's compute the +# ROC AUC on the same data: +start = time.time() +result = report.metrics.roc_auc(data_source="X_y", X=X_test, y=y_test) +end = time.time() +result + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# We observe that the computation is already efficient because it boils down to two +# computations: the hash of the data and the ROC-AUC metric. We save a lot of time +# because we don't need to re-compute the predictions. +# +# The cache also speeds up plots. 
Let's create a ROC curve: +import matplotlib.pyplot as plt + +start = time.time() +display = report.metrics.plot.roc(pos_label="allowed") +end = time.time() +plt.tight_layout() + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# +# The second plot is instant because it uses cached data: +start = time.time() +display = report.metrics.plot.roc(pos_label="allowed") +end = time.time() +plt.tight_layout() + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# +# We only use the cache to retrieve the `display` object and not directly the matplotlib +# figure. It means that you can still customize the cached plot before displaying it: +display.plot(roc_curve_kwargs={"color": "tab:orange"}) +plt.tight_layout() + +# %% +# +# Be aware that you can clear the cache if you want to: +report.clear_cache() +report._cache + +# %% +# +# It means that nothing is stored anymore in the cache. +# +# :class:`~skore.CrossValidationReport` uses the same caching system for each fold +# in cross-validation by leveraging the previous :class:`~skore.EstimatorReport`: +from skore import CrossValidationReport + +report = CrossValidationReport(model, X=df, y=y, cv_splitter=5, n_jobs=2) +report.help() + +# %% +# +# We can pre-compute all predictions at once using parallel processing: +report.cache_predictions(n_jobs=2) + +# %% +# +# Now all possible predictions are stored. Any metric calculation will be much faster, +# even on different data as we show for the :class:`~skore.EstimatorReport`. +start = time.time() +result = report.metrics.report_metrics(aggregate=["mean", "std"]) +end = time.time() +result + +# %% +print(f"Time taken: {end - start:.2f} seconds") + +# %% +# +# So we observe the same type of behaviour as we previously exposed. 
diff --git a/examples/use_cases/README.txt b/examples/use_cases/README.txt new file mode 100644 index 000000000..81d42a1ff --- /dev/null +++ b/examples/use_cases/README.txt @@ -0,0 +1,9 @@ +End-to-end data science use cases +--------------------------------- + +These examples show `skore` in action on real use cases. We aimed at showing `skore`'s +ability to: + +- be compatible with `scikit-learn` +- reduce boilerplate code for some standard *de facto* data science analysis +- speed-up exploration by optimizing some internal computation diff --git a/examples/use_cases/plot_employee_salaries.py b/examples/use_cases/plot_employee_salaries.py new file mode 100644 index 000000000..64056171b --- /dev/null +++ b/examples/use_cases/plot_employee_salaries.py @@ -0,0 +1,291 @@ +""" +=============================== +Simplified experiment reporting +=============================== + +This example shows how to leverage `skore` for reporting model evaluation and +storing the results for further analysis. +""" + +# %% +# +# We set some environment variables to avoid some spurious warnings related to +# parallelism. +import os + +os.environ["POLARS_ALLOW_FORKING_THREAD"] = "1" +os.environ["TOKENIZERS_PARALLELISM"] = "true" + +# %% +# +# Let's open a `skore` project in which we will be able to store artifacts from our +# experiments. +import skore + +project = skore.open("my_project", create=True) + +# %% +# +# We use a non-trivial `skrub` dataset. +from skrub.datasets import fetch_employee_salaries + +datasets = fetch_employee_salaries() +df, y = datasets.X, datasets.y + +# %% +# +# Let's first have a condensed summary of the input data using +# :class:`~skrub.TableReport`. +from skrub import TableReport + +table_report = TableReport(df) +table_report + +# %% +# +# First, we can check that the type of data is heterogeneous: we mainly have categorical +# features and features related to dates.
+# +# We can observe that the year related to the first hired is also present in the date. +# Hence, we should beware of not creating twice the same feature during the feature +# engineering. +# +# By looking at the "Associations" tab, we observe that two features are exactly holding +# the same information: "department" and "department_name". So during our feature +# engineering, we could potentially drop one of them if the final predictive model +# is sensitive to the collinearity. +# +# We can store the report in the project so that we can easily retrieve it later +# without necessarily having to reload the dataset and recomputing the report. +project.put("Input data summary", table_report) + +# %% +# +# In terms of target and thus the task that we want to solve, we are interested in +# predicting the salary of an employee given the previous features. We therefore have +# a regression task at end. +y + +# %% +# +# In a first attempt, we will define a rather complex predictive model that will use +# a linear model as a base estimator. 
+import numpy as np +from sklearn.compose import make_column_transformer +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder, SplineTransformer +from sklearn.linear_model import RidgeCV +from skrub import DatetimeEncoder, ToDatetime, DropCols + + +def periodic_spline_transformer(period, n_splines=None, degree=3): + if n_splines is None: + n_splines = period + n_knots = n_splines + 1 # periodic and include_bias is True + return SplineTransformer( + degree=degree, + n_knots=n_knots, + knots=np.linspace(0, period, n_knots).reshape(n_knots, 1), + extrapolation="periodic", + include_bias=True, + ) + + +categorical_features = [ + "gender", + "department_name", + "division", + "assignment_category", + "employee_position_title", + "year_first_hired", +] +datetime_features = "date_first_hired" + +date_encoder = make_pipeline( + ToDatetime(), + DatetimeEncoder(resolution="day", add_weekday=True, add_total_seconds=False), + DropCols("date_first_hired_year"), +) + +date_engineering = make_column_transformer( + (periodic_spline_transformer(12, n_splines=6), ["date_first_hired_month"]), + (periodic_spline_transformer(31, n_splines=15), ["date_first_hired_day"]), + (periodic_spline_transformer(7, n_splines=3), ["date_first_hired_weekday"]), +) + +feature_engineering_date = make_pipeline(date_encoder, date_engineering) + +preprocessing = make_column_transformer( + (feature_engineering_date, datetime_features), + (OneHotEncoder(drop="if_binary", handle_unknown="ignore"), categorical_features), +) + +model = make_pipeline(preprocessing, RidgeCV(alphas=np.logspace(-3, 3, 100))) +model + +# %% +# +# In the diagram above, we can see what we intend to do as feature engineering. For +# categorical features, we use a `OneHotEncoder` to transform the categorical features. +# From the previous data exploration, we could have check the unique values from the +# "Stats" tab and observe that we have large cardinality features. 
In such cases, +# one-hot encoding might not be the best choice but it is our starting point to get the +# ball rolling. +# +# Then, we have another transformation to encode the date features. We first split the +# date into multiple features (day, month, and year). Then, we apply a periodic spline +# transformation to each of the date features to capture the periodicity of the data. +# +# Finally, we fit a :class:`~sklearn.linear_model.RidgeCV` model. +# +# Now, we want to evaluate this complex model via cross-validation. We would like to +# use 5 folds. We use :class:`~skore.CrossValidationReport` to allow us to investigate +# the performance of the model. +from skore import CrossValidationReport + +report = CrossValidationReport(estimator=model, X=df, y=y, cv_splitter=5) +report.help() + +# %% +# +# We observe that the report detected that we have a regression task and provide us only +# a subset of the metrics and plots that make sense for our problem at hand. To later +# accelerate the computation, we cache once for all the predictions of the model. Note +# that we don't necessarily need to cache the predictions as the report will compute +# them on the fly if not cached and cache them for us. + +# %% +import warnings + +with warnings.catch_warnings(): + # catch the warnings raised by the OneHotEncoder for seeing unknown categories + # at transform time + warnings.simplefilter(action="ignore", category=UserWarning) + report.cache_predictions(n_jobs=3) + +# %% +# +# To not lose the report, let's store it in our `skore` project. +project.put("Linear model report", report) + +# %% +# +# We can now have a look at the performance of the model with some standard metrics. +report.metrics.report_metrics(aggregate=["mean", "std"]) + +# %% +# +# So now, that we have our first baseline model, we can try an out-of-the-box model +# using `skrub` that makes feature engineering for us. 
To deal with the high +# cardinality of the categorical features, we use a :class:`~skrub.TextEncoder` that +# use a language model and an embedding model to encode the categorical features. +# +# Finally, we use a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` as a +# base estimator that is a rather robust model. +from skrub import TableVectorizer, TextEncoder +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.pipeline import make_pipeline + +model = make_pipeline( + TableVectorizer(high_cardinality=TextEncoder()), + HistGradientBoostingRegressor(), +) +model + +# %% +# +# Let's compute the cross-validation report for this model. +from skore import CrossValidationReport + +report = CrossValidationReport(estimator=model, X=df, y=y, cv_splitter=5, n_jobs=3) +report.help() + +# %% +# +# We cache the predictions for later use. +report.cache_predictions(n_jobs=3) + +# %% +# +# We store the report in our `skore` project. +project.put("HGBDT model report", report) + +# %% +# +# We can now have a look at the performance of the model with some standard metrics. +report.metrics.report_metrics(aggregate=["mean", "std"]) + +# %% +# +# At this stage, I might not been careful and have already overwritten the report and +# model from my first attempt. Hopefully, because we stored the reports in our `skore` +# project, we can easily retrieve them. So let's retrieve the reports. +linear_model_report = project.get("Linear model report") +hgbdt_model_report = project.get("HGBDT model report") + +# %% +# +# Now that we retrieved the reports, I can make further comparison and build upon some +# usual pandas operations to concatenate the results. 
+import pandas as pd + +results = pd.concat( + [ + linear_model_report.metrics.report_metrics(aggregate=["mean", "std"]), + hgbdt_model_report.metrics.report_metrics(aggregate=["mean", "std"]), + ] +) +results + +# %% +# +# In addition, if I forget to compute a specific metric, I can easily add it to the +# report, without retraining the model and even recomputing the predictions since +# they are cached internally in the report. This saves some time. +from sklearn.metrics import mean_absolute_error + +scoring = ["r2", "rmse", mean_absolute_error] +scoring_kwargs = {"response_method": "predict"} +scoring_names = ["R2", "RMSE", "MAE"] +results = pd.concat( + [ + linear_model_report.metrics.report_metrics( + scoring=scoring, + scoring_kwargs=scoring_kwargs, + scoring_names=scoring_names, + aggregate=["mean", "std"], + ), + hgbdt_model_report.metrics.report_metrics( + scoring=scoring, + scoring_kwargs=scoring_kwargs, + scoring_names=scoring_names, + aggregate=["mean", "std"], + ), + ] +) +results + +# %% +# +# Finally, we can even get individual :class:`~skore.EstimatorReport` from the +# cross-validation to make further analysis. Here, we plot the actual vs predicted +# values for each of the splits. +from itertools import zip_longest +import matplotlib.pyplot as plt + +fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(12, 18)) +for split_idx, (ax, estimator_report) in enumerate( + zip_longest(axs.flatten(), linear_model_report.estimator_reports_) +): + if estimator_report is None: + ax.axis("off") + continue + estimator_report.metrics.plot.prediction_error(kind="actual_vs_predicted", ax=ax) + ax.set_title(f"Split #{split_idx + 1}") + ax.legend(loc="lower right") +plt.tight_layout() + +# %% +# +# Finally, we clear the project to avoid conflicts with other documentation examples.
+project.clear() diff --git a/skore/pyproject.toml b/skore/pyproject.toml index ec348beb8..425416249 100644 --- a/skore/pyproject.toml +++ b/skore/pyproject.toml @@ -90,6 +90,7 @@ sphinx = [ "polars", "kaleido", "pydata-sphinx-theme", + "sentence-transformers", "sphinx", "sphinx_autosummary_accessors", "sphinx-design", diff --git a/sphinx/conf.py b/sphinx/conf.py index a160b3a72..729dd3a76 100644 --- a/sphinx/conf.py +++ b/sphinx/conf.py @@ -64,7 +64,9 @@ # list of examples in explicit order subsections_order = [ "../examples/getting_started", + "../examples/use_cases", "../examples/model_evaluation", + "../examples/technical_details", ] From cde319392d4f515093efd46a6a02fe0c5afd42c5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 22 Jan 2025 16:30:36 +0100 Subject: [PATCH 3/3] chore(deps): Bump katex from 0.16.11 to 0.16.21 in /skore-ui in the npm_and_yarn group (#1196) Bumps the npm_and_yarn group with 1 update in the /skore-ui directory: [katex](https://github.com/KaTeX/KaTeX). Updates `katex` from 0.16.11 to 0.16.21
Release notes

Sourced from katex's releases.

v0.16.21

0.16.21 (2025-01-17)

Bug Fixes

v0.16.20

0.16.20 (2025-01-12)

Bug Fixes

v0.16.19

0.16.19 (2024-12-29)

Bug Fixes

v0.16.18

0.16.18 (2024-12-18)

Bug Fixes

  • Actually publish TypeScript type definitions (#4008) (629b873)

v0.16.17

0.16.17 (2024-12-17)

Bug Fixes

  • MathML combines multidigit numbers with sup/subscript, comma separators, and multicharacter text when outputting to DOM (#3999) (7d79e22), closes #3995

v0.16.16

0.16.16 (2024-12-17)

Features

... (truncated)

Changelog

Sourced from katex's changelog.

0.16.21 (2025-01-17)

Bug Fixes

  • escape \htmlData attribute name (57914ad)

0.16.20 (2025-01-12)

Bug Fixes

0.16.19 (2024-12-29)

Bug Fixes

0.16.18 (2024-12-18)

Bug Fixes

  • Actually publish TypeScript type definitions (#4008) (629b873)

0.16.17 (2024-12-17)

Bug Fixes

  • MathML combines multidigit numbers with sup/subscript, comma separators, and multicharacter text when outputting to DOM (#3999) (7d79e22), closes #3995

0.16.16 (2024-12-17)

Features

0.16.15 (2024-12-09)

Features

  • italic sans-serif in math mode via \mathsfit command (#3998) (2218901)

0.16.14 (2024-12-08)

... (truncated)

Commits
  • 923f2aa chore(release): 0.16.21 [ci skip]
  • 57914ad fix: escape \htmlData attribute name
  • ff28995 Merge commit from fork
  • 28a0bf5 chore(release): 0.16.20 [ci skip]
  • 6d30fe4 fix: \providecommand does not overwrite existing macro (#4000)
  • 8f47dba chore(deps): update actions/upload-artifact to v4 (#4012)
  • 88b5056 chore(release): 0.16.19 [ci skip]
  • 4228b4e fix(types): improve strict function type (#4009)
  • f934646 chore(release): 0.16.18 [ci skip]
  • 629b873 fix: Actually publish TypeScript type definitions (#4008)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=katex&package-manager=npm_and_yarn&previous-version=0.16.11&new-version=0.16.21)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore <dependency name> major version` will close this group update PR and stop Dependabot creating any more for the specific dependency's major version (unless you unignore this specific dependency's major version or upgrade to it yourself) - `@dependabot ignore <dependency name> minor version` will close this group update PR and stop Dependabot creating any more for the specific dependency's minor version (unless you unignore this specific dependency's minor version or upgrade to it yourself) - `@dependabot ignore <dependency name>` will close this group update PR and stop Dependabot creating any more for the specific dependency (unless you unignore this specific dependency or upgrade to it yourself) - `@dependabot unignore <dependency name>` will remove all of the ignore conditions of the specified dependency - `@dependabot unignore <dependency name> <ignore condition>` will remove the ignore condition of the specified dependency and ignore conditions You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/probabl-ai/skore/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Matt J. --- skore-ui/package-lock.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skore-ui/package-lock.json b/skore-ui/package-lock.json index b719e4d10..f0b16e232 100644 --- a/skore-ui/package-lock.json +++ b/skore-ui/package-lock.json @@ -5358,9 +5358,9 @@ } }, "node_modules/katex": { - "version": "0.16.11", - "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.11.tgz", - "integrity": "sha512-RQrI8rlHY92OLf3rho/Ts8i/XvjgguEjOkO1BEXcU3N8BqPpSzBNwV/G0Ukr+P/l3ivvJUE/Fa/CwbS6HesGNQ==", + "version": "0.16.21", + "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.21.tgz", + "integrity": "sha512-XvqR7FgOHtWupfMiigNzmh+MgUVmDGU2kXZm899ZkPfcuoPuFxyHmXsgATDpFZDAXCI8tvinaVcDo8PIIJSo4A==", "funding": [ "https://opencollective.com/katex", "https://github.com/sponsors/katex"