diff --git a/.github/workflows/sphinx.yml b/.github/workflows/sphinx.yml
index 2cbf58e02..46f6373c8 100644
--- a/.github/workflows/sphinx.yml
+++ b/.github/workflows/sphinx.yml
@@ -112,7 +112,7 @@ jobs:
          SKRUB_DATA_URL: https://skore.probabl.ai/f355443be646d49eab1aa76e29dfd0a8/skrub-data.tar.gz
          HOME: ${{ github.workspace }}
      - uses: ./.github/actions/sphinx/build
-        timeout-minutes: 10
+        timeout-minutes: 60
        with:
          SPHINX_VERSION: ${{ needs.sphinx-version.outputs.SPHINX_VERSION }}
          SPHINX_RELEASE: ${{ needs.sphinx-version.outputs.SPHINX_RELEASE }}
diff --git a/examples/getting_started/plot_quick_start.py b/examples/getting_started/plot_quick_start.py
index 5739bace0..ed8456469 100644
--- a/examples/getting_started/plot_quick_start.py
+++ b/examples/getting_started/plot_quick_start.py
@@ -12,7 +12,7 @@
 # %%
 import skore
 
-my_project = skore.open("quick_start", overwrite=True)
+my_project = skore.open("my_project", create=True)
 
 # %%
-# This will create a skore project directory named ``quick_start.skore`` in your
+# This will create a skore project directory named ``my_project.skore`` in your
@@ -69,3 +69,12 @@
 # .. admonition:: What's next?
 #
 #    For a more in-depth guide, see our :ref:`example_skore_getting_started` page!
+
+# %%
+# Cleanup the project
+# -------------------
+#
+# Let's clear the skore project (to avoid any conflict with other documentation examples).
+
+# %%
+my_project.clear()
diff --git a/examples/getting_started/plot_skore_getting_started.py b/examples/getting_started/plot_skore_getting_started.py
index c107fa6a7..16893d8c6 100644
--- a/examples/getting_started/plot_skore_getting_started.py
+++ b/examples/getting_started/plot_skore_getting_started.py
@@ -34,38 +34,13 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # %%
-# .. note::
-#
-#    If we do not wish for our skore project to be stored in a *temporary* folder, we
-#    can simply create and load the project in the current directory with:
-#
-#    .. code-block:: python
-#
-#        import skore
-#
-#        my_project = skore.open("my_project")
-#
-#    This would create a skore project directory named ``my_project.skore`` in our
+# Let's start by creating a skore project directory named ``my_project.skore`` in our
 # current directory.
 
-# %%
-# Here, we start by creating a temporary directory to store our project so that we can
-# easily clean it after executing this example:
-
-# %%
-import tempfile
-from pathlib import Path
-
-temp_dir = tempfile.TemporaryDirectory(prefix="skore_example_")
-temp_dir_path = Path(temp_dir.name)
-
-# %%
-# Then, we create and load the skore project from this temporary directory:
-
 # %%
 import skore
 
-my_project = skore.open(temp_dir_path / "my_project")
+my_project = skore.open("my_project", create=True)
 
 # %%
 # Now that the project exists, we can write some Python code (in the same
@@ -347,7 +322,7 @@
 # Cleanup the project
 # -------------------
 #
-# Removing the temporary directory:
+# Let's clear the skore project (to avoid any conflict with other documentation examples).
 
 # %%
-temp_dir.cleanup()
+my_project.clear()
diff --git a/examples/getting_started/plot_tracking_items.py b/examples/getting_started/plot_tracking_items.py
index ec24e45c1..3eb82b0d1 100644
--- a/examples/getting_started/plot_tracking_items.py
+++ b/examples/getting_started/plot_tracking_items.py
@@ -14,23 +14,12 @@
 # ======================================
 
 # %%
-# We start by creating a temporary directory to store our project so that we can
-# easily clean it after executing this example:
-
-# %%
-import tempfile
-from pathlib import Path
-
-temp_dir = tempfile.TemporaryDirectory(prefix="skore_example_")
-temp_dir_path = Path(temp_dir.name)
-
-# %%
-# We create and load the skore project from this temporary directory:
+# We create and load the skore project in the current directory:
 
 # %%
 import skore
 
-my_project = skore.open(temp_dir_path / "my_project")
+my_project = skore.open("my_project", create=True)
 
 # %%
 # Tracking an integer
@@ -130,7 +119,7 @@
 # Cleanup the project
 # -------------------
 #
-# Removing the temporary directory:
+# Let's clear the skore project (to avoid any conflict with other documentation examples).
 
 # %%
-temp_dir.cleanup()
+my_project.clear()
diff --git a/examples/getting_started/plot_working_with_projects.py b/examples/getting_started/plot_working_with_projects.py
index 8863d5429..5492d127b 100644
--- a/examples/getting_started/plot_working_with_projects.py
+++ b/examples/getting_started/plot_working_with_projects.py
@@ -14,23 +14,12 @@
 # ======================================
 
 # %%
-# We start by creating a temporary directory to store our project so that we can
-# easily clean it after executing this example:
-
-# %%
-import tempfile
-from pathlib import Path
-
-temp_dir = tempfile.TemporaryDirectory(prefix="skore_example_")
-temp_dir_path = Path(temp_dir.name)
-
-# %%
-# We create and load the skore project from this temporary directory:
+# We create and load the skore project in the current directory:
 
 # %%
 import skore
 
-my_project = skore.open(temp_dir_path / "my_project")
+my_project = skore.open("my_project", create=True)
 
 # %%
 # Storing integers
@@ -369,7 +358,7 @@ def my_func(x):
 # Cleanup the project
 # -------------------
 #
-# Removing the temporary directory:
+# Let's clear the skore project (to avoid any conflict with other documentation examples).
 
 # %%
-temp_dir.cleanup()
+my_project.clear()
diff --git a/examples/model_evaluation/plot_cross_validate.py b/examples/model_evaluation/plot_cross_validate.py
index f198e72e9..27689b283 100644
--- a/examples/model_evaluation/plot_cross_validate.py
+++ b/examples/model_evaluation/plot_cross_validate.py
@@ -19,23 +19,12 @@
 # ======================================
 
 # %%
-# We start by creating a temporary directory to store our project so that we can
-# easily clean it after executing this example:
-
-# %%
-import tempfile
-from pathlib import Path
-
-temp_dir = tempfile.TemporaryDirectory(prefix="skore_example_")
-temp_dir_path = Path(temp_dir.name)
-
-# %%
-# We create and load the skore project from this temporary directory:
+# We create and load the skore project in the current directory:
 
 # %%
 import skore
 
-my_project = skore.open(temp_dir_path / "my_project")
+my_project = skore.open("my_project", create=True)
 
 # %%
 # Cross-validation in scikit-learn
@@ -170,7 +159,7 @@
 # Cleanup the project
 # -------------------
 #
-# Removing the temporary directory:
+# Let's clear the skore project (to avoid any conflict with other documentation examples).
 
 # %%
-temp_dir.cleanup()
+my_project.clear()
diff --git a/examples/model_evaluation/plot_train_test_split.py b/examples/model_evaluation/plot_train_test_split.py
index 7a973e036..91fc0235d 100644
--- a/examples/model_evaluation/plot_train_test_split.py
+++ b/examples/model_evaluation/plot_train_test_split.py
@@ -12,25 +12,13 @@
 # %%
 # Creating and loading the skore project
 # ======================================
-
-# %%
-# We start by creating a temporary directory to store our project so that we can
-# easily clean it after executing this example:
-
-# %%
-import tempfile
-from pathlib import Path
-
-temp_dir = tempfile.TemporaryDirectory(prefix="skore_example_")
-temp_dir_path = Path(temp_dir.name)
-
 # %%
-# We create and load the skore project from this temporary directory:
+# We create and load the skore project in the current directory:
 
 # %%
 import skore
 
-my_project = skore.open(temp_dir_path / "my_project")
+my_project = skore.open("my_project", create=True)
 
 # %%
 # Train-test split in scikit-learn
@@ -258,7 +246,7 @@
 # Cleanup the project
 # -------------------
 #
-# Removing the temporary directory:
+# Let's clear the skore project (to avoid any conflict with other documentation examples).
 
 # %%
-temp_dir.cleanup()
+my_project.clear()
diff --git a/examples/technical_details/README.txt b/examples/technical_details/README.txt
new file mode 100644
index 000000000..b4c3abe1e
--- /dev/null
+++ b/examples/technical_details/README.txt
@@ -0,0 +1,5 @@
+Technical details
+-----------------
+
+These examples show some technical details at the core of `skore`, to better understand
+some of the mechanics under the hood.
diff --git a/examples/technical_details/plot_cache_mechanism.py b/examples/technical_details/plot_cache_mechanism.py
new file mode 100644
index 000000000..79ea944c9
--- /dev/null
+++ b/examples/technical_details/plot_cache_mechanism.py
@@ -0,0 +1,247 @@
+"""
+===============
+Cache mechanism
+===============
+
+This example shows how :class:`~skore.EstimatorReport` and
+:class:`~skore.CrossValidationReport` use caching to speed up computations.
+"""
+
+# %%
+#
+# We set some environment variables to avoid some spurious warnings related to
+# parallelism.
+import os
+
+os.environ["POLARS_ALLOW_FORKING_THREAD"] = "1"
+
+# %%
+#
+# First, we load a dataset from `skrub`. Our goal is to predict whether a company paid
+# a physician; ultimately, this kind of model could help detect potential conflicts of
+# interest.
+from skrub.datasets import fetch_open_payments
+
+dataset = fetch_open_payments()
+df = dataset.X
+y = dataset.y
+
+# %%
+from skrub import TableReport
+
+TableReport(df)
+
+# %%
+#
+# The dataset has over 70,000 records with only categorical features. Some categories
+# are not well-defined. We use `skrub` to create a simple predictive model that handles
+# this.
+from skrub import tabular_learner
+
+model = tabular_learner("classifier")
+model
+
+# %%
+#
+# This model handles all types of data: numbers, categories, dates, and missing values.
+# Let's train it on part of our dataset.
+from skore import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)
+
+# %%
+#
+# Let's explore how :class:`~skore.EstimatorReport` uses caching to speed up
+# predictions.
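+#
+# Conceptually, the report memoizes expensive predictions and reuses them across
+# metrics. A minimal sketch of the idea (a hypothetical helper, not skore's actual
+# internals):
+
+# %%
+prediction_cache = {}
+
+
+def cached_predict(estimator, X, cache_key):
+    # compute the predictions once per key; later calls reuse the stored result
+    if cache_key not in prediction_cache:
+        prediction_cache[cache_key] = estimator.predict(X)
+    return prediction_cache[cache_key]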
+
+# %%
+#
+# We start by training the model:
+from skore import EstimatorReport
+
+report = EstimatorReport(
+    model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
+)
+report.help()
+
+# %%
+#
+# Let's compute the accuracy on our test set and measure how long it takes:
+import time
+
+start = time.time()
+result = report.metrics.accuracy()
+end = time.time()
+result
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+#
+# For comparison, here's how scikit-learn computes the same accuracy score:
+from sklearn.metrics import accuracy_score
+
+start = time.time()
+result = accuracy_score(report.y_test, report.estimator_.predict(report.X_test))
+end = time.time()
+result
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+#
+# Both approaches take a similar amount of time. Now watch what happens when we compute
+# the accuracy again:
+start = time.time()
+result = report.metrics.accuracy()
+end = time.time()
+result
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+#
+# The second computation is instant! This happens because the report saves previous
+# results in its cache. Let's look inside the cache:
+report._cache
+
+# %%
+#
+# The cache stores predictions by type and data source. This means that metrics using
+# the same type of predictions will be faster. Let's try the precision metric:
+start = time.time()
+result = report.metrics.precision()
+end = time.time()
+result
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+# Computing the precision takes only a few milliseconds because we don't need to
+# re-compute the predictions: only the precision metric itself is computed. Since the
+# predictions are the bottleneck in terms of time, we observe a significant speedup.
+#
+# We can pre-compute all predictions at once using parallel processing:
+report.cache_predictions(n_jobs=2)
+
+# %%
+#
+# Now all possible predictions are stored. Any metric calculation will be much faster,
+# even on different data (like the training set):
+start = time.time()
+result = report.metrics.log_loss(data_source="train")
+end = time.time()
+result
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+#
+# The report can also work with external data: we use `data_source="X_y"` to indicate
+# that we want to pass our own data.
+start = time.time()
+result = report.metrics.log_loss(data_source="X_y", X=X_test, y=y_test)
+end = time.time()
+result
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+#
+# The first computation is slower than with the internal train or test sets because the
+# report needs to compute a hash of the new data for later retrieval. Let's compute it
+# again:
+start = time.time()
+result = report.metrics.log_loss(data_source="X_y", X=X_test, y=y_test)
+end = time.time()
+result
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+#
+# Much faster! The remaining time is spent computing the hash. Let's compute the
+# ROC AUC on the same data:
+start = time.time()
+result = report.metrics.roc_auc(data_source="X_y", X=X_test, y=y_test)
+end = time.time()
+result
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+# The computation is already efficient because it boils down to two operations: hashing
+# the data and computing the ROC AUC metric. We save a lot of time because we don't
+# need to re-compute the predictions.
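+
+# %%
+#
+# Under the hood, external data must be identified for the cache lookup. A minimal
+# sketch of the idea, building a cache key from a hash of the input (hypothetical,
+# not skore's actual implementation):
+import joblib
+
+# joblib can hash arbitrary Python objects, including dataframes and arrays
+external_data_key = ("log_loss", joblib.hash(X_test))
+external_data_key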
+
+# %%
+#
+# The cache also speeds up plots. Let's create a ROC curve:
+import matplotlib.pyplot as plt
+
+start = time.time()
+display = report.metrics.plot.roc(pos_label="allowed")
+end = time.time()
+plt.tight_layout()
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+#
+# The second plot is instant because it uses cached data:
+start = time.time()
+display = report.metrics.plot.roc(pos_label="allowed")
+end = time.time()
+plt.tight_layout()
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+#
+# The cache stores the `display` object itself rather than the rendered matplotlib
+# figure. This means that you can still customize the cached plot before displaying it:
+display.plot(roc_curve_kwargs={"color": "tab:orange"})
+plt.tight_layout()
+
+# %%
+#
+# You can clear the cache if you want to:
+report.clear_cache()
+report._cache
+
+# %%
+#
+# Nothing is stored in the cache anymore.
+#
+# :class:`~skore.CrossValidationReport` uses the same caching system for each fold
+# of the cross-validation, by leveraging the :class:`~skore.EstimatorReport` presented
+# above:
+from skore import CrossValidationReport
+
+report = CrossValidationReport(model, X=df, y=y, cv_splitter=5, n_jobs=2)
+report.help()
+
+# %%
+#
+# We can pre-compute all predictions at once using parallel processing:
+report.cache_predictions(n_jobs=2)
+
+# %%
+#
+# Now all possible predictions are stored. Any metric calculation will be much faster,
+# even on different data, as we showed for the :class:`~skore.EstimatorReport`.
+start = time.time()
+result = report.metrics.report_metrics(aggregate=["mean", "std"])
+end = time.time()
+result
+
+# %%
+print(f"Time taken: {end - start:.2f} seconds")
+
+# %%
+#
+# We observe the same caching behaviour as described above for the
+# :class:`~skore.EstimatorReport`.
diff --git a/examples/use_cases/README.txt b/examples/use_cases/README.txt
new file mode 100644
index 000000000..81d42a1ff
--- /dev/null
+++ b/examples/use_cases/README.txt
@@ -0,0 +1,9 @@
+End-to-end data science use cases
+---------------------------------
+
+These examples show `skore` in action on real use cases. They aim at showing `skore`'s
+ability to:
+
+- be compatible with `scikit-learn`
+- reduce boilerplate code for some *de facto* standard data science analyses
+- speed up exploration by optimizing some internal computations
diff --git a/examples/use_cases/plot_employee_salaries.py b/examples/use_cases/plot_employee_salaries.py
new file mode 100644
index 000000000..64056171b
--- /dev/null
+++ b/examples/use_cases/plot_employee_salaries.py
@@ -0,0 +1,291 @@
+"""
+===============================
+Simplified experiment reporting
+===============================
+
+This example shows how to leverage `skore` for reporting model evaluation and
+storing the results for further analysis.
+"""
+
+# %%
+#
+# We set some environment variables to avoid some spurious warnings related to
+# parallelism.
+import os
+
+os.environ["POLARS_ALLOW_FORKING_THREAD"] = "1"
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+# %%
+#
+# Let's open a `skore` project in which we will be able to store artifacts from our
+# experiments.
+import skore
+
+project = skore.open("my_project", create=True)
+
+# %%
+#
+# We use a non-trivial dataset from `skrub`.
+from skrub.datasets import fetch_employee_salaries
+
+datasets = fetch_employee_salaries()
+df, y = datasets.X, datasets.y
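+
+# %%
+#
+# As a quick, illustrative sanity check before any reporting, let's look at the shape
+# of the raw table:
+df.shape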
+
+# %%
+#
+# Let's first have a condensed summary of the input data using
+# :class:`~skrub.TableReport`.
+from skrub import TableReport
+
+table_report = TableReport(df)
+table_report
+
+# %%
+#
+# First, we can check that the data types are heterogeneous: we mainly have categorical
+# features and features related to dates.
+#
+# We can observe that the year of the first hire is also present in the date of the
+# first hire. Hence, we should be careful not to create the same feature twice during
+# feature engineering.
+#
+# By looking at the "Associations" tab, we observe that two features hold exactly the
+# same information: "department" and "department_name". So during our feature
+# engineering, we could potentially drop one of them if the final predictive model
+# is sensitive to collinearity.
+#
+# We can store the report in the project so that we can easily retrieve it later
+# without having to reload the dataset and recompute the report.
+project.put("Input data summary", table_report)
+
+# %%
+#
+# Regarding the target, and thus the task to solve: we are interested in predicting the
+# salary of an employee given the previous features. We therefore have a regression
+# task at hand.
+y
+
+# %%
+#
+# As a first attempt, we define a rather complex predictive model that uses a linear
+# model as a base estimator.
+import numpy as np
+from sklearn.compose import make_column_transformer
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OneHotEncoder, SplineTransformer
+from sklearn.linear_model import RidgeCV
+from skrub import DatetimeEncoder, ToDatetime, DropCols
+
+
+def periodic_spline_transformer(period, n_splines=None, degree=3):
+    if n_splines is None:
+        n_splines = period
+    n_knots = n_splines + 1  # periodic and include_bias is True
+    return SplineTransformer(
+        degree=degree,
+        n_knots=n_knots,
+        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
+        extrapolation="periodic",
+        include_bias=True,
+    )
+
+
+categorical_features = [
+    "gender",
+    "department_name",
+    "division",
+    "assignment_category",
+    "employee_position_title",
+    "year_first_hired",
+]
+datetime_features = "date_first_hired"
+
+date_encoder = make_pipeline(
+    ToDatetime(),
+    DatetimeEncoder(resolution="day", add_weekday=True, add_total_seconds=False),
+    DropCols("date_first_hired_year"),
+)
+
+date_engineering = make_column_transformer(
+    (periodic_spline_transformer(12, n_splines=6), ["date_first_hired_month"]),
+    (periodic_spline_transformer(31, n_splines=15), ["date_first_hired_day"]),
+    (periodic_spline_transformer(7, n_splines=3), ["date_first_hired_weekday"]),
+)
+
+feature_engineering_date = make_pipeline(date_encoder, date_engineering)
+
+preprocessing = make_column_transformer(
+    (feature_engineering_date, datetime_features),
+    (OneHotEncoder(drop="if_binary", handle_unknown="ignore"), categorical_features),
+)
+
+model = make_pipeline(preprocessing, RidgeCV(alphas=np.logspace(-3, 3, 100)))
+model
+
+# %%
+#
+# In the diagram above, we can see what we intend to do as feature engineering. For the
+# categorical features, we use a `OneHotEncoder`. From the previous data exploration,
+# we could have checked the unique values in the "Stats" tab and observed that we have
+# some high-cardinality features. In such cases, one-hot encoding might not be the best
+# choice, but it is our starting point to get the ball rolling.
+#
+# Then, we have another transformation to encode the date features. We first split the
+# date into multiple features (day, month, and year). Then, we apply a periodic spline
+# transformation to each of the date features to capture the periodicity of the data.
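+
+# %%
+#
+# As a quick illustration (a side check, not part of the pipeline), the periodic
+# spline transformer maps a cyclic feature such as the month onto smooth periodic
+# basis functions: here, the 12 month values are encoded into 6 spline features.
+periodic_spline_transformer(12, n_splines=6).fit_transform(
+    np.arange(1, 13).reshape(-1, 1)
+).shape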
+
+# %%
+#
+# Finally, we fit a :class:`~sklearn.linear_model.RidgeCV` model on top of this
+# feature engineering.
+#
+# Now, we want to evaluate this complex model via cross-validation, with 5 folds. We
+# use :class:`~skore.CrossValidationReport` to investigate the performance of the
+# model.
+from skore import CrossValidationReport
+
+report = CrossValidationReport(estimator=model, X=df, y=y, cv_splitter=5)
+report.help()
+
+# %%
+#
+# We observe that the report detected that we have a regression task and provides us
+# with only the subset of metrics and plots that make sense for our problem at hand.
+# To speed up later computations, we cache the model's predictions once and for all.
+# Note that we don't strictly need to do this: if the predictions are not cached, the
+# report computes them on the fly and caches them for us.
+
+# %%
+import warnings
+
+with warnings.catch_warnings():
+    # silence the warnings raised by the OneHotEncoder when it encounters unknown
+    # categories at transform time
+    warnings.simplefilter(action="ignore", category=UserWarning)
+    report.cache_predictions(n_jobs=3)
+
+# %%
+#
+# So as not to lose the report, let's store it in our `skore` project.
+project.put("Linear model report", report)
+
+# %%
+#
+# We can now have a look at the performance of the model with some standard metrics.
+report.metrics.report_metrics(aggregate=["mean", "std"])
+
+# %%
+#
+# Now that we have a first baseline model, we can try an out-of-the-box model from
+# `skrub` that does the feature engineering for us. To deal with the high cardinality
+# of the categorical features, we use a :class:`~skrub.TextEncoder`, which uses a
+# pre-trained language model to embed the categorical features.
+#
+# Finally, we use a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` as the
+# base estimator, which is a rather robust model.
+from skrub import TableVectorizer, TextEncoder
+from sklearn.ensemble import HistGradientBoostingRegressor
+from sklearn.pipeline import make_pipeline
+
+model = make_pipeline(
+    TableVectorizer(high_cardinality=TextEncoder()),
+    HistGradientBoostingRegressor(),
+)
+model
+
+# %%
+#
+# Let's compute the cross-validation report for this model.
+from skore import CrossValidationReport
+
+report = CrossValidationReport(estimator=model, X=df, y=y, cv_splitter=5, n_jobs=3)
+report.help()
+
+# %%
+#
+# We cache the predictions for later use.
+report.cache_predictions(n_jobs=3)
+
+# %%
+#
+# We store the report in our `skore` project.
+project.put("HGBDT model report", report)
+
+# %%
+#
+# We can now have a look at the performance of the model with some standard metrics.
+report.metrics.report_metrics(aggregate=["mean", "std"])
+
+# %%
+#
+# At this stage, we might not have been careful, and we have already overwritten the
+# report and model from our first attempt. Fortunately, because we stored the reports
+# in our `skore` project, we can easily retrieve them. So let's do so.
+linear_model_report = project.get("Linear model report")
+hgbdt_model_report = project.get("HGBDT model report")
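+
+# %%
+#
+# As a sanity check, we can list the keys currently stored in the project; these are
+# the keys we used with `project.put` above:
+project.keys()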
+
+# %%
+#
+# Now that we have retrieved the reports, we can make further comparisons, building
+# upon the usual pandas operations to concatenate the results.
+import pandas as pd
+
+results = pd.concat(
+    [
+        linear_model_report.metrics.report_metrics(aggregate=["mean", "std"]),
+        hgbdt_model_report.metrics.report_metrics(aggregate=["mean", "std"]),
+    ]
+)
+results
+
+# %%
+#
+# In addition, if we forgot to compute a specific metric, we can easily add it to the
+# report, without retraining the model and even without recomputing the predictions,
+# since they are cached internally in the report. This allows us to save some time.
+from sklearn.metrics import mean_absolute_error
+
+scoring = ["r2", "rmse", mean_absolute_error]
+scoring_kwargs = {"response_method": "predict"}
+scoring_names = ["R2", "RMSE", "MAE"]
+results = pd.concat(
+    [
+        linear_model_report.metrics.report_metrics(
+            scoring=scoring,
+            scoring_kwargs=scoring_kwargs,
+            scoring_names=scoring_names,
+            aggregate=["mean", "std"],
+        ),
+        hgbdt_model_report.metrics.report_metrics(
+            scoring=scoring,
+            scoring_kwargs=scoring_kwargs,
+            scoring_names=scoring_names,
+            aggregate=["mean", "std"],
+        ),
+    ]
+)
+results
+
+# %%
+#
+# Finally, we can even get the individual :class:`~skore.EstimatorReport` for each
+# cross-validation split, to perform further analysis. Here, we plot the actual vs.
+# predicted values for each split.
+from itertools import zip_longest
+import matplotlib.pyplot as plt
+
+fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(12, 18))
+for split_idx, (ax, estimator_report) in enumerate(
+    zip_longest(axs.flatten(), linear_model_report.estimator_reports_)
+):
+    if estimator_report is None:
+        ax.axis("off")
+        continue
+    estimator_report.metrics.plot.prediction_error(kind="actual_vs_predicted", ax=ax)
+    ax.set_title(f"Split #{split_idx + 1}")
+    ax.legend(loc="lower right")
+plt.tight_layout()
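+
+# %%
+#
+# Each element of `estimator_reports_` is a full :class:`~skore.EstimatorReport`, so
+# we can also query a single split directly, for instance its R2 score (assuming the
+# per-split report exposes the same metric accessors as the aggregate report):
+linear_model_report.estimator_reports_[0].metrics.r2()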
+
+# %%
+#
+# Finally, we clean up the project by clearing it (to avoid any conflict with other
+# documentation examples).
+project.clear()
diff --git a/skore-ui/package-lock.json b/skore-ui/package-lock.json
index b719e4d10..f0b16e232 100644
--- a/skore-ui/package-lock.json
+++ b/skore-ui/package-lock.json
@@ -5358,9 +5358,9 @@
      }
    },
    "node_modules/katex": {
-      "version": "0.16.11",
-      "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.11.tgz",
-      "integrity": "sha512-RQrI8rlHY92OLf3rho/Ts8i/XvjgguEjOkO1BEXcU3N8BqPpSzBNwV/G0Ukr+P/l3ivvJUE/Fa/CwbS6HesGNQ==",
+      "version": "0.16.21",
+      "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.21.tgz",
+      "integrity": "sha512-XvqR7FgOHtWupfMiigNzmh+MgUVmDGU2kXZm899ZkPfcuoPuFxyHmXsgATDpFZDAXCI8tvinaVcDo8PIIJSo4A==",
      "funding": [
        "https://opencollective.com/katex",
        "https://github.com/sponsors/katex"
diff --git a/skore/pyproject.toml b/skore/pyproject.toml
index ec348beb8..425416249 100644
--- a/skore/pyproject.toml
+++ b/skore/pyproject.toml
@@ -90,6 +90,7 @@ sphinx = [
    "polars",
    "kaleido",
    "pydata-sphinx-theme",
+    "sentence-transformers",
    "sphinx",
    "sphinx_autosummary_accessors",
    "sphinx-design",
diff --git a/skore/src/skore/project/project.py b/skore/src/skore/project/project.py
index c91dce954..fe0edcd4c 100644
--- a/skore/src/skore/project/project.py
+++ b/skore/src/skore/project/project.py
@@ -7,6 +7,7 @@
 from typing import TYPE_CHECKING, Any, Literal, Optional, Union
 
 from skore.persistence.item import item_to_object, object_to_item
+from skore.persistence.view.view import View
 
 if TYPE_CHECKING:
     from skore.persistence import (
@@ -272,3 +273,14 @@ def delete_note(self, key: str, *, version=-1):
         >>> project.delete_note("key", version=0)  # doctest: +SKIP
         """
         return self.item_repository.delete_item_note(key=key, version=version)
+
+    def clear(self):
+        """Delete all the contents of the project."""
+        # delete all the items
+        for item_key in self.keys():
+            self.delete(item_key)
+        # delete all the views
+        for view_key in self.view_repository.keys():  # noqa: SIM118
+            self.view_repository.delete_view(view_key)
+        # recreate the default view
+        self.view_repository.put_view("default", View(layout=[]))
diff --git a/skore/tests/unit/project/test_project.py b/skore/tests/unit/project/test_project.py
index eacd24d95..1c0b8bbb6 100644
--- a/skore/tests/unit/project/test_project.py
+++ b/skore/tests/unit/project/test_project.py
@@ -15,6 +15,7 @@
     InvalidProjectNameError,
     ProjectCreationError,
 )
+from skore.persistence.view.view import View
 from skore.project.create import _create, _validate_project_name
 
 
@@ -266,6 +267,19 @@ def test_put_wrong_key_and_value_raise(in_memory_project):
         in_memory_project.put(0, (lambda: "unsupported object"))
 
 
+def test_clear(in_memory_project):
+    in_memory_project.put("key1", 1)
+    in_memory_project.put("key1", 2)
+    in_memory_project.put("a str", "some text here to have fun")
+    in_memory_project.view_repository.put_view(
+        "default_test_", View(layout=["key1", "key2"])
+    )
+    in_memory_project.clear()
+    assert len(in_memory_project.keys()) == 0
+    assert len(in_memory_project.view_repository.keys()) == 1
+    assert in_memory_project.view_repository.keys()[0] == "default"
+
+
 test_cases = [
     (
         "a" * 250,
diff --git a/sphinx/Makefile b/sphinx/Makefile
index af7ea0f5f..c4f49cdf1 100644
--- a/sphinx/Makefile
+++ b/sphinx/Makefile
@@ -21,4 +21,4 @@ help:
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
 %: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/sphinx/conf.py b/sphinx/conf.py
index 7cf6afebc..729dd3a76 100644
--- a/sphinx/conf.py
+++ b/sphinx/conf.py
@@ -64,7 +64,9 @@
 # list of examples in explicit order
 subsections_order = [
     "../examples/getting_started",
+    "../examples/use_cases",
     "../examples/model_evaluation",
+    "../examples/technical_details",
 ]
 
 
@@ -93,6 +95,7 @@ def reset_mpl(gallery_conf, fname):
     "doc_module": "skore",
     "default_thumb_file": "./_static/images/Logo_Skore_Light@2x.svg",
     "reset_modules": (reset_mpl, "seaborn"),
+    "abort_on_example_error": True,
 }
 
 # intersphinx configuration
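For reference, a minimal sketch of how the new `Project.clear()` is meant to be used
end to end (a hypothetical session that mirrors the unit test above):

    import skore

    project = skore.open("my_project", create=True)
    project.put("accuracy", 0.87)
    assert len(project.keys()) == 1

    project.clear()

    # all items are gone; only the empty "default" view remains
    assert len(project.keys()) == 0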