From 8982b3f4b93637e329f5c3b4fda7be8a5613b76f Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 17 Oct 2024 19:38:27 +0200 Subject: [PATCH] ENH Rework bagging notebook (#778) --- python_scripts/ensemble_bagging.py | 72 ++++++++++++++---------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/python_scripts/ensemble_bagging.py b/python_scripts/ensemble_bagging.py index 111ef9684..f3750e642 100644 --- a/python_scripts/ensemble_bagging.py +++ b/python_scripts/ensemble_bagging.py @@ -8,31 +8,28 @@ # %% [markdown] # # Bagging # -# This notebook introduces a very natural strategy to build ensembles of machine -# learning models named "bagging". +# In this notebook we introduce a very natural strategy to build ensembles of +# machine learning models, named "bagging". # # "Bagging" stands for Bootstrap AGGregatING. It uses bootstrap resampling # (random sampling with replacement) to learn several models on random # variations of the training set. At predict time, the predictions of each # learner are aggregated to give the final predictions. # -# First, we will generate a simple synthetic dataset to get insights regarding -# bootstraping. +# We first create a simple synthetic dataset to better understand bootstrapping. # %% import pandas as pd import numpy as np -# create a random number generator that will be used to set the randomness -rng = np.random.RandomState(1) - def generate_data(n_samples=30): """Generate synthetic dataset. Returns `data_train`, `data_test`, `target_train`.""" x_min, x_max = -3, 3 + rng = np.random.default_rng(1) # Create a random number generator x = rng.uniform(x_min, x_max, size=n_samples) - noise = 4.0 * rng.randn(n_samples) + noise = 4.0 * rng.normal(size=(n_samples,)) y = x**3 - 0.5 * (x + 1) ** 2 + noise y /= y.std() @@ -57,9 +54,8 @@ def generate_data(n_samples=30): # %% [markdown] # -# The relationship between our feature and the target to predict is non-linear. -# However, a decision tree is capable of approximating such a non-linear -# dependency: +# The target to predict is a non-linear function of the only feature. However, a +# decision tree is capable of approximating such a non-linear dependency: # %% from sklearn.tree import DecisionTreeRegressor @@ -86,23 +82,24 @@ def generate_data(n_samples=30): # # ## Bootstrap resampling # -# Given a dataset with `n` data points, bootstrapping corresponds to resampling -# with replacement `n` out of such `n` data points uniformly at random. +# Bootstrapping involves uniformly resampling `n` data points from a dataset of +# `n` points, with replacement, ensuring each sample has an equal chance of +# selection. # # As a result, the output of the bootstrap sampling procedure is another dataset -# with also n data points, but likely with duplicates. As a consequence, there -# are also data points from the original dataset that are never selected to -# appear in a bootstrap sample (by chance). Those data points that are left away -# are often referred to as the out-of-bag sample. +# with `n` data points, likely containing duplicates. Consequently, some data +# points from the original dataset may not be selected for a bootstrap sample. +# These unselected data points are often referred to as the out-of-bag sample. # -# We will create a function that given `data` and `target` will return a +# We now create a function that, given `data` and `target`, returns a # resampled variation `data_bootstrap` and `target_bootstrap`. 
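The out-of-bag effect described above can be checked directly with a few lines of NumPy. The sketch below is only illustrative and is not part of the patch: it reuses the dataset size of 30 from the synthetic data generated earlier, the variable names are arbitrary, and it simply draws one bootstrap sample and counts how many of the original points were never selected.

```python
import numpy as np

rng = np.random.default_rng(0)
n_samples = 30
indices = np.arange(n_samples)

# Sampling with replacement: n_samples draws out of n_samples points
bootstrap_indices = rng.choice(indices, size=n_samples, replace=True)

# Points that are never drawn form the out-of-bag sample
out_of_bag = np.setdiff1d(indices, bootstrap_indices)
print(f"Distinct points in the bootstrap sample: {np.unique(bootstrap_indices).size}")
print(f"Out-of-bag points: {out_of_bag.size} ({out_of_bag.size / n_samples:.0%})")
```

Re-running this with different seeds shows that roughly a third of the points are left out of any given bootstrap sample, which is consistent with the ~63.2% in-sample figure discussed later in the notebook.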
# %% -def bootstrap_sample(data, target): +def bootstrap_sample(data, target, seed=0): # Indices corresponding to a sampling with replacement of the same sample # size than the original data + rng = np.random.default_rng(seed) bootstrap_indices = rng.choice( np.arange(target.shape[0]), size=target.shape[0], @@ -117,7 +114,7 @@ def bootstrap_sample(data, target): # %% [markdown] # -# We will generate 3 bootstrap samples and qualitatively check the difference +# We generate 3 bootstrap samples and qualitatively check the difference # with the original dataset. # %% @@ -127,6 +124,7 @@ def bootstrap_sample(data, target): data_bootstrap, target_bootstrap = bootstrap_sample( data_train, target_train, + seed=bootstrap_idx, # ensure bootstrap samples are different but reproducible ) plt.figure() plt.scatter( @@ -179,9 +177,9 @@ def bootstrap_sample(data, target): # %% [markdown] # # On average, roughly 63.2% of the original data points of the original dataset -# will be present in a given bootstrap sample. Since the bootstrap sample has -# the same size as the original dataset, there will be many samples that are in -# the bootstrap sample multiple times. +# are present in a given bootstrap sample. Since the bootstrap sample has the +# same size as the original dataset, there are many samples that are in the +# bootstrap sample multiple times. # # Using bootstrap we are able to generate many datasets, all slightly different. # We can fit a decision tree for each of these datasets and they all shall be @@ -193,7 +191,7 @@ def bootstrap_sample(data, target): tree = DecisionTreeRegressor(max_depth=3, random_state=0) data_bootstrap_sample, target_bootstrap_sample = bootstrap_sample( - data_train, target_train + data_train, target_train, seed=bootstrap_idx ) tree.fit(data_bootstrap_sample, target_bootstrap_sample) bag_of_trees.append(tree) @@ -224,7 +222,7 @@ def bootstrap_sample(data, target): # %% [markdown] # ## Aggregating # -# Once our trees are fitted, we are able to get predictions for each of them. In +# Once our trees are fitted, we are able to get predictions from each of them. In # regression, the most straightforward way to combine those predictions is just # to average them: for a given test data point, we feed the input feature values # to each of the `n` trained models in the ensemble and as a result compute `n` @@ -262,7 +260,7 @@ def bootstrap_sample(data, target): # %% [markdown] # -# The unbroken red line shows the averaged predictions, which would be the final +# The continuous red line shows the averaged predictions, which would be the final # predictions given by our 'bag' of decision tree regressors. Note that the # predictions of the ensemble is more stable because of the averaging operation. # As a result, the bag of trees as a whole is less likely to overfit than the @@ -298,7 +296,7 @@ def bootstrap_sample(data, target): bagged_trees_predictions = bagged_trees.predict(data_test) plt.plot(data_test["Feature"], bagged_trees_predictions) -_ = plt.title("Predictions from a bagging classifier") +_ = plt.title("Predictions from a bagging regressor") # %% [markdown] # Because we use 100 trees in the ensemble, the average prediction is indeed @@ -338,15 +336,14 @@ def bootstrap_sample(data, target): # %% [markdown] # We used a low value of the opacity parameter `alpha` to better appreciate the -# overlap in the prediction functions of the individual trees. -# -# This visualization gives some insights on the uncertainty in the predictions -# in different areas of the feature space. 
+# overlap in the prediction functions of the individual trees. Such a
+# visualization also gives us an intuition about the variance of the
+# predictions across different zones of the feature space.
 #
 # ## Bagging complex pipelines
 #
-# While we used a decision tree as a base model, nothing prevents us of using
-# any other type of model.
+# Even though we used a decision tree as a base model here, nothing prevents
+# us from using any other type of model.
 #
 # As we know that the original data generating function is a noisy polynomial
 # transformation of the input variable, let us try to fit a bagged polynomial
@@ -361,15 +358,14 @@ def bootstrap_sample(data, target):
 
 polynomial_regressor = make_pipeline(
     MinMaxScaler(),
-    PolynomialFeatures(degree=4),
+    PolynomialFeatures(degree=4, include_bias=False),
     Ridge(alpha=1e-10),
 )
 
 # %% [markdown]
-# This pipeline first scales the data to the 0-1 range with `MinMaxScaler`. Then
-# it extracts degree-4 polynomial features. The resulting features will all stay
-# in the 0-1 range by construction: if `x` lies in the 0-1 range then `x ** n`
-# also lies in the 0-1 range for any value of `n`.
+# This pipeline first scales the data to the 0-1 range using `MinMaxScaler`. It
+# then generates degree-4 polynomial features. By design, these features remain
+# in the 0-1 range, as any power of `x` within this range also stays within 0-1.
 #
 # Then the pipeline feeds the resulting non-linear features to a regularized
 # linear regression model for the final prediction of the target variable.
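Such a pipeline can serve as the base model of a bagging ensemble in exactly the same way as the decision tree used earlier. The sketch below is a minimal, self-contained illustration rather than the notebook's own code: the small inline dataset only mimics the `generate_data` helper, the value of `n_estimators` is arbitrary, and it assumes a recent scikit-learn release where `BaggingRegressor` takes an `estimator` parameter (older versions name it `base_estimator`).

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

# Small synthetic dataset in the spirit of the notebook's generate_data helper
rng = np.random.default_rng(0)
x = rng.uniform(-3, 3, size=30)
y = x**3 - 0.5 * (x + 1) ** 2 + 4.0 * rng.normal(size=30)
data_train = pd.DataFrame({"Feature": x})

polynomial_regressor = make_pipeline(
    MinMaxScaler(),
    PolynomialFeatures(degree=4, include_bias=False),
    Ridge(alpha=1e-10),
)

# Wrap the pipeline in a bagging ensemble, exactly as done for the decision tree
bagged_polynomials = BaggingRegressor(
    estimator=polynomial_regressor,  # named base_estimator in older scikit-learn
    n_estimators=100,
    random_state=0,
)
bagged_polynomials.fit(data_train, y)
print(bagged_polynomials.predict(data_train.head()))
```

Because each pipeline in the bag is fitted on a different bootstrap sample, averaging their predictions smooths out the variance of the very weakly regularized `Ridge(alpha=1e-10)` model, just as it did for the individual decision trees.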
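As a side note, the "roughly 63.2%" figure quoted in the bootstrap section above can also be recovered analytically: each draw misses a given point with probability `1 - 1/n`, so the probability that the point is drawn at least once in `n` draws with replacement is `1 - (1 - 1/n)**n`, which tends to `1 - 1/e` (about 0.632) as `n` grows. A quick check, using the notebook's dataset size of 30:

```python
import numpy as np

n = 30  # size of the synthetic dataset used in the notebook
print(f"In-sample probability for n={n}: {1 - (1 - 1 / n) ** n:.3f}")
print(f"Large-n limit (1 - 1/e):         {1 - 1 / np.e:.3f}")
```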