Integrate new version of the model that jointly models the competing risk incidences #53

Status: Merged (77 commits, merged Aug 20, 2024)
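For context, here is a minimal sketch of the workflow this PR converges on, reconstructed from the commit messages and the diff below. Treat it as illustrative: the exact defaults and output shapes are assumptions drawn from the changed files, not the verified public API.

# Sketch only: names and parameters are taken from the diff below, but the
# final public API is defined by the merged code, not by this snippet.
import numpy as np

from hazardous import SurvivalBoost
from hazardous.data import make_synthetic_competing_weibull

# Synthetic competing-risks data: y holds an "event" column (0 = censored,
# 1..K = competing event ids) and a "duration" column.
X, y = make_synthetic_competing_weibull(
    n_samples=3_000, n_events=3, return_X_y=True, random_state=0
)

model = SurvivalBoost(
    learning_rate=0.03,
    n_iter=100,
    ipcw_strategy="kaplan-meier",  # censoring weights via Kaplan-Meier
    show_progressbar=False,
    random_state=0,
).fit(X, y)

# Incidence curves for all competing events jointly, on a shared time grid;
# per commit e6f0d90 the per-event curves live on the second axis of a 3D
# array, i.e. cif[:, event_id] selects one event's curves.
time_grid = np.linspace(0, y["duration"].max(), num=100)
cif = model.predict_cumulative_incidence(X, time_grid)

Commits (77)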
99a4480  removing Incidence (juAlberge, Jun 19, 2024)
9b9735c  adding MultiIncidence to hazardosu (juAlberge, Jun 19, 2024)
a63ef45  reinserting example for multiincidence (juAlberge, Jun 19, 2024)
01b2629  Hardcode uniform time sampler to simplify estimator (ogrisel, Jul 1, 2024)
e0f9fd0  Remove any reference to the dataset-specific oracle scorer (ogrisel, Jul 1, 2024)
ab06498  Rename to SurvivalBoost (ogrisel, Jul 1, 2024)
0671d98  Update the .score method to work in the any event setting (ogrisel, Jul 1, 2024)
846c019  Rename test file (ogrisel, Jul 1, 2024)
d40906a  Fix test_gradient_boosting_incidence_parameter_tuning (ogrisel, Jul 1, 2024)
59e7edd  Simplify code (ogrisel, Jul 1, 2024)
c0e02eb  Raise error if hard-zero resampling censors too many events (ogrisel, Jul 1, 2024)
6deca3e  adding test (juAlberge, Jul 1, 2024)
306feb8  Refactor survival boost tests (ogrisel, Jul 1, 2024)
f3f1925  Fix missing assertion (ogrisel, Jul 1, 2024)
7e21186  fix TODO (Jul 1, 2024)
52134bd  fix linter (Jul 1, 2024)
f245f8a  Fix LaTeX (ogrisel, Jul 1, 2024)
fb78d83  Merge pull request #52 from soda-inria/enhance_survival_boost_test (ogrisel, Jul 1, 2024)
ca8e009  Merge remote-tracking branch 'origin/main' into multi-incidence (ogrisel, Jul 1, 2024)
3f47477  More informative error message for bad value for hard_zero_fraction (ogrisel, Jul 1, 2024)
527f88d  Remove untested code related to Cox PH based conditional censoring es… (ogrisel, Jul 2, 2024)
94e7397  Avoid using 'Est' as a short hand for 'Estimator' (ogrisel, Jul 2, 2024)
3c722ae  FIX accept both float and array-like for time_horizon in predict_proba (glemaitre, Jul 2, 2024)
a68a9e9  DOC update predict_proba docstring (glemaitre, Jul 2, 2024)
c14c21a  TST add a test that check the behaviour of predict_proba (glemaitre, Jul 2, 2024)
9f9f750  DOC add docstring to test (glemaitre, Jul 2, 2024)
7595c7f  TST check that probability sum to one for all events (glemaitre, Jul 2, 2024)
071fe63  DOC fix docstring type for time_horizon (glemaitre, Jul 2, 2024)
4376f36  Rename BaseIPCW to KaplanMeierIPCW. Adding some documentation. (juAlberge, Jul 2, 2024)
3b90460  API correct the API of predict_proba (glemaitre, Jul 2, 2024)
c9052c4  Use arturo formulation (glemaitre, Jul 2, 2024)
8ba6ef6  Silence 'DeprecationWarning: invalid escape sequence' by using raw do… (ogrisel, Jul 2, 2024)
b725b01  update docstring (glemaitre, Jul 2, 2024)
4cfb671  Simplify KaplanMeierIPCW and improve its docstring (ogrisel, Jul 2, 2024)
5880b63  adding some documentation to SurvivalBoost (juAlberge, Jul 2, 2024)
fcb5b34  More docstring fixes (ogrisel, Jul 2, 2024)
2323b3f  Apply suggestions from code review (glemaitre, Jul 3, 2024)
cdfca30  TST update test and code to follow scikit-learn convention (glemaitre, Jul 3, 2024)
7b68bba  TST check that we override properly (glemaitre, Jul 3, 2024)
841a9fa  Apply suggestions from code review (glemaitre, Jul 3, 2024)
f3580a6  DOC start documenting parameter (glemaitre, Jul 2, 2024)
2d2545b  DOC documents parameters in SurvivalBoost docstring (glemaitre, Jul 3, 2024)
0f1f3fd  DOC add ipcw estimator docstring (glemaitre, Jul 3, 2024)
642f414  DOC add attributes, reference, and examples (glemaitre, Jul 3, 2024)
c1078cc  DOC document some parameters and attributes of WeightedMultiClassTarg… (glemaitre, Jul 3, 2024)
6cff59d  Merge pull request #54 from glemaitre/bug_predict_proba (ogrisel, Jul 3, 2024)
0bf8542  API change the shape of the incidence curves (glemaitre, Jul 3, 2024)
d262cfe  DOC Use data generator in marginal cumulative incidence example (Jul 3, 2024)
8444471  FIX slice properly the prediction (glemaitre, Jul 3, 2024)
0ef3d78  fix install instructions with flit (judithabk6, Jul 2, 2024)
e786eec  MAINT Remove old python version in black config (#57) (jovan-stojanovic, Jul 3, 2024)
0802842  MAINT activate doctest check with pytest by default (glemaitre, Jul 3, 2024)
b837639  Fix a pandas warning in load_seer (ogrisel, Jul 3, 2024)
f8d3ad8  FIX do not pytest collect outside of the hazardous module folder (ogrisel, Jul 3, 2024)
23736de  FIX update the example to integrate on the right columns (glemaitre, Jul 3, 2024)
1af117b  Apply suggestions from code review (glemaitre, Jul 3, 2024)
40e1774  Merge pull request #56 from glemaitre/improve_doc (ogrisel, Jul 3, 2024)
6cb17cd  Set explicit value for n_events (Jul 3, 2024)
47ceb40  Merge remote-tracking branch 'origin/main' into modify_check_predict_… (glemaitre, Jul 3, 2024)
8a6bcaa  Merge remote-tracking branch 'origin/multi-incidence' into modify_che… (glemaitre, Jul 3, 2024)
f8defea  Improve variable names and comments in WeightedMultiClassTargetSampler (ogrisel, Jul 3, 2024)
dd7990b  Use n_horizons_per_observation=3 by default (ogrisel, Jul 3, 2024)
0a38c24  Update hazardous/_survival_boost.py (glemaitre, Jul 3, 2024)
e6f0d90  Remove reference to "column" to describe the second axis of a 3D array. (ogrisel, Jul 4, 2024)
259aeda  Merge pull request #61 from glemaitre/modify_check_predict_survival_b… (ogrisel, Jul 4, 2024)
c05f601  Improve docstrings for SurvivalBoost (ogrisel, Jul 4, 2024)
e5e7f33  Make sure to use zero-width param ranges in marginal example (ogrisel, Jul 4, 2024)
ee8af69  Merge branch 'multi-incidence' into use_data_generator (ogrisel, Jul 4, 2024)
d90cf4f  Silence AJ warning on tied events (ogrisel, Jul 4, 2024)
d0834c3  Small improvements (ogrisel, Jul 4, 2024)
5efd0ee  Merge pull request #62 from ArturoAmorQ/use_data_generator (ogrisel, Jul 4, 2024)
139ef51  Use a string in the public API of SurvivalBoost to select the censori… (ogrisel, Jul 4, 2024)
6c644a6  Keep IPCW estimator private API for now (ogrisel, Jul 4, 2024)
012cd5e  replacing any occurence of _est to _estimator (juAlberge, Jul 5, 2024)
176c54b  fixing math, harmonising different notations. (juAlberge, Jul 5, 2024)
c591791  Example survival analysis (#70) (juAlberge, Aug 15, 2024)
cc6b90c  enhance documentation (Vincent-Maladiere, Aug 20, 2024)
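Several of the commits above (3c722ae, 7595c7f, 3b90460) pin down the predict_proba contract: time_horizon may be a float or array-like, and the returned probabilities sum to one across the classes. A hedged sketch of the intended behaviour, continuing from the snippet above; the class ordering and output shape are assumptions inferred from the commit messages, not from the merged code:

# Hypothetical check mirroring the tests referenced in the commit list.
# Assumes one column per class: survival to any event plus each of the
# 3 competing events, i.e. shape (n_samples, n_events + 1).
proba = model.predict_proba(X, time_horizon=1_000.0)  # scalar horizon
assert proba.shape == (X.shape[0], 3 + 1)
np.testing.assert_allclose(proba.sum(axis=1), 1.0, rtol=1e-6)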
doc/api.rst (12 changes: 1 addition & 11 deletions)

@@ -15,7 +15,7 @@ Estimators
    :template: class.rst
    :nosignatures:

-   GradientBoostingIncidence
+   SurvivalBoost


 Metrics
@@ -42,13 +42,3 @@ Datasets
    data.make_synthetic_competing_weibull
    data.load_seer

-
-Inverse Probability Censoring Weight
-------------------------------------
-
-.. autosummary::
-   :toctree: generated/
-   :template: class.rst
-   :nosignatures:
-
-   IPCWEstimator
doc/conf.py (1 change: 1 addition & 0 deletions)

@@ -19,6 +19,7 @@
     'sphinx.ext.autosummary',
     "sphinx.ext.intersphinx",
     'numpydoc',
+    'sphinx_design',
 ]
 templates_path = ['_templates']
 exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
examples/plot_marginal_cumulative_incidence_estimation.py (192 changes: 82 additions & 110 deletions)

@@ -4,76 +4,57 @@
 ==================================================

 This example demonstrates how to estimate the marginal cumulative incidence
-using :class:`hazardous.GradientBoostingIncidence` and compares the results to
-the Aalen-Johansen estimator and to the theoretical cumulated incidence curves
-on synthetic data.
+using :class:`hazardous.SurvivalBoost` and compares the results to the
+Aalen-Johansen estimator and to the theoretical cumulated incidence curves on
+synthetic data.

 Here the data is generated by taking the minimum time of samples from three
 competing Weibull distributions with fixed parameters and without any
 conditioning covariate. In this case, the Aalen-Johansen estimator is expected
 to be unbiased, and this is empirically confirmed by this example.

-The :class:`hazardous.GradientBoostingIncidence` estimator on the other hand is
-a predictive estimator that expects at least one conditioning covariate. In
-this example, we use a dummy covariate that is constant for all samples. Here
-we are not interested in the discrimination power of the estimator: there is
-none by construction, since we do not have access to informative covariates.
-Instead we empirically study its marginal calibration, that is, its ability to
+The :class:`hazardous.SurvivalBoost` estimator on the other hand is a
+predictive estimator that expects at least one conditioning covariate. In this
+example, we use a dummy covariate that is constant for all samples. Here we are
+not interested in the discrimination power of the estimator: there is none by
+construction, since we do not have access to informative covariates. Instead we
+empirically study its marginal calibration, that is, its ability to
 approximately recover an unbiased estimate of the marginal cumulative incidence
 function for each competing event.

-This example also highlights that :class:`hazardous.GradientBoostingIncidence`
-estimates noisy cumulative incidence functions, which are not smooth and not
-even monotonically increasing. This is a known limitation of the estimator,
-and attempting to enforce monotonicity at training time typically introduces
-severe over-estimation bias for large time horizons.
+This example also highlights that :class:`hazardous.SurvivalBoost` estimates
+noisy cumulative incidence functions, which are not smooth and not even
+monotonically increasing. This is a known limitation of the estimator, and
+attempting to enforce monotonicity at training time typically introduces severe
+over-estimation bias for large time horizons.
 """

 # %%
 from time import perf_counter
 import numpy as np
-from scipy.stats import weibull_min
-from sklearn.base import clone
-import pandas as pd
 import matplotlib.pyplot as plt

-from hazardous import GradientBoostingIncidence
+from hazardous import SurvivalBoost
+from hazardous.data import make_synthetic_competing_weibull
 from lifelines import AalenJohansenFitter

-rng = np.random.default_rng(0)
 n_samples = 3_000

-# Non-informative covariate because scikit-learn estimators expect at least
-# one feature.
-X_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)
-
 base_scale = 1_000.0  # some arbitrary time unit

-distributions = [
-    {"event_id": 1, "scale": 10 * base_scale, "shape": 0.5},
-    {"event_id": 2, "scale": 3 * base_scale, "shape": 1},
-    {"event_id": 3, "scale": 3 * base_scale, "shape": 5},
-]
-event_times = np.concatenate(
-    [
-        weibull_min.rvs(
-            dist["shape"],
-            scale=dist["scale"],
-            size=n_samples,
-            random_state=rng,
-        ).reshape(-1, 1)
-        for dist in distributions
-    ],
-    axis=1,
+event_dist_shapes = (0.5, 1.0, 5.0)
+event_dist_scales = (10, 3, 3)
+n_events = len(event_dist_shapes)
+
+_, y_uncensored = make_synthetic_competing_weibull(
+    n_samples=n_samples,
+    n_events=n_events,
+    censoring_relative_scale=0,
+    return_X_y=True,
+    shape_ranges=[(shape, shape) for shape in event_dist_shapes],
+    scale_ranges=[(scale, scale) for scale in event_dist_scales],
+    base_scale=base_scale,
+    random_state=0,
 )
-first_event_idx = np.argmin(event_times, axis=1)
-
-y_uncensored = pd.DataFrame(
-    dict(
-        event=first_event_idx + 1,  # 0 is reserved as the censoring marker
-        duration=event_times[np.arange(n_samples), first_event_idx],
-    )
-)
 y_uncensored["event"].value_counts().sort_index()
 t_max = y_uncensored["duration"].max()

 # %%
@@ -88,7 +69,7 @@
 # distribution <https://en.wikipedia.org/wiki/Weibull_distribution>`_:


-def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
+def weibull_hazard(t, shape=1.0, scale=1.0):
     # Plug an arbitrary finite hazard value at t==0 because fractional powers
     # of 0 are undefined.
     #
@@ -103,19 +84,25 @@ def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
 #
 # Note that true CIFs are independent of the censoring distribution. We can use
 # them as reference to check that the estimators are unbiased by the censoring.
-# Here are the two estimators of interest:
+#
+# We first define the two estimators of interest. The
+# :class:`hazardous.SurvivalBoost` instance uses the Kaplan-Meier estimator on
+# the negated event labels (1 for censoring, 0 for any event) to estimate
+# internal IPCW weights. This is a valid choice in this context because we do
+# not have access to any informative covariate (either for censoring or for the
+# events of interest).

 calculate_variance = n_samples <= 5_000
 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)

-gb_incidence = GradientBoostingIncidence(
+survival_boost = SurvivalBoost(
     learning_rate=0.03,
     n_iter=100,
     max_leaf_nodes=5,
     hard_zero_fraction=0.1,
     min_samples_leaf=50,
-    loss="ibs",
     show_progressbar=False,
+    ipcw_strategy="kaplan-meier",
     random_state=0,
 )
@@ -128,21 +115,25 @@ def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
 # theoretical CIFs:


-def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=None):
-    _, axes = plt.subplots(figsize=(12, 4), ncols=len(distributions), sharey=True)
+def plot_cumulative_incidence_functions(y, survival_boost=None, aj=None):
+    """Plot cause-specific cumulative incidence per event using a dummy covariate"""
+    _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True)

     # Compute the estimate of the CIFs on a coarse grid.
     coarse_timegrid = np.linspace(0, t_max, num=100)

     # Compute the theoretical CIFs by integrating the hazard functions on a
     # fine-grained time grid. Note that integration errors can accumulate quite
-    # quickly if the time grid is resolution too coarse, especially for the
+    # quickly if the time grid's resolution is too coarse, especially for the
     # Weibull distribution with shape < 1.
     tic = perf_counter()
     fine_time_grid = np.linspace(0, t_max, num=10_000_000)
     dt = np.diff(fine_time_grid)[0]
     all_hazards = np.stack(
-        [weibull_hazard(fine_time_grid, **dist) for dist in distributions],
+        [
+            weibull_hazard(fine_time_grid, shape, scale * base_scale)
+            for shape, scale in zip(event_dist_shapes, event_dist_scales)
+        ],
         axis=0,
     )
     any_event_hazards = all_hazards.sum(axis=0)
@@ -157,6 +148,20 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
         "Cause-specific cumulative incidence functions"
         f" ({censoring_fraction:.1%} censoring)"
     )
+    # Non-informative covariate because scikit-learn estimators expect at least
+    # one feature.
+    X_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)
+    if survival_boost is not None:
+        tic = perf_counter()
+        survival_boost.fit(X_dummy, y)
+        duration = perf_counter() - tic
+        print(f"SurvivalBoost fit: {duration:.3f} s")
+        tic = perf_counter()
+        cif_preds = survival_boost.predict_cumulative_incidence(
+            X_dummy, coarse_timegrid
+        )
+        duration = perf_counter() - tic
+        print(f"SurvivalBoost prediction: {duration:.3f} s")

     for event_id, (ax, hazards_i) in enumerate(zip(axes, all_hazards), 1):
         theoretical_cif = (hazards_i * any_event_survival).cumsum(axis=-1) * dt
@@ -172,28 +177,22 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
             label="Theoretical incidence",
         ),

-        if gb_incidence is not None:
-            tic = perf_counter()
-            gb_incidence.set_params(event_of_interest=event_id)
-            gb_incidence.fit(X_dummy, y)
-            duration = perf_counter() - tic
-            print(f"GB Incidence for event {event_id} fit in {duration:.3f} s")
-            tic = perf_counter()
-            cif_pred = gb_incidence.predict_cumulative_incidence(
-                X_dummy[0:1], coarse_timegrid
-            )[0]
-            duration = perf_counter() - tic
-            print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s")
+        if survival_boost is not None:
+            cif_pred = cif_preds[:, event_id][0]
             ax.plot(
                 coarse_timegrid,
                 cif_pred,
-                label="GradientBoostingIncidence",
+                label="SurvivalBoost",
             )
             ax.set(title=f"Event {event_id}")

         if aj is not None:
+            # Randomly break tied durations, to silence a warning raised by the
+            # Aalen-Johansen estimator.
+            rng = np.random.default_rng(0)
+            jitter = rng.normal(scale=1e-3, size=y.shape[0])
             tic = perf_counter()
-            aj.fit(y["duration"], y["event"], event_of_interest=event_id)
+            aj.fit(y["duration"] + jitter, y["event"], event_of_interest=event_id)
             duration = perf_counter() - tic
             print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")
             aj.plot(label="Aalen-Johansen", ax=ax)
@@ -205,7 +204,7 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=


 plot_cumulative_incidence_functions(
-    distributions, gb_incidence=gb_incidence, aj=aj, y=y_uncensored
+    survival_boost=survival_boost, aj=aj, y=y_uncensored
 )

 # %%
@@ -216,31 +215,23 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
 # Add some independent censoring with some arbitrary parameters to control the
 # amount of censoring: lowering the expected value bound increases the amount
 # of censoring.
-censoring_times = weibull_min.rvs(
-    1.0,
-    scale=1.5 * base_scale,
-    size=n_samples,
-    random_state=rng,
-)
-y_censored = pd.DataFrame(
-    dict(
-        event=np.where(
-            censoring_times < y_uncensored["duration"], 0, y_uncensored["event"]
-        ),
-        duration=np.minimum(censoring_times, y_uncensored["duration"]),
-    )
+_, y_censored = make_synthetic_competing_weibull(
+    n_samples=n_samples,
+    n_events=n_events,
+    censoring_relative_scale=1.5,
+    return_X_y=True,
+    shape_ranges=[(shape, shape) for shape in event_dist_shapes],
+    scale_ranges=[(scale, scale) for scale in event_dist_scales],
+    base_scale=base_scale,
+    random_state=0,
 )
 y_censored["event"].value_counts().sort_index()

 # %%
-plot_cumulative_incidence_functions(
-    distributions, gb_incidence=gb_incidence, aj=aj, y=y_censored
-)
+plot_cumulative_incidence_functions(survival_boost=survival_boost, aj=aj, y=y_censored)
 # %%
 #
 # Note that the Aalen-Johansen estimator is unbiased and empirically recovers
 # the theoretical curves both with and without censoring. The
-# GradientBoostingIncidence estimator also appears unbiased by censoring, but
+# SurvivalBoost estimator also appears unbiased by censoring, but
 # the predicted curves are not smooth and not even monotonically increasing. By
 # adjusting the hyper-parameters, notably the learning rate, the number of
 # boosting iterations and leaf nodes, it is possible to somewhat control the
@@ -251,22 +242,3 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
 # Alternatively, we could try to enable a monotonicity constraint at training
 # time, however, in practice this often causes a severe over-estimation bias for
 # the large time horizons:
-
-# %%
-#
-# Finally let's try again to fit the GB Incidence models using a monotonicity
-# constraint:
-
-monotonic_gb_incidence = clone(gb_incidence).set_params(
-    monotonic_incidence="at_training_time"
-)
-plot_cumulative_incidence_functions(
-    distributions, gb_incidence=monotonic_gb_incidence, y=y_censored
-)
-# %%
-#
-# The resulting incidence curves are indeed monotonic. However, for smaller
-# training set sizes, the resulting models can be significantly biased, in
-# particular large time horizons, where the CIFs are getting flatter. This
-# effect diminishes with larger training set sizes (lower epistemic
-# uncertainty).
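The new narrative block in the example states that SurvivalBoost derives its internal IPCW weights from a Kaplan-Meier fit on the negated event labels (1 for censoring, 0 for any event). The PR keeps its KaplanMeierIPCW helper private (commit 6c644a6) and appears to have dropped the Cox PH based conditional censoring path (commit 527f88d), so here is a minimal standalone sketch of the weighting idea using lifelines, which the example already depends on. This is an illustration of the scheme, not the PR's implementation:

import numpy as np
from lifelines import KaplanMeierFitter


def kaplan_meier_ipcw(durations, events, eval_times):
    # Fit Kaplan-Meier on the negated labels: censoring (event == 0) plays
    # the role of the "event", and any actual event counts as censoring.
    km = KaplanMeierFitter()
    km.fit(durations, event_observed=(events == 0))
    # G(t): probability of still being uncensored at each evaluation time.
    g = km.survival_function_at_times(eval_times).to_numpy()
    # IPCW weight = 1 / G(t), clipped to avoid division by zero in the
    # tail of the censoring distribution.
    return 1.0 / np.clip(g, 1e-12, None)

These marginal weights are valid here precisely because the example has no informative covariates; with covariate-dependent censoring, a conditional censoring model would be required instead.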