From d262cfe1c6e8a9406e30f9b3de6e1f14ad98fa7d Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 3 Jul 2024 15:11:07 +0200
Subject: [PATCH] DOC Use data generator in marginal cumulative incidence example

---
 ...arginal_cumulative_incidence_estimation.py | 100 ++++++++----
 1 file changed, 43 insertions(+), 57 deletions(-)

diff --git a/examples/plot_marginal_cumulative_incidence_estimation.py b/examples/plot_marginal_cumulative_incidence_estimation.py
index 3758a1c..4832e72 100644
--- a/examples/plot_marginal_cumulative_incidence_estimation.py
+++ b/examples/plot_marginal_cumulative_incidence_estimation.py
@@ -28,51 +28,40 @@
 attempting to enforce monotonicity at training time typically introduces
 severe over-estimation bias for large time horizons.
 """
+
 # %%
 from time import perf_counter

 import numpy as np
-from scipy.stats import weibull_min
-import pandas as pd
 import matplotlib.pyplot as plt

 from hazardous import SurvivalBoost
+from hazardous.data import make_synthetic_competing_weibull
 from lifelines import AalenJohansenFitter

-rng = np.random.default_rng(0)
+n_events = 3
 n_samples = 3_000
-
-# Non-informative covariate because scikit-learn estimators expect at least
-# one feature.
-X_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)
-
 base_scale = 1_000.0  # some arbitrary time unit
-
-distributions = [
-    {"event_id": 1, "scale": 10 * base_scale, "shape": 0.5},
-    {"event_id": 2, "scale": 3 * base_scale, "shape": 1},
-    {"event_id": 3, "scale": 3 * base_scale, "shape": 5},
-]
-event_times = np.concatenate(
-    [
-        weibull_min.rvs(
-            dist["shape"],
-            scale=dist["scale"],
-            size=n_samples,
-            random_state=rng,
-        ).reshape(-1, 1)
-        for dist in distributions
-    ],
-    axis=1,
+shape_ranges = (
+    (0.5, 0.5),
+    (1.0, 1.0),
+    (5.0, 5.0),
+)
+scale_ranges = (
+    (10, 10),
+    (3, 3),
+    (3, 3),
 )
-first_event_idx = np.argmin(event_times, axis=1)
-y_uncensored = pd.DataFrame(
-    dict(
-        event=first_event_idx + 1,  # 0 is reserved as the censoring marker
-        duration=event_times[np.arange(n_samples), first_event_idx],
-    )
+X_uncensored, y_uncensored = make_synthetic_competing_weibull(
+    n_samples=n_samples,
+    censoring_relative_scale=0,
+    return_X_y=True,
+    shape_ranges=shape_ranges,
+    scale_ranges=scale_ranges,
+    base_scale=base_scale,
+    random_state=0,
 )
-y_uncensored["event"].value_counts().sort_index()
+
 t_max = y_uncensored["duration"].max()

 # %%
@@ -87,7 +76,7 @@
 # distribution `_:


-def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
+def weibull_hazard(t, shape=1.0, scale=1.0):
     # Plug an arbitrary finite hazard value at t==0 because fractional powers
     # of 0 are undefined.
     #
@@ -126,21 +115,25 @@ def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
 # theoretical CIFs:


-def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, aj=None):
-    _, axes = plt.subplots(figsize=(12, 4), ncols=len(distributions), sharey=True)
+def plot_cumulative_incidence_functions(y, survival_boost=None, aj=None):
+    """Plot cause-specific cumulative incidence per event using a dummy covariate."""
+    _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True)

     # Compute the estimate of the CIFs on a coarse grid.
     coarse_timegrid = np.linspace(0, t_max, num=100)

     # Compute the theoretical CIFs by integrating the hazard functions on a
     # fine-grained time grid. Note that integration errors can accumulate quite
-    # quickly if the time grid is resolution too coarse, especially for the
+    # quickly if the time grid's resolution is too coarse, especially for the
     # Weibull distribution with shape < 1.
     tic = perf_counter()
     fine_time_grid = np.linspace(0, t_max, num=10_000_000)
     dt = np.diff(fine_time_grid)[0]
     all_hazards = np.stack(
-        [weibull_hazard(fine_time_grid, **dist) for dist in distributions],
+        [
+            weibull_hazard(fine_time_grid, shape[0], scale[0] * base_scale)
+            for shape, scale in zip(shape_ranges, scale_ranges)
+        ],
         axis=0,
     )
     any_event_hazards = all_hazards.sum(axis=0)
@@ -155,7 +148,9 @@ def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, a
         "Cause-specific cumulative incidence functions"
         f" ({censoring_fraction:.1%} censoring)"
     )
-
+    # Non-informative covariate because scikit-learn estimators expect at least
+    # one feature.
+    X_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)
     if survival_boost is not None:
         tic = perf_counter()
         survival_boost.fit(X_dummy, y)
@@ -205,7 +200,7 @@ def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, a


 plot_cumulative_incidence_functions(
-    distributions, survival_boost=survival_boost, aj=aj, y=y_uncensored
+    survival_boost=survival_boost, aj=aj, y=y_uncensored
 )

 # %%
@@ -216,26 +211,17 @@ def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, a
 # Add some independent censoring with some arbitrary parameters to control the
-# amount of censoring: lowering the expected value bound increases the amount
-# of censoring.
-censoring_times = weibull_min.rvs(
-    1.0,
-    scale=1.5 * base_scale,
-    size=n_samples,
-    random_state=rng,
-)
-y_censored = pd.DataFrame(
-    dict(
-        event=np.where(
-            censoring_times < y_uncensored["duration"], 0, y_uncensored["event"]
-        ),
-        duration=np.minimum(censoring_times, y_uncensored["duration"]),
-    )
+# amount of censoring: lowering ``censoring_relative_scale`` increases the
+# amount of censoring.
+X_censored, y_censored = make_synthetic_competing_weibull(
+    n_samples=n_samples,
+    censoring_relative_scale=1.5,
+    return_X_y=True,
+    shape_ranges=shape_ranges,
+    scale_ranges=scale_ranges,
+    base_scale=base_scale,
+    random_state=0,
 )
-y_censored["event"].value_counts().sort_index()
-# %%
-plot_cumulative_incidence_functions(
-    distributions, survival_boost=survival_boost, aj=aj, y=y_censored
-)
+plot_cumulative_incidence_functions(survival_boost=survival_boost, aj=aj, y=y_censored)

 # %%
 #
 # Note that the Aalen-Johansen estimator is unbiased and empirically recovers
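
Reviewer note (not part of the patch): the ground-truth curves that plot_cumulative_incidence_functions compares against follow the standard competing-risks identity CIF_k(t) = integral from 0 to t of hazard_k(u) * S(u) du, where S is the any-event survival function. Below is a minimal standalone sketch of that numerical integration for reference only; the time grid size, the horizon and the variable names are illustrative assumptions of this note (the example itself integrates on a much finer 10_000_000-point grid up to t_max).

import numpy as np


def weibull_hazard(t, shape=1.0, scale=1.0):
    # Cause-specific Weibull hazard: (shape / scale) * (t / scale) ** (shape - 1).
    return (shape / scale) * (t / scale) ** (shape - 1.0)


# Deterministic shape/scale values matching the patch (low == high bounds).
shapes = (0.5, 1.0, 5.0)
scales = (10 * 1_000.0, 3 * 1_000.0, 3 * 1_000.0)

# Start slightly after 0 to avoid the singular hazard of the shape < 1 Weibull
# at t == 0; the grid size and horizon here are arbitrary illustration choices.
time_grid = np.linspace(1e-6, 6_000.0, num=100_000)
dt = np.diff(time_grid)[0]

hazards = np.stack(
    [weibull_hazard(time_grid, k, s) for k, s in zip(shapes, scales)], axis=0
)
any_event_hazard = hazards.sum(axis=0)
# Any-event survival: S(t) = exp(-integral of the total hazard up to t).
survival = np.exp(-(any_event_hazard.cumsum() * dt))
# Cause-specific CIFs: CIF_k(t) = integral of hazard_k(u) * S(u) du up to t.
cifs = (hazards * survival).cumsum(axis=1) * dt
print(cifs[:, -1])  # marginal incidence of each event at the grid horizon

As a quick sanity check on the integration, the three printed values sum to roughly one minus the any-event survival at the horizon.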