Skip to content

Commit

Permalink
put functions in the example
Browse files Browse the repository at this point in the history
  • Loading branch information
Vincent-Maladiere committed Dec 24, 2023
1 parent 4926efd commit b03fe1a
Showing 1 changed file with 142 additions and 144 deletions.
286 changes: 142 additions & 144 deletions examples/debaised_bs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
#

# %%
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.base import clone

from hazardous.data._competing_weibull import (
make_complex_features_with_sparse_matrix,
Expand Down Expand Up @@ -38,26 +36,36 @@
)

# %%
import seaborn as sns
from matplotlib import pyplot as plt

from hazardous.data._competing_weibull import _censor


def plot_events(y, kind):
    """Show a stacked histogram of durations, split by event type.

    The title reports the empirical censoring rate, i.e. the fraction of
    rows whose event code is 0, rounded down to an integer percentage.
    ``kind`` labels the censoring mechanism ("independent" or "dependent").
    """
    censoring_rate = int((y["event"] == 0).mean() * 100)
    axis = sns.histplot(
        y, x="duration", hue="event", palette="magma", multiple="stack"
    )
    # {censoring_rate=!r} renders as "censoring_rate=<value>" in the title.
    axis.set(
        title=(
            f"Duration distributions when censoring is {kind} of X, "
            f"{censoring_rate=!r}%"
        )
    )


# Draw censoring times that are independent of the covariates X, then plot
# the resulting duration distributions.
# NOTE: the original span carried two `censoring_relative_scale` keywords
# (0.5 and 1) — a duplicate-keyword SyntaxError; the intended value is 1.
y_censored_indep, shape_censoring_indep, scale_censoring_indep = _censor(
    y_uncensored,
    independent_censoring=True,
    X=X,
    features_censoring_rate=0.5,
    censoring_relative_scale=1,
    random_state=seed,
)

plot_events(y_censored_indep, kind="independent")


# %%
Expand All @@ -72,188 +80,178 @@
censoring_relative_scale=0.5,
random_state=seed,
)

# Plot the dependent-censoring durations via the shared helper; the inline
# sns.histplot block it replaces duplicated plot_events and is removed.
plot_events(y_censored_dep, kind="dependent")


# %%
from hazardous import GradientBoostingIncidence


gbi_indep = GradientBoostingIncidence(
learning_rate=0.1,
n_iter=20,
max_leaf_nodes=15,
hard_zero_fraction=0.1,
min_samples_leaf=5,
loss="ibs",
show_progressbar=False,
random_state=seed,
event_of_interest=event_of_interest,
)
gbi_indep.fit(X, y_censored_indep)

gbi_dep = clone(gbi_indep)
gbi_dep.fit(X, y_censored_dep)

# %%
from matplotlib import pyplot as plt

from hazardous.metrics._brier_score import (
brier_score_incidence,
brier_score_true_probas_incidence,
)


time_grid = gbi_indep.time_grid_
y_pred_indep = gbi_indep.predict_cumulative_incidence(X, times=time_grid)
def plot_brier_scores_comparisons(X, y, shape, scale, kind, event_of_interest=1):
    """Fit a GradientBoostingIncidence model and plot two Brier-score curves.

    Compares the debiased time-varying Brier score — computed with the true
    Weibull censoring distribution (``shape``, ``scale``) — against the
    classical estimate that reweights with a Kaplan-Meier censoring model.

    Parameters
    ----------
    X, y : training covariates and (possibly censored) targets; the model is
        both fit and evaluated on them (in-sample, as in the example).
    shape, scale : parameters of the true censoring distribution, as returned
        by ``_censor``.
    kind : str, label for the plot title ("independent" or "dependent").
    event_of_interest : int, competing event whose incidence is scored.
    """
    gbi = GradientBoostingIncidence(
        learning_rate=0.1,
        n_iter=20,
        max_leaf_nodes=15,
        hard_zero_fraction=0.1,
        min_samples_leaf=5,
        loss="ibs",
        show_progressbar=False,
        random_state=seed,
        event_of_interest=event_of_interest,
    )
    gbi.fit(X, y)
    y_pred = gbi.predict_cumulative_incidence(X)

    # Evaluate both scores on the model's own time grid.
    time_grid = gbi.time_grid_
    debiased_bs_scores = brier_score_true_probas_incidence(
        y_train=y,
        y_test=y,
        y_pred=y_pred,
        times=time_grid,
        shape_censoring=shape,
        scale_censoring=scale,
        event_of_interest=event_of_interest,
    )

    bs_scores = brier_score_incidence(
        y_train=y,
        y_test=y,
        y_pred=y_pred,
        times=time_grid,
        event_of_interest=event_of_interest,
    )

    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(time_grid, debiased_bs_scores, label="from true distrib")
    ax.plot(time_grid, bs_scores, label="from estimate distrib with km")
    ax.set(
        title=f"Time-varying Brier score, {kind} censoring",
    )
    ax.legend()

# Compare debiased vs. KM-estimated Brier scores under both censoring
# mechanisms. (The pre-refactor module-level scoring/plotting statements
# this span contained are superseded by the helper calls below.)
plot_brier_scores_comparisons(
    X,
    y_censored_indep,
    shape=shape_censoring_indep,
    scale=scale_censoring_indep,
    kind="independent",
    event_of_interest=event_of_interest,
)

# %%

plot_brier_scores_comparisons(
    X,
    y_censored_dep,
    shape=shape_censoring_dep,
    scale=scale_censoring_dep,
    kind="dependent",
    event_of_interest=event_of_interest,
)

# %%

from lifelines import AalenJohansenFitter


aj = AalenJohansenFitter(calculate_variance=False, seed=seed)
n_events = y_uncensored["event"].max()
t_max = y_uncensored["duration"].max()
def plot_marginal_incidence(y_censored, y_uncensored, kind):
    """Plot Aalen-Johansen cumulative incidence, censored vs. uncensored.

    One subplot per competing event: the CIF fit on the uncensored targets
    is overlaid with the CIF fit on the censored targets, so the bias
    introduced by censoring is visible. ``kind`` labels the censoring
    mechanism in the figure title.
    """
    aj = AalenJohansenFitter(calculate_variance=False, seed=seed)
    # Event codes are assumed to be 1..n_events, with 0 meaning censored.
    n_events = y_uncensored["event"].max()

    censoring_fraction = (y_censored["event"] == 0).mean()

    # Compute the estimate of the CIFs on a coarse grid.
    _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True)
    plt.suptitle(
        f"Cause-specific cumulative incidence functions, {kind}"
        f" ({censoring_fraction:.1%} censoring)"
    )

    for event_id, ax in enumerate(axes, 1):
        aj.fit(
            y_uncensored["duration"],
            y_uncensored["event"],
            event_of_interest=event_id,
        )
        aj.plot(label="Aalen-Johansen_uncensored", ax=ax)

        aj.fit(
            y_censored["duration"],
            y_censored["event"],
            event_of_interest=event_id,
        )
        aj.plot(label="Aalen-Johansen_censored", ax=ax)

        ax.set_xlim(0, 8_000)
        ax.set_ylim(0, 0.5)
    plt.legend()

ax.set_xlim(0, 8_000)
ax.set_ylim(0, 0.5)
# %%

# Marginal incidence under each censoring mechanism; the pre-refactor
# module-level Aalen-Johansen statements are superseded by these calls.
plot_marginal_incidence(y_censored_indep, y_uncensored, kind="independent")
# %%

plot_marginal_incidence(y_censored_dep, y_uncensored, kind="dependent")


# Compute the estimate of the CIFs on a coarse grid.
_, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True)
# %%

plt.suptitle(
"Cause-specific cumulative incidence functions, Dependent"
f" ({censoring_fraction:.1%} censoring)"
)
from hazardous._ipcw import IPCWEstimator, IPCWSampler

for event_id, ax in enumerate(axes, 1):
aj.fit(y_uncensored["duration"], y_uncensored["event"], event_of_interest=event_id)
aj.plot(label="Aalen-Johansen_uncensored", ax=ax)

aj.fit(
y_censored_dep["duration"], y_censored_dep["event"], event_of_interest=event_id
)
aj.plot(label="Aalen-Johansen_censored", ax=ax)
def plot_ipcw_comparison(
    y_uncensored, y_censored, shape_censoring, scale_censoring, kind
):
    """Compare IPCW weights from the true vs. the estimated censoring model.

    ``G*`` is sampled from the known Weibull censoring distribution
    (``shape_censoring``, ``scale_censoring``), while ``G-hat`` is estimated
    from the censored data with Kaplan-Meier; both are averaged over
    observations and plotted on a common time grid.
    """
    t_max = y_uncensored["duration"].max()
    time_grid = np.linspace(0, t_max, 100)

    estimator = IPCWEstimator().fit(y_censored)
    ipcw_pred = estimator.compute_ipcw_at(time_grid)

    sampler = IPCWSampler(
        shape=shape_censoring,
        scale=scale_censoring,
    ).fit(y_censored)

    # One weight per observation at each grid time; average across samples.
    ipcw_samples = []
    for time_step in time_grid:
        time_step = np.full(y_censored.shape[0], fill_value=time_step)
        ipcw_sample = sampler.compute_ipcw_at(time_step)
        ipcw_samples.append(ipcw_sample)
    ipcw_samples = np.mean(ipcw_samples, axis=1)

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(time_grid, ipcw_samples, label="$G^*$")
    # Raw string: avoids the invalid "\h" escape (same bytes at runtime).
    ax.plot(time_grid, ipcw_pred, label=r"$\hat{G}$, using KM")
    ax.set_title(f"{kind} censoring")
    plt.legend()

# IPCW comparison for both censoring mechanisms; the pre-refactor inline
# estimator/sampler statements and the commented-out dependent variant are
# superseded by these two calls.
plot_ipcw_comparison(
    y_uncensored,
    y_censored_indep,
    shape_censoring=shape_censoring_indep,
    scale_censoring=scale_censoring_indep,
    kind="independent",
)


# %%

plot_ipcw_comparison(
    y_uncensored,
    y_censored_dep,
    shape_censoring=shape_censoring_dep,
    scale_censoring=scale_censoring_dep,
    kind="dependent",
)

# %%

0 comments on commit b03fe1a

Please sign in to comment.