From d262cfe1c6e8a9406e30f9b3de6e1f14ad98fa7d Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 3 Jul 2024 15:11:07 +0200
Subject: [PATCH] DOC Use data generator in marginal cumulative incidence example

---
 ...arginal_cumulative_incidence_estimation.py | 100 ++++++++----
 1 file changed, 43 insertions(+), 57 deletions(-)

diff --git a/examples/plot_marginal_cumulative_incidence_estimation.py b/examples/plot_marginal_cumulative_incidence_estimation.py
index 3758a1c..4832e72 100644
--- a/examples/plot_marginal_cumulative_incidence_estimation.py
+++ b/examples/plot_marginal_cumulative_incidence_estimation.py
@@ -28,51 +28,40 @@
 attempting to enforce monotonicity at training time typically introduces
 severe over-estimation bias for large time horizons.
 """
+
 # %%
 from time import perf_counter

 import numpy as np
-from scipy.stats import weibull_min
-import pandas as pd
 import matplotlib.pyplot as plt

 from hazardous import SurvivalBoost
+from hazardous.data import make_synthetic_competing_weibull
 from lifelines import AalenJohansenFitter

-rng = np.random.default_rng(0)
+n_events = 3
 n_samples = 3_000
-
-# Non-informative covariate because scikit-learn estimators expect at least
-# one feature.
-X_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)
-
 base_scale = 1_000.0  # some arbitrary time unit
-
-distributions = [
-    {"event_id": 1, "scale": 10 * base_scale, "shape": 0.5},
-    {"event_id": 2, "scale": 3 * base_scale, "shape": 1},
-    {"event_id": 3, "scale": 3 * base_scale, "shape": 5},
-]
-event_times = np.concatenate(
-    [
-        weibull_min.rvs(
-            dist["shape"],
-            scale=dist["scale"],
-            size=n_samples,
-            random_state=rng,
-        ).reshape(-1, 1)
-        for dist in distributions
-    ],
-    axis=1,
+shape_ranges = (
+    (0.5, 0.5),
+    (1.0, 1.0),
+    (5.0, 5.0),
+)
+scale_ranges = (
+    (10, 10),
+    (3, 3),
+    (3, 3),
 )
-first_event_idx = np.argmin(event_times, axis=1)
-y_uncensored = pd.DataFrame(
-    dict(
-        event=first_event_idx + 1,  # 0 is reserved as the censoring marker
-        duration=event_times[np.arange(n_samples), first_event_idx],
-    )
+X_uncensored, y_uncensored = make_synthetic_competing_weibull(
+    n_samples=n_samples,
+    censoring_relative_scale=0,
+    return_X_y=True,
+    shape_ranges=shape_ranges,
+    scale_ranges=scale_ranges,
+    base_scale=base_scale,
+    random_state=0,
 )
-y_uncensored["event"].value_counts().sort_index()
+
 t_max = y_uncensored["duration"].max()

 # %%
@@ -87,7 +76,7 @@
 # distribution `_:


-def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
+def weibull_hazard(t, shape=1.0, scale=1.0):
     # Plug an arbitrary finite hazard value at t==0 because fractional powers
     # of 0 are undefined.
     #
@@ -126,21 +115,25 @@ def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
 # theoretical CIFs:


-def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, aj=None):
-    _, axes = plt.subplots(figsize=(12, 4), ncols=len(distributions), sharey=True)
+def plot_cumulative_incidence_functions(y, survival_boost=None, aj=None):
+    """Plot cause-specific cumulative incidence per event using a dummy covariate."""
+    _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True)

     # Compute the estimate of the CIFs on a coarse grid.
     coarse_timegrid = np.linspace(0, t_max, num=100)

     # Compute the theoretical CIFs by integrating the hazard functions on a
     # fine-grained time grid. Note that integration errors can accumulate quite
-    # quickly if the time grid is resolution too coarse, especially for the
+    # quickly if the time grid's resolution is too coarse, especially for the
     # Weibull distribution with shape < 1.
     tic = perf_counter()
     fine_time_grid = np.linspace(0, t_max, num=10_000_000)
     dt = np.diff(fine_time_grid)[0]
     all_hazards = np.stack(
-        [weibull_hazard(fine_time_grid, **dist) for dist in distributions],
+        [
+            weibull_hazard(fine_time_grid, shape[0], scale[0] * base_scale)
+            for shape, scale in zip(shape_ranges, scale_ranges)
+        ],
         axis=0,
     )
     any_event_hazards = all_hazards.sum(axis=0)
@@ -155,7 +148,9 @@ def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, a
         "Cause-specific cumulative incidence functions"
         f" ({censoring_fraction:.1%} censoring)"
     )
-
+    # Non-informative covariate because scikit-learn estimators expect at least
+    # one feature.
+    X_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)
     if survival_boost is not None:
         tic = perf_counter()
         survival_boost.fit(X_dummy, y)
@@ -205,7 +200,7 @@ def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, a


 plot_cumulative_incidence_functions(
-    distributions, survival_boost=survival_boost, aj=aj, y=y_uncensored
+    survival_boost=survival_boost, aj=aj, y=y_uncensored
 )

 # %%
@@ -216,26 +211,17 @@ def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, a
 # Add some independent censoring with some arbitrary parameters to control the
-# amount of censoring: lowering the expected value bound increases the amount
-# of censoring.
-censoring_times = weibull_min.rvs(
-    1.0,
-    scale=1.5 * base_scale,
-    size=n_samples,
-    random_state=rng,
-)
-y_censored = pd.DataFrame(
-    dict(
-        event=np.where(
-            censoring_times < y_uncensored["duration"], 0, y_uncensored["event"]
-        ),
-        duration=np.minimum(censoring_times, y_uncensored["duration"]),
-    )
+# amount of censoring: lowering ``censoring_relative_scale`` increases the
+# amount of censoring.
+X_censored, y_censored = make_synthetic_competing_weibull(
+    n_samples=n_samples,
+    censoring_relative_scale=1.5,
+    return_X_y=True,
+    shape_ranges=shape_ranges,
+    scale_ranges=scale_ranges,
+    base_scale=base_scale,
+    random_state=0,
 )
-y_censored["event"].value_counts().sort_index()
-# %%
-plot_cumulative_incidence_functions(
-    distributions, survival_boost=survival_boost, aj=aj, y=y_censored
-)
+plot_cumulative_incidence_functions(survival_boost=survival_boost, aj=aj, y=y_censored)

 # %%
 #
 # Note that the Aalen-Johansen estimator is unbiased and empirically recovers
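
Reviewer note (not part of the patch): the ground-truth curves that plot_cumulative_incidence_functions compares against follow the standard competing-risks identity CIF_k(t) = integral from 0 to t of hazard_k(u) * S(u) du, where S is the any-event survival function. Below is a minimal standalone sketch of that numerical integration for reference only; the time grid size, the horizon and the variable names are illustrative assumptions of this note (the example itself integrates on a much finer 10_000_000-point grid up to t_max).

import numpy as np


def weibull_hazard(t, shape=1.0, scale=1.0):
    # Cause-specific Weibull hazard: (shape / scale) * (t / scale) ** (shape - 1).
    return (shape / scale) * (t / scale) ** (shape - 1.0)


# Deterministic shape/scale values matching the patch (low == high bounds).
shapes = (0.5, 1.0, 5.0)
scales = (10 * 1_000.0, 3 * 1_000.0, 3 * 1_000.0)

# Start slightly after 0 to avoid the singular hazard of the shape < 1 Weibull
# at t == 0; the grid size and horizon here are arbitrary illustration choices.
time_grid = np.linspace(1e-6, 6_000.0, num=100_000)
dt = np.diff(time_grid)[0]

hazards = np.stack(
    [weibull_hazard(time_grid, k, s) for k, s in zip(shapes, scales)], axis=0
)
any_event_hazard = hazards.sum(axis=0)
# Any-event survival: S(t) = exp(-integral of the total hazard up to t).
survival = np.exp(-(any_event_hazard.cumsum() * dt))
# Cause-specific CIFs: CIF_k(t) = integral of hazard_k(u) * S(u) du up to t.
cifs = (hazards * survival).cumsum(axis=1) * dt
print(cifs[:, -1])  # marginal incidence of each event at the grid horizon

As a quick sanity check on the integration, the three printed values sum to roughly one minus the any-event survival at the horizon.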