DOC Use data generator in marginal cumulative incidence example
ArturoAmorQ committed Jul 3, 2024
1 parent c0e02eb commit d262cfe
Showing 1 changed file with 43 additions and 57 deletions.
100 changes: 43 additions & 57 deletions examples/plot_marginal_cumulative_incidence_estimation.py
@@ -28,51 +28,40 @@
attempting to enforce monotonicity at training time typically introduces severe
over-estimation bias for large time horizons.
"""

# %%
from time import perf_counter
import numpy as np
from scipy.stats import weibull_min
import pandas as pd
import matplotlib.pyplot as plt

from hazardous import SurvivalBoost
from hazardous.data import make_synthetic_competing_weibull
from lifelines import AalenJohansenFitter

rng = np.random.default_rng(0)
n_events = 3
n_samples = 3_000

# Non-informative covariate because scikit-learn estimators expect at least
# one feature.
X_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)

base_scale = 1_000.0 # some arbitrary time unit

distributions = [
{"event_id": 1, "scale": 10 * base_scale, "shape": 0.5},
{"event_id": 2, "scale": 3 * base_scale, "shape": 1},
{"event_id": 3, "scale": 3 * base_scale, "shape": 5},
]
event_times = np.concatenate(
[
weibull_min.rvs(
dist["shape"],
scale=dist["scale"],
size=n_samples,
random_state=rng,
).reshape(-1, 1)
for dist in distributions
],
axis=1,
shape_ranges = (
(0.5, 0.5),
(1.0, 1.0),
(5.0, 5.0),
)
scale_ranges = (
(10, 10),
(3, 3),
(3, 3),
)
first_event_idx = np.argmin(event_times, axis=1)

y_uncensored = pd.DataFrame(
dict(
event=first_event_idx + 1, # 0 is reserved as the censoring marker
duration=event_times[np.arange(n_samples), first_event_idx],
)
X_uncensored, y_uncensored = make_synthetic_competing_weibull(
n_samples=n_samples,
censoring_relative_scale=0,
return_X_y=True,
shape_ranges=shape_ranges,
scale_ranges=scale_ranges,
base_scale=base_scale,
random_state=0,
)
y_uncensored["event"].value_counts().sort_index()

t_max = y_uncensored["duration"].max()
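
# %%
# (Editor's sketch, not part of the original diff.) Assuming the generator
# reserves 0 as the censoring marker (as the comment in the removed code above
# notes) and that ``censoring_relative_scale=0`` disables censoring, every
# record should be observed:
assert (y_uncensored["event"] != 0).all()
assert y_uncensored.shape[0] == n_samples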

# %%
@@ -87,7 +76,7 @@
# distribution <https://en.wikipedia.org/wiki/Weibull_distribution>`_:


def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
def weibull_hazard(t, shape=1.0, scale=1.0):
# Plug an arbitrary finite hazard value at t==0 because fractional powers
# of 0 are undefined.
#
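
# %%
# (Editor's sketch, not the elided body of ``weibull_hazard``.) Based on the
# standard Weibull hazard, h(t) = (shape / scale) * (t / scale) ** (shape - 1),
# a self-contained version could look like the hypothetical helper below, which
# clips t away from zero so that fractional powers stay finite:
def _weibull_hazard_sketch(t, shape=1.0, scale=1.0, eps=1e-12):
    # For shape < 1 the hazard diverges at t == 0, so clipping only plugs in a
    # large but finite value there, as the comment above describes.
    t = np.maximum(np.asarray(t, dtype=np.float64), eps)
    return (shape / scale) * (t / scale) ** (shape - 1.0)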
@@ -126,21 +115,25 @@ def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
# theoretical CIFs:


def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, aj=None):
_, axes = plt.subplots(figsize=(12, 4), ncols=len(distributions), sharey=True)
def plot_cumulative_incidence_functions(y, survival_boost=None, aj=None):
"""Plot cause-specific cumulative incidence per event using a dummy covariate"""
_, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True)

# Compute the estimate of the CIFs on a coarse grid.
coarse_timegrid = np.linspace(0, t_max, num=100)

# Compute the theoretical CIFs by integrating the hazard functions on a
# fine-grained time grid. Note that integration errors can accumulate quite
# quickly if the time grid is resolution too coarse, especially for the
# quickly if the time grid's resolution is too coarse, especially for the
# Weibull distribution with shape < 1.
tic = perf_counter()
fine_time_grid = np.linspace(0, t_max, num=10_000_000)
dt = np.diff(fine_time_grid)[0]
all_hazards = np.stack(
[weibull_hazard(fine_time_grid, **dist) for dist in distributions],
[
weibull_hazard(fine_time_grid, shape[0], scale[0] * base_scale)
for shape, scale in zip(shape_ranges, scale_ranges)
],
axis=0,
)
any_event_hazards = all_hazards.sum(axis=0)
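# (Editor's sketch of the elided continuation, not the code from this diff;
# variable names are illustrative.) The theoretical CIFs follow from the
# competing-risks identities S(t) = exp(-int_0^t sum_k h_k(u) du) and
# CIF_k(t) = int_0^t h_k(u) S(u) du, which a Riemann sum on the fine grid can
# approximate:
any_event_survival = np.exp(-np.cumsum(any_event_hazards) * dt)
theoretical_cifs = np.cumsum(all_hazards * any_event_survival, axis=-1) * dt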
@@ -155,7 +148,9 @@ def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, aj=None):
"Cause-specific cumulative incidence functions"
f" ({censoring_fraction:.1%} censoring)"
)

# Non-informative covariate because scikit-learn estimators expect at least
# one feature.
X_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)
if survival_boost is not None:
tic = perf_counter()
survival_boost.fit(X_dummy, y)
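# (Editor's sketch of the elided continuation, not the code from this diff; it
# assumes SurvivalBoost exposes ``predict_cumulative_incidence`` accepting a
# ``times`` argument.) After fitting, the predicted incidence curves would
# typically be evaluated on the coarse grid:
cifs_pred = survival_boost.predict_cumulative_incidence(
    X_dummy[:1], times=coarse_timegrid
)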
@@ -205,7 +200,7 @@ def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, aj=None):


plot_cumulative_incidence_functions(
distributions, survival_boost=survival_boost, aj=aj, y=y_uncensored
survival_boost=survival_boost, aj=aj, y=y_uncensored
)
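
# %%
# (Editor's sketch, not shown in this diff.) The ``survival_boost`` and ``aj``
# estimators passed to the call above come from an elided part of the example.
# A minimal construction with illustrative parameters could be the following;
# inside the plot helper, the Aalen-Johansen fitter would then typically be
# refit once per event, e.g.
# ``aj.fit(y["duration"], y["event"], event_of_interest=event_id)``:
survival_boost = SurvivalBoost(random_state=0)
aj = AalenJohansenFitter(calculate_variance=False)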

# %%
@@ -216,26 +211,17 @@ def plot_cumulative_incidence_functions(distributions, y, survival_boost=None, aj=None):
# Add some independent censoring with some arbitrary parameters to control the
# amount of censoring: lowering the expected value of the censoring
# distribution increases the amount of censoring.
censoring_times = weibull_min.rvs(
1.0,
scale=1.5 * base_scale,
size=n_samples,
random_state=rng,
)
y_censored = pd.DataFrame(
dict(
event=np.where(
censoring_times < y_uncensored["duration"], 0, y_uncensored["event"]
),
duration=np.minimum(censoring_times, y_uncensored["duration"]),
)
X_censored, y_censored = make_synthetic_competing_weibull(
n_samples=n_samples,
censoring_relative_scale=1.5,
return_X_y=True,
shape_ranges=shape_ranges,
scale_ranges=scale_ranges,
base_scale=base_scale,
random_state=0,
)
y_censored["event"].value_counts().sort_index()

# %%
plot_cumulative_incidence_functions(
distributions, survival_boost=survival_boost, aj=aj, y=y_censored
)
plot_cumulative_incidence_functions(survival_boost=survival_boost, aj=aj, y=y_censored)
# %%
#
# Note that the Aalen-Johansen estimator is unbiased and empirically recovers
