Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Corrected cross-fitting loop #42

Merged
merged 2 commits into from
Dec 4, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 50 additions & 56 deletions src/benchmark_mediation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1123,7 +1123,9 @@ def med_dml(
Trimming threshold for discarding observations with extreme probability.

normalized : boolean, default=True
Normalizes the inverse probability-based weights.
Normalizes the inverse probability-based weights so they add up to 1, as
described in "Identifying causal mechanisms (primarily) based on inverse probability weighting",
Huber (2014), https://doi.org/10.1002/jae.2341

regularization : boolean, default=True
Whether to use regularized models (logistic or linear regression).
Expand Down Expand Up @@ -1185,8 +1187,6 @@ def med_dml(

# initialisation
(
tte, # test treatment
yte, # test outcome
ptx, # P(T=1|X)
ptmx, # P(T=1|M,X)
mu_t1_m_x, # E[Y|T=1,M,X]
Expand All @@ -1197,11 +1197,9 @@ def med_dml(
w_t1_x, # E[E[Y|T=0,M,X]|T=1,X]
mu_t1_x, # E[Y|T=1,X]
mu_t0_x, # E[Y|T=0,X]
) = [np.empty((max(crossfit, 1),), dtype=object) for _ in range(12)]
) = [np.zeros(n) for _ in range(10)]
bthirion marked this conversation as resolved.
Show resolved Hide resolved

var_name = [
"tte",
"yte",
"ptx",
"ptmx",
"mu_t1_m_x",
Expand All @@ -1228,9 +1226,8 @@ def med_dml(
kf = KFold(n_splits=crossfit)
train_test_list = list(kf.split(x))

for i, train_test in enumerate(train_test_list):
for train, test in train_test_list:
# define test set
train, test = train_test
train1 = train[t[train] == 1]
train0 = train[t[train] == 0]

Expand All @@ -1240,9 +1237,6 @@ def med_dml(
train_nested1 = train_nested[t[train_nested] == 1]
train_nested0 = train_nested[t[train_nested] == 0]

tte[i] = t[test]
yte[i] = y[test]

# predict P(T=1|X)
if use_forest:
res = RandomForestClassifier(n_estimators=100, min_samples_leaf=10).fit(
Expand All @@ -1260,7 +1254,7 @@ def med_dml(
res = CalibratedClassifierCV(res, method=calib_method).fit(
x[train], t[train]
)
ptx[i] = res.predict_proba(x[test])[:, 1]
ptx[test] = res.predict_proba(x[test])[:, 1]

# predict P(T=1|M,X)
if use_forest:
Expand All @@ -1279,7 +1273,7 @@ def med_dml(
res = CalibratedClassifierCV(res, method=calib_method).fit(
xm[train], t[train]
)
ptmx[i] = res.predict_proba(xm[test])[:, 1]
ptmx[test] = res.predict_proba(xm[test])[:, 1]

# predict E[Y|T=1,M,X]
if use_forest:
Expand All @@ -1290,8 +1284,8 @@ def med_dml(
res = LassoCV(alphas=alphas, cv=CV_FOLDS).fit(
xm[train_mean1], y[train_mean1]
)
mu_t1_m_x[i] = res.predict(xm[test])
mu_t1_m_x_nested[i] = res.predict(xm[train_nested])
mu_t1_m_x[test] = res.predict(xm[test])
mu_t1_m_x_nested[train_nested] = res.predict(xm[train_nested])

# predict E[Y|T=0,M,X]
if use_forest:
Expand All @@ -1302,30 +1296,30 @@ def med_dml(
res = LassoCV(alphas=alphas, cv=CV_FOLDS).fit(
xm[train_mean0], y[train_mean0]
)
mu_t0_m_x[i] = res.predict(xm[test])
mu_t0_m_x_nested[i] = res.predict(xm[train_nested])
mu_t0_m_x[test] = res.predict(xm[test])
mu_t0_m_x_nested[train_nested] = res.predict(xm[train_nested])

# predict E[E[Y|T=1,M,X]|T=0,X]
if use_forest:
res = RandomForestRegressor(n_estimators=100, min_samples_leaf=10).fit(
x[train_nested0], mu_t1_m_x_nested[i][t[train_nested] == 0]
x[train_nested0], mu_t1_m_x_nested[train_nested0]
)
else:
res = LassoCV(alphas=alphas, cv=CV_FOLDS).fit(
x[train_nested0], mu_t1_m_x_nested[i][t[train_nested] == 0]
x[train_nested0], mu_t1_m_x_nested[train_nested0]
)
w_t0_x[i] = res.predict(x[test])
w_t0_x[test] = res.predict(x[test])

# predict E[E[Y|T=0,M,X]|T=1,X]
if use_forest:
res = RandomForestRegressor(n_estimators=100, min_samples_leaf=10).fit(
x[train_nested1], mu_t0_m_x_nested[i][t[train_nested] == 1]
x[train_nested1], mu_t0_m_x_nested[train_nested1]
)
else:
res = LassoCV(alphas=alphas, cv=CV_FOLDS).fit(
x[train_nested1], mu_t0_m_x_nested[i][t[train_nested] == 1]
x[train_nested1], mu_t0_m_x_nested[train_nested1]
)
w_t1_x[i] = res.predict(x[test])
w_t1_x[test] = res.predict(x[test])

# predict E[Y|T=1,X]
if use_forest:
Expand All @@ -1334,7 +1328,7 @@ def med_dml(
)
else:
res = LassoCV(alphas=alphas, cv=CV_FOLDS).fit(x[train1], y[train1])
mu_t1_x[i] = res.predict(x[test])
mu_t1_x[test] = res.predict(x[test])

# predict E[Y|T=0,X]
if use_forest:
Expand All @@ -1343,56 +1337,56 @@ def med_dml(
)
else:
res = LassoCV(alphas=alphas, cv=CV_FOLDS).fit(x[train0], y[train0])
mu_t0_x[i] = res.predict(x[test])

# trimming
not_trimmed = (
(((1 - ptmx[i]) * ptx[i]) >= trim)
* ((1 - ptx[i]) >= trim)
* (ptx[i] >= trim)
* (((ptmx[i] * (1 - ptx[i]))) >= trim)
)
for var in var_name:
exec(f"{var}[i] = {var}[i][not_trimmed]")
nobs += np.sum(not_trimmed)
mu_t0_x[test] = res.predict(x[test])

# trimming
not_trimmed = (
(((1 - ptmx) * ptx) >= trim)
* ((1 - ptx) >= trim)
* (ptx >= trim)
* (((ptmx * (1 - ptx))) >= trim)
)
for var in var_name:
exec(f"{var} = {var}[not_trimmed]")
nobs = np.sum(not_trimmed)

# score computing
if normalized:
sumscore1 = [np.mean(_) for _ in (1 - tte) * ptmx / ((1 - ptmx) * ptx)]
sumscore2 = [np.mean(_) for _ in tte / ptx]
sumscore3 = [np.mean(_) for _ in (1 - tte) / (1 - ptx)]
sumscore4 = [np.mean(_) for _ in tte * (1 - ptmx) / (ptmx * (1 - ptx))]
y1m1 = (tte * (yte - mu_t1_x) / ptx) / sumscore2 + mu_t1_x
y0m0 = ((1 - tte) * (yte - mu_t0_x) / (1 - ptx)) / sumscore3 + mu_t0_x
sumscore1 = np.mean(t / ptx)
sumscore2 = np.mean((1 - t) / (1 - ptx))
sumscore3 = np.mean(t * (1 - ptmx) / (ptmx * (1 - ptx)))
sumscore4 = np.mean((1 - t) * ptmx / ((1 - ptmx) * ptx))
y1m1 = (t / ptx * (y - mu_t1_x)) / sumscore1 + mu_t1_x
y0m0 = ((1 - t) / (1 - ptx) * (y - mu_t0_x)) / sumscore2 + mu_t0_x
y1m0 = (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there tests that check whether this snippet computes the right thing ?
Such computation blocks would be better isolated into small functions.

Copy link
Contributor Author

@sami6mz sami6mz Aug 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There aren't any tests; I just made sure it gives similar performance with and without normalization on several datasets. I can declare that as an issue, #44

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we want this lib to survive, we need to add a test suite as the highest priority.
See #45

(tte * (1 - ptmx) / (ptmx * (1 - ptx)) * (yte - mu_t1_m_x)) / sumscore4
+ ((1 - tte) / (1 - ptx) * (mu_t1_m_x - w_t0_x)) / sumscore3
(t * (1 - ptmx) / (ptmx * (1 - ptx)) * (y - mu_t1_m_x)) / sumscore3
+ ((1 - t) / (1 - ptx) * (mu_t1_m_x - w_t0_x)) / sumscore2
+ w_t0_x
)
y0m1 = (
((1 - tte) * ptmx / ((1 - ptmx) * ptx) * (yte - mu_t0_m_x)) / sumscore1
+ (tte / ptx * (mu_t0_m_x - w_t1_x)) / sumscore2
((1 - t) * ptmx / ((1 - ptmx) * ptx) * (y - mu_t0_m_x)) / sumscore4
+ (t / ptx * (mu_t0_m_x - w_t1_x)) / sumscore1
+ w_t1_x
)
else:
y1m1 = tte * (yte - mu_t1_x) / ptx + mu_t1_x
y0m0 = (1 - tte) * (yte - mu_t0_x) / (1 - ptx) + mu_t0_x
y1m1 = t / ptx * (y - mu_t1_x) + mu_t1_x
y0m0 = (1 - t) / (1 - ptx) * (y - mu_t0_x) + mu_t0_x
y1m0 = (
tte * (1 - ptmx) / (ptmx * (1 - ptx)) * (yte - mu_t1_m_x)
+ (1 - tte) / (1 - ptx) * (mu_t1_m_x - w_t0_x)
t * (1 - ptmx) / (ptmx * (1 - ptx)) * (y - mu_t1_m_x)
+ (1 - t) / (1 - ptx) * (mu_t1_m_x - w_t0_x)
+ w_t0_x
)
y0m1 = (
(1 - tte) * ptmx / ((1 - ptmx) * ptx) * (yte - mu_t0_m_x)
+ tte / ptx * (mu_t0_m_x - w_t1_x)
(1 - t) * ptmx / ((1 - ptmx) * ptx) * (y - mu_t0_m_x)
+ t / ptx * (mu_t0_m_x - w_t1_x)
+ w_t1_x
)

# mean score computing
my1m1 = np.mean([np.mean(_) for _ in y1m1])
my0m0 = np.mean([np.mean(_) for _ in y0m0])
my1m0 = np.mean([np.mean(_) for _ in y1m0])
my0m1 = np.mean([np.mean(_) for _ in y0m1])
my1m1 = np.mean(y1m1)
my0m0 = np.mean(y0m0)
my1m0 = np.mean(y1m0)
my0m1 = np.mean(y0m1)

# effects computing
total = my1m1 - my0m0
Expand Down