From 9bff44b88d021d74501cfe2a27b84bdbeda5ab9f Mon Sep 17 00:00:00 2001 From: brash6 Date: Thu, 14 Nov 2024 15:22:57 +0100 Subject: [PATCH] files cleaning --- src/med_bench/estimation/base.py | 10 +- .../estimation/mediation_g_computation.py | 10 +- src/med_bench/example.py | 118 --- src/med_bench/get_estimation_results.py | 371 -------- src/med_bench/mediation.py | 796 ------------------ src/med_bench/r_mediation.py | 205 +++++ src/med_bench/utils/constants.py | 4 +- src/med_bench/utils/nuisances.py | 476 ----------- src/med_bench/utils/utils.py | 13 +- 9 files changed, 226 insertions(+), 1777 deletions(-) delete mode 100644 src/med_bench/example.py delete mode 100644 src/med_bench/get_estimation_results.py delete mode 100644 src/med_bench/mediation.py create mode 100644 src/med_bench/r_mediation.py delete mode 100644 src/med_bench/utils/nuisances.py diff --git a/src/med_bench/estimation/base.py b/src/med_bench/estimation/base.py index 6bfa501..9df9491 100644 --- a/src/med_bench/estimation/base.py +++ b/src/med_bench/estimation/base.py @@ -325,9 +325,9 @@ def _estimate_mediator_probability(self, t, m, x, y): Returns ------- - f_m0, array-like, shape (n_samples) + f_m0x, array-like, shape (n_samples) probabilities f(M|T=0,X) - f_m1, array-like, shape (n_samples) + f_m1x, array-like, shape (n_samples) probabilities f(M|T=1,X) """ n = len(y) @@ -340,10 +340,10 @@ def _estimate_mediator_probability(self, t, m, x, y): t0_x = np.hstack([t0.reshape(-1, 1), x]) t1_x = np.hstack([t1.reshape(-1, 1), x]) - fm_0 = self._classifier_m.predict_proba(t0_x)[:, 1] - fm_1 = self._classifier_m.predict_proba(t1_x)[:, 1] + f_m0x = self._classifier_m.predict_proba(t0_x)[:, m] + f_m1x = self._classifier_m.predict_proba(t1_x)[:, m] - return fm_0, fm_1 + return f_m0x, f_m1x def _estimate_mediators_probabilities(self, t, m, x, y): """ diff --git a/src/med_bench/estimation/mediation_g_computation.py b/src/med_bench/estimation/mediation_g_computation.py index a813b41..abf6f86 100644 --- a/src/med_bench/estimation/mediation_g_computation.py +++ b/src/med_bench/estimation/mediation_g_computation.py @@ -2,7 +2,7 @@ from med_bench.estimation.base import Estimator from med_bench.utils.decorators import fitted -from med_bench.utils.utils import is_array_integer +from med_bench.utils.utils import is_array_binary class GComputation(Estimator): @@ -35,7 +35,7 @@ def fit(self, t, m, x, y): """Fits nuisance parameters to data""" t, m, x, y = self._resize(t, m, x, y) - if is_array_integer(m): + if is_array_binary(m): self._fit_mediator_nuisance(t, m, x, y) self._fit_conditional_mean_outcome_nuisance(t, m, x, y) else: @@ -55,10 +55,10 @@ def estimate(self, t, m, x, y): """ t, m, x, y = self._resize(t, m, x, y) - if is_array_integer(m): - mu_00x, mu_01x, mu_10x, mu_11x = self._estimate_mediators_probabilities( + if is_array_binary(m): + f_00x, f_01x, f_10x, f_11x = self._estimate_mediators_probabilities( t, m, x, y) - f_00x, f_01x, f_10x, f_11x = self._estimate_conditional_mean_outcome( + mu_00x, mu_01x, mu_10x, mu_11x = self._estimate_conditional_mean_outcome( t, m, x, y) direct_effect_i1 = mu_11x - mu_01x diff --git a/src/med_bench/example.py b/src/med_bench/example.py deleted file mode 100644 index 7768760..0000000 --- a/src/med_bench/example.py +++ /dev/null @@ -1,118 +0,0 @@ -from numpy.random import default_rng -import pandas as pd -from sklearn.calibration import CalibratedClassifierCV -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LogisticRegressionCV, RidgeCV -from sklearn.model_selection import train_test_split - -from med_bench.mediation import (mediation_IPW, mediation_coefficient_product, mediation_dml, - mediation_g_formula, mediation_multiply_robust) -from med_bench.estimation.mediation_coefficient_product import CoefficientProduct -from med_bench.estimation.mediation_dml import DoubleMachineLearning -from med_bench.estimation.mediation_g_computation import GComputation -from med_bench.estimation.mediation_ipw import ImportanceWeighting -from med_bench.estimation.mediation_mr import MultiplyRobust -from med_bench.get_simulated_data import simulate_data -from med_bench.nuisances.utils import _get_regularization_parameters -from med_bench.utils.constants import CV_FOLDS - - -print("get simulated data") -(x, t, m, y, - theta_1_delta_0, theta_1, theta_0, delta_1, delta_0, - p_t, th_p_t_mx) = simulate_data(n=1000, rg=default_rng(321), dim_x=5) - -(x_train, x_test, t_train, t_test, - m_train, m_test, y_train, y_test) = train_test_split(x, t, m, y, test_size=0.33, random_state=42) - -cs, alphas = _get_regularization_parameters(regularization=True) - -clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - -clf2 = LogisticRegressionCV(random_state=42, Cs=cs, cv=CV_FOLDS) - -reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - -reg2 = RidgeCV(alphas=alphas, cv=CV_FOLDS) - -# Step 4: Define estimators (modularized and non-modularized) -estimators = { - "CoefficientProduct": { - "modular": CoefficientProduct( - regressor=reg, classifier=clf, regularize=True - ), - "non_modular": mediation_coefficient_product - }, - "DoubleMachineLearning": { - "modular": DoubleMachineLearning( - clip=1e-6, trim=0.05, normalized=True, regressor=reg2, classifier=clf2 - ), - "non_modular": mediation_dml - }, - "GComputation": { - "modular": GComputation( - regressor=reg2, classifier=CalibratedClassifierCV( - clf2, method="sigmoid") - ), - "non_modular": mediation_g_formula - }, - "ImportanceWeighting": { - "modular": ImportanceWeighting( - clip=1e-6, trim=0.01, regressor=reg2, classifier=CalibratedClassifierCV(clf2, method="sigmoid") - ), - "non_modular": mediation_IPW - }, - "MultiplyRobust": { - "modular": MultiplyRobust( - clip=1e-6, ratio="propensities", normalized=True, regressor=reg2, - classifier=CalibratedClassifierCV(clf2, method="sigmoid") - ), - "non_modular": mediation_multiply_robust - } -} - -# Step 5: Initialize results DataFrame -results = [] - -# Step 6: Iterate over each estimator -for estimator_name, estimator_dict in estimators.items(): - # Non-Modularized Estimation - # Check if non-modular is a function - if callable(estimator_dict["non_modular"]): - (total_effect, direct_effect1, direct_effect2, indirect_effect1, indirect_effect2, _) = estimator_dict["non_modular"]( - y, t, m, x) - - results.append({ - "Estimator": estimator_name, - "Method": "Non-Modularized", - "Total Effect": total_effect, - "Direct Effect (Treated)": direct_effect1, - "Direct Effect (Control)": direct_effect2, - "Indirect Effect (Treated)": indirect_effect1, - "Indirect Effect (Control)": indirect_effect2, - }) - - # Modularized Estimation - modular_estimator = estimator_dict["modular"] - modular_estimator.fit(t_train, m_train, x_train, y_train) - causal_effects = modular_estimator.estimate( - t_test, m_test, x_test, y_test) - - # Append modularized results - results.append({ - "Estimator": estimator_name, - "Method": "Modularized", - "Total Effect": causal_effects['total_effect'], - "Direct Effect (Treated)": causal_effects['direct_effect_treated'], - "Direct Effect (Control)": causal_effects['direct_effect_control'], - "Indirect Effect (Treated)": causal_effects['indirect_effect_treated'], - "Indirect Effect (Control)": causal_effects['indirect_effect_control'], - }) - -# Convert results to DataFrame -results_df = pd.DataFrame(results) - -# Display or save the DataFrame -print(results_df) diff --git a/src/med_bench/get_estimation_results.py b/src/med_bench/get_estimation_results.py deleted file mode 100644 index 8ca2c2e..0000000 --- a/src/med_bench/get_estimation_results.py +++ /dev/null @@ -1,371 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- - -import numpy as np - -from .mediation import ( - mediation_dml, - r_mediation_g_estimator, - r_mediation_dml, - r_mediate, -) - -from med_bench.estimation.mediation_coefficient_product import CoefficientProduct -from med_bench.estimation.mediation_dml import DoubleMachineLearning -from med_bench.estimation.mediation_g_computation import GComputation -from med_bench.estimation.mediation_ipw import InversePropensityWeighting -from med_bench.estimation.mediation_mr import MultiplyRobust -from med_bench.utils.utils import _get_regularization_parameters -from med_bench.utils.constants import CV_FOLDS - -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LogisticRegressionCV, RidgeCV -from sklearn.calibration import CalibratedClassifierCV - - -def transform_outputs(causal_effects): - """Transforms outputs in the old format - - Args: - causal_effects (dict): dictionary of causal effects - - Returns: - list: list of causal effects - """ - total = causal_effects['total_effect'] - direct_treated = causal_effects['direct_effect_treated'] - direct_control = causal_effects['direct_effect_control'] - indirect_treated = causal_effects['indirect_effect_treated'] - indirect_control = causal_effects['indirect_effect_control'] - - return [total, direct_treated, direct_control, indirect_treated, indirect_control, 0] - - -def get_estimation_results(x, t, m, y, estimator, config): - """Dynamically selects and calls an estimator (class-based or legacy function) to estimate total, direct, and indirect effects.""" - - effects = None # Initialize variable to store the effects - - # Helper function for regularized regressor and classifier initialization - def get_regularized_regressor_and_classifier(regularize=True, calibration=None, method="sigmoid"): - cs, alphas = _get_regularization_parameters(regularization=regularize) - clf = LogisticRegressionCV(random_state=42, Cs=cs, cv=CV_FOLDS) - reg = RidgeCV(alphas=alphas, cv=CV_FOLDS) - if calibration: - clf = CalibratedClassifierCV(clf, method=method) - return clf, reg - - if estimator == "mediation_IPW_R": - # Use R-based mediation estimator with direct output extraction - x_r, t_r, m_r, y_r = [_convert_array_to_R(uu) for uu in (x, t, m, y)] - output_w = causalweight.medweight( - y=y_r, d=t_r, m=m_r, x=x_r, trim=0.0, ATET="FALSE", logit="TRUE", boot=2 - ) - raw_res_R = np.array(output_w.rx2("results")) - effects = raw_res_R[0, :] - - elif estimator == "coefficient_product": - # Class-based implementation for CoefficientProduct - estimator_obj = CoefficientProduct(regularize=True) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_ipw_noreg": - # Class-based implementation for InversePropensityWeighting without regularization - clf, reg = get_regularized_regressor_and_classifier(regularize=False) - estimator_obj = InversePropensityWeighting( - clip=1e-6, trim=0, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_ipw_reg": - # Class-based implementation with regularization - clf, reg = get_regularized_regressor_and_classifier(regularize=True) - estimator_obj = InversePropensityWeighting( - clip=1e-6, trim=0, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_ipw_reg_calibration": - # Class-based implementation with regularization and calibration (sigmoid) - clf, reg = get_regularized_regressor_and_classifier( - regularize=True, calibration=True, method="sigmoid") - estimator_obj = InversePropensityWeighting( - clip=1e-6, trim=0, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_ipw_reg_calibration_iso": - # Class-based implementation with isotonic calibration - clf, reg = get_regularized_regressor_and_classifier( - regularize=True, calibration=True, method="isotonic") - estimator_obj = InversePropensityWeighting( - clip=1e-6, trim=0, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_ipw_forest": - # Class-based implementation with forest models - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - estimator_obj = InversePropensityWeighting( - clip=1e-6, trim=0, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_ipw_forest_calibration": - # Class-based implementation with forest and calibrated sigmoid - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - calibrated_clf = CalibratedClassifierCV(clf, method="sigmoid") - estimator_obj = InversePropensityWeighting( - clip=1e-6, trim=0, classifier=calibrated_clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_ipw_forest_calibration_iso": - # Class-based implementation with isotonic calibration - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - calibrated_clf = CalibratedClassifierCV(clf, method="isotonic") - estimator_obj = InversePropensityWeighting( - clip=1e-6, trim=0, classifier=calibrated_clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_g_computation_noreg": - # Class-based implementation of GComputation without regularization - clf, reg = get_regularized_regressor_and_classifier(regularize=False) - estimator_obj = GComputation(regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_g_computation_reg": - # Class-based implementation of GComputation with regularization - clf, reg = get_regularized_regressor_and_classifier(regularize=True) - estimator_obj = GComputation(regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_g_computation_reg_calibration": - # Class-based implementation with regularization and calibrated sigmoid - clf, reg = get_regularized_regressor_and_classifier( - regularize=True, calibration=True, method="sigmoid") - estimator_obj = GComputation(regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_g_computation_reg_calibration_iso": - # Class-based implementation with isotonic calibration - clf, reg = get_regularized_regressor_and_classifier( - regularize=True, calibration=True, method="isotonic") - estimator_obj = GComputation(regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_g_computation_forest": - # Class-based implementation with forest models - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - estimator_obj = GComputation(regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_multiply_robust_noreg": - # Class-based implementation for MultiplyRobust without regularization - clf, reg = get_regularized_regressor_and_classifier(regularize=False) - estimator_obj = MultiplyRobust( - ratio="propensities", normalized=True, regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "simulation_based": - # R-based function for simulation - effects = r_mediate(y, t, m, x, interaction=False) - - elif estimator == "mediation_dml": - # R-based function for Double Machine Learning with legacy config - effects = r_mediation_dml(y, t, m, x, trim=0.0, order=1) - - elif estimator == "mediation_dml_noreg": - # Class-based implementation for DoubleMachineLearning without regularization - clf, reg = get_regularized_regressor_and_classifier(regularize=False) - estimator_obj = DoubleMachineLearning( - clip=1e-6, trim=0, normalized=True, regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # Regularized MultiplyRobust estimator - elif estimator == "mediation_multiply_robust_reg": - clf, reg = get_regularized_regressor_and_classifier(regularize=True) - estimator_obj = MultiplyRobust( - ratio="propensities", normalized=True, regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # Regularized MultiplyRobust with sigmoid calibration - elif estimator == "mediation_multiply_robust_reg_calibration": - clf, reg = get_regularized_regressor_and_classifier( - regularize=True, calibration=True, method="sigmoid") - estimator_obj = MultiplyRobust( - ratio="propensities", normalized=True, regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # Regularized MultiplyRobust with isotonic calibration - elif estimator == "mediation_multiply_robust_reg_calibration_iso": - clf, reg = get_regularized_regressor_and_classifier( - regularize=True, calibration=True, method="isotonic") - estimator_obj = MultiplyRobust( - ratio="propensities", normalized=True, regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_multiply_robust_forest": - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - estimator = MultiplyRobust( - ratio="propensities", normalized=True, regressor=reg, - classifier=clf) - estimator.fit(t, m, x, y) - causal_effects = estimator.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # MultiplyRobust with forest and sigmoid calibration - elif estimator == "mediation_multiply_robust_forest_calibration": - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - calibrated_clf = CalibratedClassifierCV(clf, method="sigmoid") - estimator_obj = MultiplyRobust( - ratio="propensities", normalized=True, regressor=reg, classifier=calibrated_clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # MultiplyRobust with forest and isotonic calibration - elif estimator == "mediation_multiply_robust_forest_calibration_iso": - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - calibrated_clf = CalibratedClassifierCV(clf, method="isotonic") - estimator_obj = MultiplyRobust( - ratio="propensities", normalized=True, regressor=reg, classifier=calibrated_clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # Regularized Double Machine Learning - elif estimator == "mediation_dml_reg": - clf, reg = get_regularized_regressor_and_classifier(regularize=True) - estimator_obj = DoubleMachineLearning( - clip=1e-6, trim=0, normalized=True, regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # Double Machine Learning with fixed seed - elif estimator == "mediation_dml_reg_fixed_seed": - effects = mediation_dml( - y, t, m, x, trim=0, clip=1e-6, random_state=321, calibration=None) - - # Regularized Double Machine Learning with sigmoid calibration - elif estimator == "mediation_dml_reg_calibration": - clf, reg = get_regularized_regressor_and_classifier( - regularize=True, calibration=True, method="sigmoid") - estimator_obj = DoubleMachineLearning( - clip=1e-6, trim=0, normalized=True, regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # Regularized Double Machine Learning with forest models - elif estimator == "mediation_dml_forest": - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - estimator_obj = DoubleMachineLearning( - clip=1e-6, trim=0, normalized=True, regressor=reg, classifier=clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # Double Machine Learning with forest and calibrated sigmoid - elif estimator == "mediation_dml_forest_calibration": - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - calibrated_clf = CalibratedClassifierCV(clf, method="sigmoid") - estimator_obj = DoubleMachineLearning( - clip=1e-6, trim=0, normalized=True, regressor=reg, classifier=calibrated_clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # GComputation with forest and sigmoid calibration - elif estimator == "mediation_g_computation_forest_calibration": - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - calibrated_clf = CalibratedClassifierCV(clf, method="sigmoid") - estimator_obj = GComputation(regressor=reg, classifier=calibrated_clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - # GComputation with forest and isotonic calibration - elif estimator == "mediation_g_computation_forest_calibration_iso": - clf = RandomForestClassifier( - random_state=42, n_estimators=100, min_samples_leaf=10) - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=42) - calibrated_clf = CalibratedClassifierCV(clf, method="isotonic") - estimator_obj = GComputation(regressor=reg, classifier=calibrated_clf) - estimator_obj.fit(t, m, x, y) - causal_effects = estimator_obj.estimate(t, m, x, y) - effects = transform_outputs(causal_effects) - - elif estimator == "mediation_g_estimator": - if config in (0, 1, 2): - effects = r_mediation_g_estimator(y, t, m, x) - else: - raise ValueError("Unrecognized estimator label.") - - # Catch unsupported estimators and raise an error - if effects is None: - raise ValueError( - f"Estimation failed for {estimator}. Check inputs or configuration.") - return effects diff --git a/src/med_bench/mediation.py b/src/med_bench/mediation.py deleted file mode 100644 index 66b69fd..0000000 --- a/src/med_bench/mediation.py +++ /dev/null @@ -1,796 +0,0 @@ -""" -the objective of this script is to implement estimators for mediation in -causal inference, simulate data, and evaluate and compare estimators -""" - -# first step, run r code to have the original implementation by Huber -# using rpy2 to have the same data in R and python... - -import numpy as np -import pandas as pd -from sklearn.base import clone - - -from .utils.nuisances import ( - _estimate_conditional_mean_outcome, - _estimate_cross_conditional_mean_outcome, - _estimate_cross_conditional_mean_outcome_nesting, - _estimate_mediator_density, - _estimate_treatment_probabilities, - _get_classifier, - _get_regressor, -) -from .utils.utils import r_dependency_required, _check_input - -ALPHAS = np.logspace(-5, 5, 8) -CV_FOLDS = 5 -TINY = 1.0e-12 - - -def mediation_IPW( - y, - t, - m, - x, - trim=0.05, - regularization=True, - forest=False, - crossfit=0, - clip=1e-6, - calibration="sigmoid", -): - """ - IPW estimator presented in - HUBER, Martin. Identifying causal mechanisms (primarily) based on inverse - probability weighting. Journal of Applied Econometrics, 2014, - vol. 29, no 6, p. 920-943. - - Parameters - ---------- - y : array-like, shape (n_samples) - outcome value for each unit, continuous - - t : array-like, shape (n_samples) - treatment value for each unit, binary - - m : array-like, shape (n_samples, n_features_mediator) - mediator value for each unit, can be continuous or binary, and - multidimensional - - x : array-like, shape (n_samples, n_features_covariates) - covariates (potential confounders) values - - trim : float - Trimming rule for discarding observations with extreme propensity - scores. In the absence of post-treatment confounders (w=NULL), - observations with Pr(D=1|M,X)(1-trim) are - dropped. In the presence of post-treatment confounders - (w is defined), observations with Pr(D=1|M,W,X)(1-trim) are dropped. - - regularization : boolean, default=True - whether to use regularized models (logistic or - linear regression). If True, cross-validation is used - to chose among 8 potential log-spaced values between - 1e-5 and 1e5 - - forest : boolean, default=False - whether to use a random forest model to estimate the propensity - scores instead of logistic regression - - crossfit : integer, default=0 - number of folds for cross-fitting - - clip : float, default=1e-6 - limit to clip for numerical stability (min=clip, max=1-clip) - - calibration : str, default=sigmoid - calibration mode; for example using a sigmoid function - - Returns - ------- - float - total effect - float - direct effect treated (\theta(1)) - float - direct effect nontreated (\theta(0)) - float - indirect effect treated (\delta(1)) - float - indirect effect untreated (\delta(0)) - int - number of used observations (non trimmed) - """ - # check input - y, t, m, x = _check_input(y, t, m, x, setting="multidimensional") - - # estimate propensities - classifier_t_x = _get_classifier(regularization, forest, calibration) - classifier_t_xm = _get_classifier(regularization, forest, calibration) - p_x, p_xm = _estimate_treatment_probabilities( - t, m, x, crossfit, classifier_t_x, classifier_t_xm - ) - - # trimming. Following causal weight code, not sure I understand - # why we trim only on p_xm and not on p_x - ind = (p_xm > trim) & (p_xm < (1 - trim)) - y, t, p_x, p_xm = y[ind], t[ind], p_x[ind], p_xm[ind] - - # note on the names, ytmt' = Y(t, M(t')), the treatment needs to be - # binary but not the mediator - p_x = np.clip(p_x, clip, 1 - clip) - p_xm = np.clip(p_xm, clip, 1 - clip) - - # importance weighting - y1m1 = np.sum(y * t / p_x) / np.sum(t / p_x) - y1m0 = np.sum(y * t * (1 - p_xm) / (p_xm * (1 - p_x))) / np.sum( - t * (1 - p_xm) / (p_xm * (1 - p_x)) - ) - y0m0 = np.sum(y * (1 - t) / (1 - p_x)) / np.sum((1 - t) / (1 - p_x)) - y0m1 = np.sum(y * (1 - t) * p_xm / ((1 - p_xm) * p_x)) / np.sum( - (1 - t) * p_xm / ((1 - p_xm) * p_x) - ) - - return ( - y1m1 - y0m0, - y1m1 - y0m1, - y1m0 - y0m0, - y1m1 - y1m0, - y0m1 - y0m0, - np.sum(ind), - ) - - -def mediation_g_formula( - y, - t, - m, - x, - interaction=False, - forest=False, - crossfit=0, - regularization=True, - calibration="sigmoid", -): - """ - Warning : m needs to be binary - - implementation of the g formula for mediation - - Parameters - ---------- - y : array-like, shape (n_samples) - outcome value for each unit, continuous - - t : array-like, shape (n_samples) - treatment value for each unit, binary - - m : array-like, shape (n_samples) - mediator value for each unit, here m is necessary binary and uni- - dimensional - - x : array-like, shape (n_samples, n_features_covariates) - covariates (potential confounders) values - - interaction : boolean, default=False - whether to include interaction terms in the model - interactions are terms XT, TM, MX - - forest : boolean, default=False - whether to use a random forest model to estimate the propensity - scores instead of logistic regression, and outcome model instead - of linear regression - - crossfit : integer, default=0 - number of folds for cross-fitting - - regularization : boolean, default=True - whether to use regularized models (logistic or - linear regression). If True, cross-validation is used - to chose among 8 potential log-spaced values between - 1e-5 and 1e5 - - calibration : str, default=sigmoid - calibration mode; for example using a sigmoid function - """ - # check input - y, t, m, x = _check_input(y, t, m, x, setting="binary") - - # estimate mediator densities - classifier_m = _get_classifier(regularization, forest, calibration) - f_00x, f_01x, f_10x, f_11x, _, _ = _estimate_mediator_density( - y, t, m, x, crossfit, classifier_m, interaction - ) - - # estimate conditional mean outcomes - regressor_y = _get_regressor(regularization, forest) - mu_00x, mu_01x, mu_10x, mu_11x, _, _ = _estimate_conditional_mean_outcome( - y, t, m, x, crossfit, regressor_y, interaction - ) - - # G computation - direct_effect_i1 = mu_11x - mu_01x - direct_effect_i0 = mu_10x - mu_00x - n = len(y) - direct_effect_treated = ( - direct_effect_i1 * f_11x + direct_effect_i0 * f_10x - ).sum() / n - direct_effect_control = ( - direct_effect_i1 * f_01x + direct_effect_i0 * f_00x - ).sum() / n - indirect_effect_i1 = f_11x - f_01x - indirect_effect_i0 = f_10x - f_00x - indirect_effect_treated = ( - indirect_effect_i1 * mu_11x + indirect_effect_i0 * mu_10x - ).sum() / n - indirect_effect_control = ( - indirect_effect_i1 * mu_01x + indirect_effect_i0 * mu_00x - ).sum() / n - total_effect = direct_effect_control + indirect_effect_treated - - return ( - total_effect, - direct_effect_treated, - direct_effect_control, - indirect_effect_treated, - indirect_effect_control, - None, - ) - - -def mediation_multiply_robust( - y, - t, - m, - x, - interaction=False, - forest=False, - crossfit=0, - clip=1e-6, - normalized=True, - regularization=True, - calibration="sigmoid", -): - """ - Presented in Eric J. Tchetgen Tchetgen. Ilya Shpitser. - "Semiparametric theory for causal mediation analysis: Efficiency bounds, - multiple robustness and sensitivity analysis." - Ann. Statist. 40 (3) 1816 - 1845, June 2012. - https://doi.org/10.1214/12-AOS990 - - Parameters - ---------- - y : array-like, shape (n_samples) - Outcome value for each unit, continuous - - t : array-like, shape (n_samples) - Treatment value for each unit, binary - - m : array-like, shape (n_samples) - Mediator value for each unit, binary and unidimensional - - x : array-like, shape (n_samples, n_features_covariates) - Covariates value for each unit, continuous - - interaction : boolean, default=False - Whether to include interaction terms in the model - interactions are terms XT, TM, MX - - forest : boolean, default=False - Whether to use a random forest model to estimate the propensity - scores instead of logistic regression, and outcome model instead - of linear regression - - crossfit : integer, default=0 - Number of folds for cross-fitting. If crossfit<2, no cross-fitting is - applied - - clip : float, default=1e-6 - Limit to clip p_x and f_mtx for numerical stability (min=clip, - max=1-clip) - - normalized : boolean, default=True - Normalizes the inverse probability-based weights so they add up to 1, - as described in "Identifying causal mechanisms (primarily) based on - inverse probability weighting", Huber (2014), - https://doi.org/10.1002/jae.2341 - - regularization : boolean, default=True - Whether to use regularized models (logistic or linear regression). - If True, cross-validation is used to chose among 8 potential - log-spaced values between 1e-5 and 1e5 - - calibration : str, default="sigmoid" - Which calibration method to use. - Implemented calibration methods are "sigmoid" and "isotonic". - - - Returns - ------- - total : float - Average total effect. - direct1 : float - Direct effect on the exposed. - direct0 : float - Direct effect on the unexposed, - indirect1 : float - Indirect effect on the exposed. - indirect0 : float - Indirect effect on the unexposed. - n_discarded : int - Number of discarded samples due to trimming. - - - Raises - ------ - ValueError - - If t or y are multidimensional. - - If x, t, m, or y don't have the same length. - - If m is not binary. - """ - # check input - y, t, m, x = _check_input(y, t, m, x, setting="binary") - - # estimate propensities - classifier_t_x = _get_classifier(regularization, forest, calibration) - p_x, _ = _estimate_treatment_probabilities( - t, m, x, crossfit, classifier_t_x, clone(classifier_t_x) - ) - - # estimate mediator densities - classifier_m = _get_classifier(regularization, forest, calibration) - f_00x, f_01x, f_10x, f_11x, f_m0x, f_m1x = _estimate_mediator_density( - y, t, m, x, crossfit, classifier_m, interaction - ) - f = f_00x, f_01x, f_10x, f_11x - - # estimate conditional mean outcomes - regressor_y = _get_regressor(regularization, forest) - regressor_cross_y = _get_regressor(regularization, forest) - mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 = ( - _estimate_cross_conditional_mean_outcome( - y, t, m, x, crossfit, regressor_y, regressor_cross_y, f, interaction - ) - ) - - # clipping - p_x_clip = p_x != np.clip(p_x, clip, 1 - clip) - f_m0x_clip = f_m0x != np.clip(f_m0x, clip, 1 - clip) - f_m1x_clip = f_m1x != np.clip(f_m1x, clip, 1 - clip) - clipped = p_x_clip + f_m0x_clip + f_m1x_clip - - var_name = ["t", "y", "p_x", "f_m0x", "f_m1x", "mu_1mx", "mu_0mx"] - var_name += ["E_mu_t1_t1", "E_mu_t0_t0", "E_mu_t1_t0", "E_mu_t0_t1"] - n_discarded = 0 - for var in var_name: - exec(f"{var} = {var}[~clipped]") - n_discarded += np.sum(clipped) - - # score computing - if normalized: - sum_score_m1 = np.mean(t / p_x) - sum_score_m0 = np.mean((1 - t) / (1 - p_x)) - sum_score_t1m0 = np.mean((t / p_x) * (f_m0x / f_m1x)) - sum_score_t0m1 = np.mean((1 - t) / (1 - p_x) * (f_m1x / f_m0x)) - - y1m1 = (t / p_x * (y - E_mu_t1_t1)) / sum_score_m1 + E_mu_t1_t1 - y0m0 = ((1 - t) / (1 - p_x) * (y - E_mu_t0_t0)) / \ - sum_score_m0 + E_mu_t0_t0 - y1m0 = ( - ((t / p_x) * (f_m0x / f_m1x) * (y - mu_1mx)) / sum_score_t1m0 - + ((1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0)) / sum_score_m0 - + E_mu_t1_t0 - ) - y0m1 = ( - ((1 - t) / (1 - p_x) * (f_m1x / f_m0x) * (y - mu_0mx)) / sum_score_t0m1 - + t / p_x * (mu_0mx - E_mu_t0_t1) / sum_score_m1 - + E_mu_t0_t1 - ) - else: - y1m1 = t / p_x * (y - E_mu_t1_t1) + E_mu_t1_t1 - y0m0 = (1 - t) / (1 - p_x) * (y - E_mu_t0_t0) + E_mu_t0_t0 - y1m0 = ( - (t / p_x) * (f_m0x / f_m1x) * (y - mu_1mx) - + (1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0) - + E_mu_t1_t0 - ) - y0m1 = ( - (1 - t) / (1 - p_x) * (f_m1x / f_m0x) * (y - mu_0mx) - + t / p_x * (mu_0mx - E_mu_t0_t1) - + E_mu_t0_t1 - ) - - # effects computing - total = np.mean(y1m1 - y0m0) - direct1 = np.mean(y1m1 - y0m1) - direct0 = np.mean(y1m0 - y0m0) - indirect1 = np.mean(y1m1 - y1m0) - indirect0 = np.mean(y0m1 - y0m0) - - return total, direct1, direct0, indirect1, indirect0, n_discarded - - -@r_dependency_required(["mediation", "stats", "base"]) -def r_mediate(y, t, m, x, interaction=False): - """ - This function calls the R function mediate from the package mediation - (https://cran.r-project.org/package=mediation) - - Parameters - ---------- - y : array-like, shape (n_samples) - outcome value for each unit, continuous - - t : array-like, shape (n_samples) - treatment value for each unit, binary - - m : array-like, shape (n_samples) - mediator value for each unit, here m is necessary binary and - unidimensional - - x : array-like, shape (n_samples, n_features_covariates) - covariates (potential confounders) values - - interaction : boolean, default=False - whether to include interaction terms in the model - interactions are terms XT, TM, MX - """ - - import rpy2.robjects as robjects - import rpy2.robjects.packages as rpackages - from rpy2.robjects import numpy2ri, pandas2ri - - pandas2ri.activate() - numpy2ri.activate() - - mediation = rpackages.importr("mediation") - Rstats = rpackages.importr("stats") - base = rpackages.importr("base") - - # check input - y, t, m, x = _check_input(y, t, m, x, setting="binary") - m = m.ravel() - - var_names = [[y, "y"], [t, "t"], [m, "m"], [x, "x"]] - df_list = list() - for var, name in var_names: - if len(var.shape) > 1: - var_dim = var.shape[1] - col_names = ["{}_{}".format(name, i) for i in range(var_dim)] - sub_df = pd.DataFrame(var, columns=col_names) - else: - sub_df = pd.DataFrame(var, columns=[name]) - df_list.append(sub_df) - df = pd.concat(df_list, axis=1) - m_features = [c for c in df.columns if ("y" not in c) and ("m" not in c)] - y_features = [c for c in df.columns if ("y" not in c)] - if not interaction: - m_formula = "m ~ " + " + ".join(m_features) - y_formula = "y ~ " + " + ".join(y_features) - else: - m_formula = "m ~ " + " + ".join( - m_features + [":".join(p) for p in combinations(m_features, 2)] - ) - y_formula = "y ~ " + " + ".join( - y_features + [":".join(p) for p in combinations(y_features, 2)] - ) - robjects.globalenv["df"] = df - mediator_model = Rstats.lm(m_formula, data=base.as_symbol("df")) - outcome_model = Rstats.lm(y_formula, data=base.as_symbol("df")) - res = mediation.mediate( - mediator_model, outcome_model, treat="t", mediator="m", boot=True, sims=1 - ) - - relevant_variables = ["tau.coef", "z1", "z0", "d1", "d0"] - to_return = [np.array(res.rx2(v))[0] for v in relevant_variables] - return to_return + [None] - - -@r_dependency_required(["plmed", "base"]) -def r_mediation_g_estimator(y, t, m, x): - """ - This function calls the R G-estimator from the package plmed - (https://github.com/ohines/plmed) - """ - - import rpy2.robjects as robjects - import rpy2.robjects.packages as rpackages - from rpy2.robjects import numpy2ri, pandas2ri - - pandas2ri.activate() - numpy2ri.activate() - - plmed = rpackages.importr("plmed") - base = rpackages.importr("base") - - # check input - y, t, m, x = _check_input(y, t, m, x, setting="binary") - m = m.ravel() - - var_names = [[y, "y"], [t, "t"], [m, "m"], [x, "x"]] - df_list = list() - for var, name in var_names: - if len(var.shape) > 1: - var_dim = var.shape[1] - col_names = ["{}_{}".format(name, i) for i in range(var_dim)] - sub_df = pd.DataFrame(var, columns=col_names) - else: - sub_df = pd.DataFrame(var, columns=[name]) - df_list.append(sub_df) - df = pd.concat(df_list, axis=1) - m_features = [c for c in df.columns if ("x" in c)] - y_features = [c for c in df.columns if ("x" in c)] - t_features = [c for c in df.columns if ("x" in c)] - m_formula = "m ~ " + " + ".join(m_features) - y_formula = "y ~ " + " + ".join(y_features) - t_formula = "t ~ " + " + ".join(t_features) - robjects.globalenv["df"] = df - res = plmed.G_estimation( - t_formula, - m_formula, - y_formula, - exposure_family="binomial", - data=base.as_symbol("df"), - ) - direct_effect = res.rx2("coef")[0] - indirect_effect = res.rx2("coef")[1] - return ( - direct_effect + indirect_effect, - direct_effect, - direct_effect, - indirect_effect, - indirect_effect, - None, - ) - - -@r_dependency_required(["causalweight", "base"]) -def r_mediation_dml(y, t, m, x, trim=0.05, order=1): - """ - This function calls the R Double Machine Learning estimator from the - package causalweight (https://cran.r-project.org/web/packages/causalweight) - - Parameters - ---------- - y : array-like, shape (n_samples) - outcome value for each unit, continuous - - t : array-like, shape (n_samples) - treatment value for each unit, binary - - m : array-like, shape (n_samples, n_features_mediator) - mediator value for each unit, can be continuous or binary, and - multi-dimensional - - x : array-like, shape (n_samples, n_features_covariates) - covariates (potential confounders) values - - trim : float, default=0.05 - Trimming rule for discarding observations with extreme - conditional treatment or mediator probabilities - (or products thereof). Observations with (products of) - conditional probabilities that are smaller than trim in any - denominator of the potential outcomes are dropped. - - order : integer, default=1 - If set to an integer larger than 1, then polynomials of that - order and interactions using the power series) rather than the - original control variables are used in the estimation of any - conditional probability or conditional mean outcome. - Polynomials/interactions are created using the Generate. - Powers command of the LARF package. - """ - - import rpy2.robjects.packages as rpackages - from rpy2.robjects import numpy2ri, pandas2ri - from .utils.utils import _convert_array_to_R - - pandas2ri.activate() - numpy2ri.activate() - - causalweight = rpackages.importr("causalweight") - base = rpackages.importr("base") - - # check input - y, t, m, x = _check_input(y, t, m, x, setting="multidimensional") - - x_r, t_r, m_r, y_r = [ - base.as_matrix(_convert_array_to_R(uu)) for uu in (x, t, m, y) - ] - res = causalweight.medDML(y_r, t_r, m_r, x_r, trim=trim, order=order) - raw_res_R = np.array(res.rx2("results")) - ntrimmed = res.rx2("ntrimmed")[0] - return list(raw_res_R[0, :5]) + [ntrimmed] - - -def mediation_dml( - y, - t, - m, - x, - forest=False, - crossfit=0, - trim=0.05, - clip=1e-6, - normalized=True, - regularization=True, - random_state=None, - calibration=None, -): - """ - Python implementation of Double Machine Learning procedure, as described - in : - Helmut Farbmacher and others, Causal mediation analysis with double - machine learning, - The Econometrics Journal, Volume 25, Issue 2, May 2022, Pages 277–300, - https://doi.org/10.1093/ectj/utac003 - - Parameters - ---------- - - y : array-like, shape (n_samples) - Outcome value for each unit. - - t : array-like, shape (n_samples) - Treatment value for each unit. - - m : array-like, shape (n_samples, n_features_mediator) - Mediator value for each unit, multidimensional or continuous. - - x : array-like, shape (n_samples, n_features_covariates) - Covariates value for each unit, multidimensional or continuous. - - forest : boolean, default=False - Whether to use a random forest model to estimate the propensity - scores instead of logistic regression, and outcome model instead - of linear regression. - - crossfit : int, default=0 - Number of folds for cross-fitting. - - trim : float, default=0.05 - Trimming treshold for discarding observations with extreme probability. - - clip : float, default=1e-6 - limit to clip for numerical stability (min=clip, max=1-clip) - - normalized : boolean, default=True - Normalizes the inverse probability-based weights so they add up to 1, - as described in "Identifying causal mechanisms (primarily) based on - inverse probability weighting", - Huber (2014), https://doi.org/10.1002/jae.2341 - - regularization : boolean, default=True - Whether to use regularized models (logistic or linear regression). - If True, cross-validation is used to chose among 8 potential - log-spaced values between 1e-5 and 1e5. - - random_state : int, default=None - LogisticRegression random state instance. - - calibration : {None, "sigmoid", "isotonic"}, default=None - Whether to add a calibration step for the classifier used to estimate - the treatment propensity score and P(T|M,X). "None" means no - calibration. - Calibration ensures the output of the [predict_proba] - (https://scikit-learn.org/stable/glossary.html#term-predict_proba) - method can be directly interpreted as a confidence level. - Implemented calibration methods are "sigmoid" and "isotonic". - - Returns - ------- - total : float - Average total effect. - direct1 : float - Direct effect on the exposed. - direct0 : float - Direct effect on the unexposed, - indirect1 : float - Indirect effect on the exposed. - indirect0 : float - Indirect effect on the unexposed. - n_discarded : int - Number of discarded samples due to trimming. - - Raises - ------ - ValueError - - If t or y are multidimensional. - - If x, t, m, or y don't have the same length. - """ - # check input - y, t, m, x = _check_input(y, t, m, x, setting="multidimensional") - n = len(y) - - nobs = 0 - - var_name = [ - "p_x", - "p_xm", - "mu_1mx", - "mu_0mx", - "E_mu_t1_t0", - "E_mu_t0_t1", - "E_mu_t1_t1", - "E_mu_t0_t0", - ] - - # estimate propensities - classifier_t_x = _get_classifier(regularization, forest, calibration) - classifier_t_xm = _get_classifier(regularization, forest, calibration) - p_x, p_xm = _estimate_treatment_probabilities( - t, m, x, crossfit, classifier_t_x, classifier_t_xm - ) - - # estimate conditional mean outcomes - regressor_y = _get_regressor(regularization, forest) - regressor_cross_y = _get_regressor(regularization, forest) - - mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 = ( - _estimate_cross_conditional_mean_outcome_nesting( - y, t, m, x, crossfit, regressor_y, regressor_cross_y - ) - ) - - # trimming - not_trimmed = ( - (((1 - p_xm) * p_x) >= trim) - * ((1 - p_x) >= trim) - * (p_x >= trim) - * ((p_xm * (1 - p_x)) >= trim) - ) - for var in var_name: - exec(f"{var} = {var}[not_trimmed]") - nobs = np.sum(not_trimmed) - - # clipping - p_x = np.clip(p_x, clip, 1 - clip) - p_xm = np.clip(p_xm, clip, 1 - clip) - - # score computing - if normalized: - sum_score_m1 = np.mean(t / p_x) - sum_score_m0 = np.mean((1 - t) / (1 - p_x)) - sum_score_t1m0 = np.mean(t * (1 - p_xm) / (p_xm * (1 - p_x))) - sum_score_t0m1 = np.mean((1 - t) * p_xm / ((1 - p_xm) * p_x)) - y1m1 = (t / p_x * (y - E_mu_t1_t1)) / sum_score_m1 + E_mu_t1_t1 - y0m0 = ((1 - t) / (1 - p_x) * (y - E_mu_t0_t0)) / \ - sum_score_m0 + E_mu_t0_t0 - y1m0 = ( - (t * (1 - p_xm) / (p_xm * (1 - p_x)) * (y - mu_1mx)) / sum_score_t1m0 - + ((1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0)) / sum_score_m0 - + E_mu_t1_t0 - ) - y0m1 = ( - ((1 - t) * p_xm / ((1 - p_xm) * p_x) * (y - mu_0mx)) / sum_score_t0m1 - + (t / p_x * (mu_0mx - E_mu_t0_t1)) / sum_score_m1 - + E_mu_t0_t1 - ) - else: - y1m1 = t / p_x * (y - E_mu_t1_t1) + E_mu_t1_t1 - y0m0 = (1 - t) / (1 - p_x) * (y - E_mu_t0_t0) + E_mu_t0_t0 - y1m0 = ( - t * (1 - p_xm) / (p_xm * (1 - p_x)) * (y - mu_1mx) - + (1 - t) / (1 - p_x) * (mu_1mx - E_mu_t1_t0) - + E_mu_t1_t0 - ) - y0m1 = ( - (1 - t) * p_xm / ((1 - p_xm) * p_x) * (y - mu_0mx) - + t / p_x * (mu_0mx - E_mu_t0_t1) - + E_mu_t0_t1 - ) - - # mean score computing - my1m1 = np.mean(y1m1) - my0m0 = np.mean(y0m0) - my1m0 = np.mean(y1m0) - my0m1 = np.mean(y0m1) - - # effects computing - total = my1m1 - my0m0 - direct1 = my1m1 - my0m1 - direct0 = my1m0 - my0m0 - indirect1 = my1m1 - my1m0 - indirect0 = my0m1 - my0m0 - return total, direct1, direct0, indirect1, indirect0, n - nobs diff --git a/src/med_bench/r_mediation.py b/src/med_bench/r_mediation.py new file mode 100644 index 0000000..1afc8ef --- /dev/null +++ b/src/med_bench/r_mediation.py @@ -0,0 +1,205 @@ +""" +the objective of this script is to implement estimators for mediation in +causal inference, simulate data, and evaluate and compare estimators +""" + +# first step, run r code to have the original implementation by Huber +# using rpy2 to have the same data in R and python... + +import numpy as np +import pandas as pd + +from .utils.utils import r_dependency_required, _check_input + + +@r_dependency_required(["mediation", "stats", "base"]) +def r_mediate(y, t, m, x, interaction=False): + """ + This function calls the R function mediate from the package mediation + (https://cran.r-project.org/package=mediation) + + Parameters + ---------- + y : array-like, shape (n_samples) + outcome value for each unit, continuous + + t : array-like, shape (n_samples) + treatment value for each unit, binary + + m : array-like, shape (n_samples) + mediator value for each unit, here m is necessary binary and + unidimensional + + x : array-like, shape (n_samples, n_features_covariates) + covariates (potential confounders) values + + interaction : boolean, default=False + whether to include interaction terms in the model + interactions are terms XT, TM, MX + """ + + import rpy2.robjects as robjects + import rpy2.robjects.packages as rpackages + from rpy2.robjects import numpy2ri, pandas2ri + + pandas2ri.activate() + numpy2ri.activate() + + mediation = rpackages.importr("mediation") + Rstats = rpackages.importr("stats") + base = rpackages.importr("base") + + # check input + y, t, m, x = _check_input(y, t, m, x, setting="binary") + m = m.ravel() + + var_names = [[y, "y"], [t, "t"], [m, "m"], [x, "x"]] + df_list = list() + for var, name in var_names: + if len(var.shape) > 1: + var_dim = var.shape[1] + col_names = ["{}_{}".format(name, i) for i in range(var_dim)] + sub_df = pd.DataFrame(var, columns=col_names) + else: + sub_df = pd.DataFrame(var, columns=[name]) + df_list.append(sub_df) + df = pd.concat(df_list, axis=1) + m_features = [c for c in df.columns if ("y" not in c) and ("m" not in c)] + y_features = [c for c in df.columns if ("y" not in c)] + if not interaction: + m_formula = "m ~ " + " + ".join(m_features) + y_formula = "y ~ " + " + ".join(y_features) + else: + m_formula = "m ~ " + " + ".join( + m_features + [":".join(p) for p in combinations(m_features, 2)] + ) + y_formula = "y ~ " + " + ".join( + y_features + [":".join(p) for p in combinations(y_features, 2)] + ) + robjects.globalenv["df"] = df + mediator_model = Rstats.lm(m_formula, data=base.as_symbol("df")) + outcome_model = Rstats.lm(y_formula, data=base.as_symbol("df")) + res = mediation.mediate( + mediator_model, outcome_model, treat="t", mediator="m", boot=True, sims=1 + ) + + relevant_variables = ["tau.coef", "z1", "z0", "d1", "d0"] + to_return = [np.array(res.rx2(v))[0] for v in relevant_variables] + return to_return + [None] + + +@r_dependency_required(["plmed", "base"]) +def r_mediation_g_estimator(y, t, m, x): + """ + This function calls the R G-estimator from the package plmed + (https://github.com/ohines/plmed) + """ + + import rpy2.robjects as robjects + import rpy2.robjects.packages as rpackages + from rpy2.robjects import numpy2ri, pandas2ri + + pandas2ri.activate() + numpy2ri.activate() + + plmed = rpackages.importr("plmed") + base = rpackages.importr("base") + + # check input + y, t, m, x = _check_input(y, t, m, x, setting="binary") + m = m.ravel() + + var_names = [[y, "y"], [t, "t"], [m, "m"], [x, "x"]] + df_list = list() + for var, name in var_names: + if len(var.shape) > 1: + var_dim = var.shape[1] + col_names = ["{}_{}".format(name, i) for i in range(var_dim)] + sub_df = pd.DataFrame(var, columns=col_names) + else: + sub_df = pd.DataFrame(var, columns=[name]) + df_list.append(sub_df) + df = pd.concat(df_list, axis=1) + m_features = [c for c in df.columns if ("x" in c)] + y_features = [c for c in df.columns if ("x" in c)] + t_features = [c for c in df.columns if ("x" in c)] + m_formula = "m ~ " + " + ".join(m_features) + y_formula = "y ~ " + " + ".join(y_features) + t_formula = "t ~ " + " + ".join(t_features) + robjects.globalenv["df"] = df + res = plmed.G_estimation( + t_formula, + m_formula, + y_formula, + exposure_family="binomial", + data=base.as_symbol("df"), + ) + direct_effect = res.rx2("coef")[0] + indirect_effect = res.rx2("coef")[1] + return ( + direct_effect + indirect_effect, + direct_effect, + direct_effect, + indirect_effect, + indirect_effect, + None, + ) + + +@r_dependency_required(["causalweight", "base"]) +def r_mediation_dml(y, t, m, x, trim=0.05, order=1): + """ + This function calls the R Double Machine Learning estimator from the + package causalweight (https://cran.r-project.org/web/packages/causalweight) + + Parameters + ---------- + y : array-like, shape (n_samples) + outcome value for each unit, continuous + + t : array-like, shape (n_samples) + treatment value for each unit, binary + + m : array-like, shape (n_samples, n_features_mediator) + mediator value for each unit, can be continuous or binary, and + multi-dimensional + + x : array-like, shape (n_samples, n_features_covariates) + covariates (potential confounders) values + + trim : float, default=0.05 + Trimming rule for discarding observations with extreme + conditional treatment or mediator probabilities + (or products thereof). Observations with (products of) + conditional probabilities that are smaller than trim in any + denominator of the potential outcomes are dropped. + + order : integer, default=1 + If set to an integer larger than 1, then polynomials of that + order and interactions using the power series) rather than the + original control variables are used in the estimation of any + conditional probability or conditional mean outcome. + Polynomials/interactions are created using the Generate. + Powers command of the LARF package. + """ + + import rpy2.robjects.packages as rpackages + from rpy2.robjects import numpy2ri, pandas2ri + from .utils.utils import _convert_array_to_R + + pandas2ri.activate() + numpy2ri.activate() + + causalweight = rpackages.importr("causalweight") + base = rpackages.importr("base") + + # check input + y, t, m, x = _check_input(y, t, m, x, setting="multidimensional") + + x_r, t_r, m_r, y_r = [ + base.as_matrix(_convert_array_to_R(uu)) for uu in (x, t, m, y) + ] + res = causalweight.medDML(y_r, t_r, m_r, x_r, trim=trim, order=order) + raw_res_R = np.array(res.rx2("results")) + ntrimmed = res.rx2("ntrimmed")[0] + return list(raw_res_R[0, :5]) + [ntrimmed] diff --git a/src/med_bench/utils/constants.py b/src/med_bench/utils/constants.py index eb11572..706b84b 100644 --- a/src/med_bench/utils/constants.py +++ b/src/med_bench/utils/constants.py @@ -79,7 +79,9 @@ def get_tolerance_array(tolerance_size: str) -> np.array: "mediation_multiply_robust_forest_calibration": LARGE_TOLERANCE, "simulation_based": LARGE_TOLERANCE, "mediation_dml": INFINITE_TOLERANCE, - "mediation_dml_reg_fixed_seed": INFINITE_TOLERANCE, + "mediation_dml_noreg": INFINITE_TOLERANCE, + "mediation_dml_reg": INFINITE_TOLERANCE, + "mediation_dml_forest": INFINITE_TOLERANCE, "mediation_g_estimator": LARGE_TOLERANCE, } diff --git a/src/med_bench/utils/nuisances.py b/src/med_bench/utils/nuisances.py deleted file mode 100644 index bb60e08..0000000 --- a/src/med_bench/utils/nuisances.py +++ /dev/null @@ -1,476 +0,0 @@ -""" -the objective of this script is to implement nuisances functions -used in mediation estimators in causal inference -""" - -import numpy as np -from sklearn.base import clone -from sklearn.calibration import CalibratedClassifierCV -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LogisticRegressionCV, RidgeCV -from sklearn.model_selection import KFold - -from .utils import check_r_dependencies, _get_interactions - -if check_r_dependencies(): - pass - - -ALPHAS = np.logspace(-5, 5, 8) -CV_FOLDS = 5 -TINY = 1.0e-12 - - -def _get_train_test_lists(crossfit, n, x): - """ - Obtain train and test folds - - Returns - ------- - train_test_list : list - indexes with train and test indexes - """ - if crossfit < 2: - train_test_list = [[np.arange(n), np.arange(n)]] - else: - kf = KFold(n_splits=crossfit) - train_test_list = list() - for train_index, test_index in kf.split(x): - train_test_list.append([train_index, test_index]) - return train_test_list - - -def _get_regularization_parameters(regularization): - """ - Obtain regularization parameters - - Returns - ------- - cs : list - each of the values in Cs describes the inverse of regularization - strength for predictors - alphas : list - alpha values to try in ridge models - """ - if regularization: - alphas = ALPHAS - cs = ALPHAS - else: - alphas = [TINY] - cs = [np.inf] - - return cs, alphas - - -def _get_classifier(regularization, forest, calibration, random_state=42): - """ - Obtain context classifiers to estimate treatment probabilities. - - Returns - ------- - clf : classifier on contexts, etc. for predicting P(T=1|X), - P(T=1|X, M) or f(M|T,X) - """ - cs, _ = _get_regularization_parameters(regularization) - - if not forest: - clf = LogisticRegressionCV(random_state=random_state, Cs=cs, cv=CV_FOLDS) - else: - clf = RandomForestClassifier( - random_state=random_state, n_estimators=100, min_samples_leaf=10 - ) - if calibration in {"sigmoid", "isotonic"}: - clf = CalibratedClassifierCV(clf, method=calibration) - - return clf - - -def _get_regressor(regularization, forest, random_state=42): - """ - Obtain regressors to estimate conditional mean outcomes. - - Returns - ------- - reg : regressor on contexts, etc. for predicting E[Y|T,M,X], etc. - """ - _, alphas = _get_regularization_parameters(regularization) - - if not forest: - reg = RidgeCV(alphas=alphas, cv=CV_FOLDS) - else: - reg = RandomForestRegressor( - n_estimators=100, min_samples_leaf=10, random_state=random_state - ) - - return reg - - -def _estimate_treatment_probabilities(t, m, x, crossfit, clf_t_x, clf_t_xm): - """ - Estimate treatment probabilities P(T=1|X) and P(T=1|X, M) with train - test lists from crossfitting - - Returns - ------- - p_x : array-like, shape (n_samples) - probabilities P(T=1|X) - p_xm : array-like, shape (n_samples) - probabilities P(T=1|X, M) - """ - n = len(t) - - p_x, p_xm = [np.zeros(n) for h in range(2)] - # compute propensity scores - if len(t.shape) == 1: - t = t.reshape(-1, 1) - - train_test_list = _get_train_test_lists(crossfit, n, x) - - xm = np.hstack((x, m)) - - for train_index, test_index in train_test_list: - - # p_x, p_xm model fitting - clf_t_x = clf_t_x.fit(x[train_index, :], t[train_index]) - clf_t_xm = clf_t_xm.fit(xm[train_index, :], t[train_index]) - - # predict P(T=1|X), P(T=1|X, M) - p_x[test_index] = clf_t_x.predict_proba(x[test_index, :])[:, 1] - p_xm[test_index] = clf_t_xm.predict_proba(xm[test_index, :])[:, 1] - - return p_x, p_xm - - -def _estimate_mediator_density(y, t, m, x, crossfit, clf_m, interaction): - """ - Estimate mediator density f(M|T,X) - with train test lists from crossfitting - - Returns - ------- - f_00x: array-like, shape (n_samples) - probabilities f(M=0|T=0,X) - f_01x, array-like, shape (n_samples) - probabilities f(M=0|T=1,X) - f_10x, array-like, shape (n_samples) - probabilities f(M=1|T=0,X) - f_11x, array-like, shape (n_samples) - probabilities f(M=1|T=1,X) - f_m0x, array-like, shape (n_samples) - probabilities f(M|T=0,X) - f_m1x, array-like, shape (n_samples) - probabilities f(M|T=1,X) - """ - n = len(y) - - if len(t.shape) == 1: - t = t.reshape(-1, 1) - - t0 = np.zeros((n, 1)) - t1 = np.ones((n, 1)) - - m = m.ravel() - - train_test_list = _get_train_test_lists(crossfit, n, x) - - f_00x, f_01x, f_10x, f_11x, f_m0x, f_m1x = [np.zeros(n) for _ in range(6)] - - t_x = _get_interactions(interaction, t, x) - - t0_x = _get_interactions(interaction, t0, x) - t1_x = _get_interactions(interaction, t1, x) - - for train_index, test_index in train_test_list: - - test_ind = np.arange(len(test_index)) - - # f_mtx model fitting - clf_m = clf_m.fit(t_x[train_index, :], m[train_index]) - - # predict f(M=m|T=t,X) - fm_0 = clf_m.predict_proba(t0_x[test_index, :]) - f_00x[test_index] = fm_0[:, 0] - f_01x[test_index] = fm_0[:, 1] - fm_1 = clf_m.predict_proba(t1_x[test_index, :]) - f_10x[test_index] = fm_1[:, 0] - f_11x[test_index] = fm_1[:, 1] - - # predict f(M|T=t,X) - f_m0x[test_index] = fm_0[test_ind, m[test_index].astype(int)] - f_m1x[test_index] = fm_1[test_ind, m[test_index].astype(int)] - - return f_00x, f_01x, f_10x, f_11x, f_m0x, f_m1x - - -def _estimate_conditional_mean_outcome(y, t, m, x, crossfit, reg_y, interaction): - """ - Estimate conditional mean outcome E[Y|T,M,X] - with train test lists from crossfitting - - Returns - ------- - mu_00x: array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=0,M=0,X] - mu_01x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=0,M=1,X] - mu_10x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=1,M=0,X] - mu_11x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=1,M=1,X] - mu_m0x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=0,M,X] - mu_m1x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=1,M,X] - """ - n = len(y) - mr = np.copy(m) - if len(t.shape) == 1: - t = t.reshape(-1, 1) - - t0 = np.zeros((n, 1)) - t1 = np.ones((n, 1)) - m0 = np.zeros((n, 1)) - m1 = np.ones((n, 1)) - - train_test_list = _get_train_test_lists(crossfit, n, x) - - mu_11x, mu_10x, mu_01x, mu_00x, mu_1mx, mu_0mx = [np.zeros(n) for _ in range(6)] - - x_t_mr = _get_interactions(interaction, x, t, mr) - - x_t1_m1 = _get_interactions(interaction, x, t1, m1) - x_t1_m0 = _get_interactions(interaction, x, t1, m0) - x_t0_m1 = _get_interactions(interaction, x, t0, m1) - x_t0_m0 = _get_interactions(interaction, x, t0, m0) - - x_t1_m = _get_interactions(interaction, x, t1, m) - x_t0_m = _get_interactions(interaction, x, t0, m) - - for train_index, test_index in train_test_list: - - # mu_tm model fitting - reg_y = reg_y.fit(x_t_mr[train_index, :], y[train_index]) - - # predict E[Y|T=t,M=m,X] - mu_00x[test_index] = reg_y.predict(x_t0_m0[test_index, :]) - mu_01x[test_index] = reg_y.predict(x_t0_m1[test_index, :]) - mu_10x[test_index] = reg_y.predict(x_t1_m0[test_index, :]) - mu_11x[test_index] = reg_y.predict(x_t1_m1[test_index, :]) - - # predict E[Y|T=t,M,X] - mu_0mx[test_index] = reg_y.predict(x_t0_m[test_index, :]) - mu_1mx[test_index] = reg_y.predict(x_t1_m[test_index, :]) - - return mu_00x, mu_01x, mu_10x, mu_11x, mu_0mx, mu_1mx - - -def _estimate_cross_conditional_mean_outcome( - y, t, m, x, crossfit, reg_y, reg_cross_y, f, interaction -): - """ - Estimate the conditional mean outcome, - the cross conditional mean outcome - - Returns - ------- - mu_m0x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=0,M,X] - mu_m1x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=1,M,X] - E_mu_t0_t0, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=0,M,X]|T=0,X] - E_mu_t0_t1, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=0,M,X]|T=1,X] - E_mu_t1_t0, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=1,M,X]|T=0,X] - E_mu_t1_t1, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=1,M,X]|T=1,X] - """ - n = len(y) - - # Initialisation - ( - mu_1mx, # E[Y|T=1,M,X] - mu_0mx, # E[Y|T=0,M,X] - mu_11x, # E[Y|T=1,M=1,X] - mu_10x, # E[Y|T=1,M=0,X] - mu_01x, # E[Y|T=0,M=1,X] - mu_00x, # E[Y|T=0,M=0,X] - E_mu_t0_t0, # E[E[Y|T=0,M,X]|T=0,X] - E_mu_t0_t1, # E[E[Y|T=0,M,X]|T=1,X] - E_mu_t1_t0, # E[E[Y|T=1,M,X]|T=0,X] - E_mu_t1_t1, # E[E[Y|T=1,M,X]|T=1,X] - ) = [np.zeros(n) for _ in range(10)] - - t0, m0 = np.zeros((n, 1)), np.zeros((n, 1)) - t1, m1 = np.ones((n, 1)), np.ones((n, 1)) - - train_test_list = _get_train_test_lists(crossfit, n, x) - - x_t_m = _get_interactions(interaction, x, t, m) - x_t1_m = _get_interactions(interaction, x, t1, m) - x_t0_m = _get_interactions(interaction, x, t0, m) - - x_t0_m0 = _get_interactions(interaction, x, t0, m0) - x_t0_m1 = _get_interactions(interaction, x, t0, m1) - x_t1_m0 = _get_interactions(interaction, x, t1, m0) - x_t1_m1 = _get_interactions(interaction, x, t1, m1) - - f_00x, f_01x, f_10x, f_11x = f - - # Cross-fitting loop - for train_index, test_index in train_test_list: - # Index declaration - ind_t0 = t[test_index] == 0 - - # mu_tm model fitting - reg_y = reg_y.fit(x_t_m[train_index, :], y[train_index]) - - # predict E[Y|T=t,M,X] - mu_1mx[test_index] = reg_y.predict(x_t1_m[test_index, :]) - mu_0mx[test_index] = reg_y.predict(x_t0_m[test_index, :]) - - # predict E[Y|T=t,M=m,X] - mu_00x[test_index] = reg_y.predict(x_t0_m0[test_index, :]) - mu_01x[test_index] = reg_y.predict(x_t0_m1[test_index, :]) - mu_11x[test_index] = reg_y.predict(x_t1_m1[test_index, :]) - mu_10x[test_index] = reg_y.predict(x_t1_m0[test_index, :]) - - # E[E[Y|T=1,M=m,X]|T=t,X] model fitting - reg_y_t1m1_t0 = clone(reg_cross_y).fit( - x[test_index, :][ind_t0, :], mu_11x[test_index][ind_t0] - ) - reg_y_t1m0_t0 = clone(reg_cross_y).fit( - x[test_index, :][ind_t0, :], mu_10x[test_index][ind_t0] - ) - reg_y_t1m1_t1 = clone(reg_cross_y).fit( - x[test_index, :][~ind_t0, :], mu_11x[test_index][~ind_t0] - ) - reg_y_t1m0_t1 = clone(reg_cross_y).fit( - x[test_index, :][~ind_t0, :], mu_10x[test_index][~ind_t0] - ) - - # predict E[E[Y|T=1,M=m,X]|T=t,X] - E_mu_t1_t0[test_index] = ( - reg_y_t1m0_t0.predict(x[test_index, :]) * f_00x[test_index] - + reg_y_t1m1_t0.predict(x[test_index, :]) * f_01x[test_index] - ) - E_mu_t1_t1[test_index] = ( - reg_y_t1m0_t1.predict(x[test_index, :]) * f_10x[test_index] - + reg_y_t1m1_t1.predict(x[test_index, :]) * f_11x[test_index] - ) - - # E[E[Y|T=0,M=m,X]|T=t,X] model fitting - reg_y_t0m1_t0 = clone(reg_cross_y).fit( - x[test_index, :][ind_t0, :], mu_01x[test_index][ind_t0] - ) - reg_y_t0m0_t0 = clone(reg_cross_y).fit( - x[test_index, :][ind_t0, :], mu_00x[test_index][ind_t0] - ) - reg_y_t0m1_t1 = clone(reg_cross_y).fit( - x[test_index, :][~ind_t0, :], mu_01x[test_index][~ind_t0] - ) - reg_y_t0m0_t1 = clone(reg_cross_y).fit( - x[test_index, :][~ind_t0, :], mu_00x[test_index][~ind_t0] - ) - - # predict E[E[Y|T=0,M=m,X]|T=t,X] - E_mu_t0_t0[test_index] = ( - reg_y_t0m0_t0.predict(x[test_index, :]) * f_00x[test_index] - + reg_y_t0m1_t0.predict(x[test_index, :]) * f_01x[test_index] - ) - E_mu_t0_t1[test_index] = ( - reg_y_t0m0_t1.predict(x[test_index, :]) * f_10x[test_index] - + reg_y_t0m1_t1.predict(x[test_index, :]) * f_11x[test_index] - ) - - return mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 - - -def _estimate_cross_conditional_mean_outcome_nesting( - y, t, m, x, crossfit, reg_y, reg_cross_y -): - """ - Estimate treatment probabilities and the conditional mean outcome, - cross conditional mean outcome - - Estimate the conditional mean outcome, - the cross conditional mean outcome - - Returns - ------- - mu_m0x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=0,M,X] - mu_m1x, array-like, shape (n_samples) - conditional mean outcome estimates E[Y|T=1,M,X] - mu_0x, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=0,M,X]|T=0,X] - E_mu_t0_t1, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=0,M,X]|T=1,X] - E_mu_t1_t0, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=1,M,X]|T=0,X] - mu_1x, array-like, shape (n_samples) - cross conditional mean outcome estimates E[E[Y|T=1,M,X]|T=1,X] - """ - n = len(y) - - # initialisation - ( - mu_1mx, # E[Y|T=1,M,X] - mu_1mx_nested, # E[Y|T=1,M,X] predicted on train_nested set - mu_0mx, # E[Y|T=0,M,X] - mu_0mx_nested, # E[Y|T=0,M,X] predicted on train_nested set - E_mu_t1_t0, # E[E[Y|T=1,M,X]|T=0,X] - E_mu_t0_t1, # E[E[Y|T=0,M,X]|T=1,X] - mu_1x, # E[Y|T=1,X] - mu_0x, # E[Y|T=0,X] - ) = [np.zeros(n) for _ in range(8)] - - xm = np.hstack((x, m)) - - train_test_list = _get_train_test_lists(crossfit, n, x) - - for train, test in train_test_list: - # define test set - train1 = train[t[train] == 1] - train0 = train[t[train] == 0] - - train_mean, train_nested = np.array_split(train, 2) - train_mean1 = train_mean[t[train_mean] == 1] - train_mean0 = train_mean[t[train_mean] == 0] - train_nested1 = train_nested[t[train_nested] == 1] - train_nested0 = train_nested[t[train_nested] == 0] - - # predict E[Y|T=1,M,X] - reg_y1m = clone(reg_y) - reg_y1m.fit(xm[train_mean1], y[train_mean1]) - mu_1mx[test] = reg_y1m.predict(xm[test]) - mu_1mx_nested[train_nested] = reg_y1m.predict(xm[train_nested]) - - # predict E[Y|T=0,M,X] - reg_y0m = clone(reg_y) - reg_y0m.fit(xm[train_mean0], y[train_mean0]) - mu_0mx[test] = reg_y0m.predict(xm[test]) - mu_0mx_nested[train_nested] = reg_y0m.predict(xm[train_nested]) - - # predict E[E[Y|T=1,M,X]|T=0,X] - reg_cross_y1 = clone(reg_cross_y) - reg_cross_y1.fit(x[train_nested0], mu_1mx_nested[train_nested0]) - E_mu_t1_t0[test] = reg_cross_y1.predict(x[test]) - - # predict E[E[Y|T=0,M,X]|T=1,X] - reg_cross_y0 = clone(reg_cross_y) - reg_cross_y0.fit(x[train_nested1], mu_0mx_nested[train_nested1]) - E_mu_t0_t1[test] = reg_cross_y0.predict(x[test]) - - # predict E[Y|T=1,X] - reg_y1 = clone(reg_y) - reg_y1.fit(x[train1], y[train1]) - mu_1x[test] = reg_y1.predict(x[test]) - - # predict E[Y|T=0,X] - reg_y0 = clone(reg_y) - reg_y0.fit(x[train0], y[train0]) - mu_0x[test] = reg_y0.predict(x[test]) - - return mu_0mx, mu_1mx, mu_0x, E_mu_t0_t1, E_mu_t1_t0, mu_1x diff --git a/src/med_bench/utils/utils.py b/src/med_bench/utils/utils.py index 017f145..2310c2d 100644 --- a/src/med_bench/utils/utils.py +++ b/src/med_bench/utils/utils.py @@ -1,12 +1,9 @@ +from numpy.random import default_rng import numpy as np import pandas as pd - -from sklearn.cluster import KMeans -from sklearn.model_selection import train_test_split -from sklearn.model_selection import KFold - import subprocess +from med_bench.get_simulated_data import simulate_data from med_bench.utils.constants import ALPHAS, TINY @@ -258,6 +255,12 @@ def is_array_integer(array): return all(list((array == array.astype(int)).squeeze())) +def is_array_binary(array): + if len(np.unique(array)) == 2: + return True + return False + + def _get_regularization_parameters(regularization): """ Obtain regularization parameters