Introduced "counterfactual" weighting

insightsengineering · Mar 21, 2024 · f0ed45f · f0ed45f
1 parent 6e989b4
commit f0ed45f
Show file tree

Hide file tree

Showing 7 changed files with 415 additions and 269 deletions.
diff --git a/R/ancova.R b/R/ancova.R
@@ -12,9 +12,10 @@
 #' fit the ancova model at. If `NULL`, a separate ancova model will be fit to the
 #' outcomes for each visit (as determined by `unique(data[[vars$visit]])`).
 #' See details.
-#' @param weights Character, either `"proportional"` (default) or `"equal"`. Specifies the
-#' weighting strategy to be used for categorical covariates when calculating the lsmeans.
-#' See details.
+#' @param weights Character, either `"counterfactual"` (default), `"equal"`,
+#' `"proportional_em"` or `"proportional"`.
+#' Specifies the weighting strategy to be used when calculating the lsmeans.
+#' See the weighting section for more details.
 #'
 #' @details
 #' The function works as follows:
@@ -49,29 +50,18 @@
 #' by providing them to the `covariates` argument of [set_vars()]
 #' e.g. `set_vars(covariates = c("sex*age"))`.
 #'
-#'
-#' ## Weighting
-#'
-#' `"proportional"` is the default scheme that is used. This is equivalent to standardization,
-#' i.e. the lsmeans in
-#' each group are equal to the predicted mean outcome from the ancova model for
-#' that group based on baseline characteristics of all subjects regardless of
-#' their assigned group. The alternative weighting scheme, `"equal"`, creates hypothetical
-#' patients by expanding out all combinations of the models categorical covariates. The
-#' lsmeans are then calculated as the average of
-#' the predicted mean outcome for these hypothetical patients assuming they come from each
-#' group in turn.
-#'
-#' In short:
-#'   - `"proportional"` weights categorical covariates based upon their frequency of occurrence
-#' in the data.
-#'   - `"equal"` weights categorical covariates equally across all theoretical combinations.
+#' @inheritSection lsmeans Weighting
 #'
 #' @seealso [analyse()]
 #' @seealso [stats::lm()]
 #' @seealso [set_vars()]
 #' @export
-ancova <- function(data, vars, visits = NULL, weights = c("proportional", "equal")) {
+ancova <- function(
+    data,
+    vars,
+    visits = NULL,
+    weights = c("counterfactual", "equal", "proportional_em", "proportional")
+) {
 
     outcome <- vars[["outcome"]]
     group <- vars[["group"]]
@@ -154,13 +144,13 @@ ancova <- function(data, vars, visits = NULL, weights = c("proportional", "equal
 #' @description
 #' Performance analysis of covariance. See [ancova()] for full details.
 #'
-#' @param data The `data.frame` containing all of the data required for the model.
 #' @param outcome Character, the name of the outcome variable in `data`.
 #' @param group Character, the name of the group variable in `data`.
 #' @param covariates Character vector containing the name of any additional covariates
 #' to be included in the model as well as any interaction terms.
-#' @param weights Character, specifies whether to use "proportional" or "equal" weighting for each
-#' categorical covariate combination when calculating the lsmeans.
+#'
+#' @inheritParams ancova
+#' @inheritSection lsmeans Weighting
 #'
 #' @details
 #' - `group` must be a factor variable with only 2 levels.
@@ -174,7 +164,13 @@ ancova <- function(data, vars, visits = NULL, weights = c("proportional", "equal
 #' }
 #' @seealso [ancova()]
 #' @importFrom stats lm coef vcov df.residual
-ancova_single <- function(data, outcome, group, covariates, weights = c("proportional", "equal")) {
+ancova_single <- function(
+    data,
+    outcome,
+    group,
+    covariates,
+    weights = c("counterfactual", "equal", "proportional_em", "proportional")
+) {
 
     weights <- match.arg(weights)
     assert_that(

diff --git a/R/lsmeans.R b/R/lsmeans.R
@@ -3,25 +3,55 @@
 #'
 #'
 #' Estimates the least square means from a linear model. The exact implementation
-#' / interpretation depends on the weighting scheme; see details for more information.
-#'
-#' @details
-#'
-#' For `.weights = "proportional"` (the default) the lsmeans are obtained by
-#' taking the average of the fitted values for each patient.
-#' In terms of the interactions between two continuous covariates 
-#' this is equivalent to constructing a hypothetical patient whose
-#' interaction term is `mean(X * Y)`. This is opposed to the
-#' `emmeans` package which calculates the interaction term
-#' of the hypothetical patient as `mean(X) * mean(Y)`. The approach
-#' outlined here is equivalent to standardization or g-computation.
+#' / interpretation depends on the weighting scheme; see the weighting section for more
+#' information.
+#'
 #' 
-#' For `.weights = "equal"` the lsmeans are obtained by taking the model fitted
+#' @section Weighting:
+#'
+#' ### Counterfactual
+#'
+#' For `weights = "counterfactual"` (the default) the lsmeans are obtained by
+#' taking the average of the predicted values for each patient after assigning all patients
+#' to each arm in turn.
+#' This approach is equivalent to standardization or g-computation.
+#' In comparison to `emmeans` this approach is equivalent to:
+#' ```
+#' emmeans::emmeans(model, specs = "<treatment>", counterfactual = "<treatment>")
+#' ```
+#' Note that `weights = "proportional"` is an alias for `weights = "counterfactual"`.
+#' Whilst potentially confusing, this has been done to ensure backwards compatibility
+#' with prior versions of `rbmi`. To get results
+#' consistent with `emmeans`'s `weights = "proportional"` please use `weights = "proportional_em"`.
+#'
+#' ### Equal 
+#'
+#' For `weights = "equal"` the lsmeans are obtained by taking the model fitted
 #' value of a hypothetical patient whose covariates are defined as follows:
 #' - Continuous covariates are set to `mean(X)`
 #' - Dummy categorical variables are set to `1/N` where `N` is the number of levels
 #' - Continuous * continuous interactions are set to `mean(X) * mean(Y)`
 #' - Continuous * categorical interactions are set to `mean(X) * 1/N`
+#' - Dummary categorical * categorical interaction variables are set to `1/N * 1/M`
+#'
+#' In comparison to `emmeans` this approach is equivalent to:
+#' ```
+#' emmeans::emmeans(model, specs = "<treatment>", weights = "equal")
+#' ```
+#'
+#' ### Proportional
+#'
+#' For `weights = "proportional_em"` the lsmeans are obtained as per `weights = "equal"`
+#' except instead of weighting each observation equally they are weighted by the proportion
+#' in which the given combination of categorical values occurred in the data.
+#' In comparison to `emmeans` this approach is equivalent to:
+#' ```
+#' emmeans::emmeans(model, specs = "<treatment>", weights = "proportional")
+#' ```
+#' Note that this is not to be confused with `weights = "proportional"` which is an alias
+#' for `weights = "counterfactual"`.
+#'
+#' @section Fixing:
 #'
 #' Regardless of the weighting scheme any named arguments passed via `...` will
 #' fix the value of the covariate to the specified value.
@@ -35,8 +65,10 @@
 #' @param ... Fixes specific variables to specific values i.e.
 #' `trt = 1` or `age = 50`. The name of the argument must be the name
 #' of the variable within the dataset.
-#' @param .weights Character, specifies whether to use "proportional" or "equal" weighting for each
-#' categorical covariate combination when calculating the lsmeans.
+#' @param .weights Character, either `"counterfactual"` (default), `"equal"`,
+#' `"proportional_em"` or `"proportional"`.
+#' Specifies the weighting strategy to be used when calculating the lsmeans.
+#' See the weighting section for more details.
 #'
 #' @references \url{https://CRAN.R-project.org/package=emmeans}
 #' @references \url{https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.3/statug/statug_glm_details41.htm}
@@ -49,7 +81,11 @@
 #' lsmeans(mod, Species = "versicolor", Petal.Length = 1)
 #' }
 #' @importFrom stats model.matrix terms reformulate
-lsmeans <- function(model, ..., .weights = c("proportional", "equal")) {
+lsmeans <- function(
+    model,
+    ...,
+    .weights = c("counterfactual", "equal", "proportional_em", "proportional")
+) {
     .weights <- match.arg(arg = .weights)
     fix <- list(...)
 
@@ -66,9 +102,22 @@ lsmeans <- function(model, ..., .weights = c("proportional", "equal")) {
         )
     }
 
+    if (.weights == "proportional") {
+        message(
+            paste(
+                "NOTE: The `proportional` weighting scheme is an alias for `counterfactual`",
+                "and will be deprecated in the future. Please use `proportional_em` or",
+                "`counterfactual` as appropriate instead.",
+                sep = " "
+            )
+        )
+    }
+
     design_fun <- switch(.weights,
+        counterfactual = ls_design_counterfactual,
+        proportional = ls_design_counterfactual,
         equal = ls_design_equal,
-        proportional = ls_design_proportional
+        proportional_em = ls_design_proportional
     )
 
     design <- design_fun(data, frm, fix)
@@ -81,6 +130,27 @@ lsmeans <- function(model, ..., .weights = c("proportional", "equal")) {
 }
 
 
+
+collapse_values <- function(x) {
+    UseMethod("collapse_values")
+}
+
+#' @export
+collapse_values.factor <- function(x) {
+    return(factor(levels(x), levels(x)))
+}
+
+#' @export
+collapse_values.numeric <- function(x) {
+    return(mean(x))
+}
+
+#' @export
+collapse_values.default <- function(x) {
+    stop("invalid data type")
+}
+
+
 #' Calculate design vector for the lsmeans
 #'
 #' Calculates the design vector as required to generate the lsmean
@@ -99,7 +169,8 @@ ls_design_equal <- function(data, frm, fix) {
         inherits(frm, "formula"),
         is.data.frame(data),
         is.list(fix),
-        all(names(fix) %in% colnames(data))
+        all(names(fix) %in% colnames(data)),
+        all(vapply(fix, length, numeric(1)) == 1)
     )
     frm2 <- update(frm, NULL ~ .)
     data2 <- model.frame(frm2, data)
@@ -125,28 +196,8 @@ ls_design_equal <- function(data, frm, fix) {
 }
 
 
-collapse_values <- function(x) {
-    UseMethod("collapse_values")
-}
-
-#' @export
-collapse_values.factor <- function(x) {
-    return(factor(levels(x), levels(x)))
-}
-
-#' @export
-collapse_values.numeric <- function(x) {
-    return(mean(x))
-}
-
-#' @export
-collapse_values.default <- function(x) {
-    stop("invalid data type")
-}
-
-
 #' @rdname ls_design
-ls_design_proportional <- function(data, frm, fix) {
+ls_design_counterfactual <- function(data, frm, fix) {
     data2 <- data
     for (var in names(fix)) {
         value <- data[[var]]
@@ -159,3 +210,50 @@ ls_design_proportional <- function(data, frm, fix) {
     design <- colMeans(model.matrix(frm, data = data2))
     return(design)
 }
+
+
+#' @rdname ls_design
+ls_design_proportional <- function(data, frm, fix) {
+    assert_that(
+        inherits(frm, "formula"),
+        is.data.frame(data),
+        is.list(fix),
+        all(names(fix) %in% colnames(data)),
+        all(vapply(fix, length, numeric(1)) == 1)
+    )
+    frm2 <- update(frm, NULL ~ .)
+    dat2 <- model.frame(frm2, data)
+    collection <- lapply(as.list(dat2), collapse_values)
+
+    for (var in names(fix)) {
+        if (is.numeric(dat2[[var]])) {
+            dat2[[var]] <- fix[[var]]
+            collection[[var]] <- fix[[var]]
+        } else if (is.factor(dat2[[var]])) {
+            dat2[[var]] <- factor(fix[[var]], levels = levels(dat2[[var]]))
+            collection[[var]] <- factor(fix[[var]], levels = levels(dat2[[var]]))
+        }
+    }
+
+    all_combinations <- expand.grid(collection)
+    design_matrix <- model.matrix(frm2, all_combinations)
+
+    categorical_vars <- dat2 |>
+        vapply(\(x) is.character(x) || is.factor(x), logical(1)) |>
+        which() |>
+        names()
+
+    wgts <- dat2[, categorical_vars[[1]]] |>
+        aggregate(
+            as.list(dat2[, categorical_vars]),
+            length
+        )
+    assert_that(
+        all.equal(wgts[, categorical_vars], all_combinations[, categorical_vars])
+    )
+
+    wgts_scaled <- wgts[["x"]] / sum(wgts[["x"]])
+
+    design <- apply(design_matrix, 2, \(x) sum(x * wgts_scaled))
+    return(design)
+}