diff --git a/DESCRIPTION b/DESCRIPTION index d17d2bf6b..e684f7480 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,7 @@ LazyData: true Roxygen: list(markdown = TRUE) URL: https://insightsengineering.github.io/rbmi/, https://github.com/insightsengineering/rbmi BugReports: https://github.com/insightsengineering/rbmi/issues -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Suggests: dplyr, tidyr, diff --git a/NAMESPACE b/NAMESPACE index c13a4d03a..1c5fe66c9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,9 @@ # Generated by roxygen2: do not edit by hand S3method(as.data.frame,pool) +S3method(collapse_values,default) +S3method(collapse_values,factor) +S3method(collapse_values,numeric) S3method(draws,approxbayes) S3method(draws,bayes) S3method(draws,bmlmi) @@ -19,6 +22,7 @@ S3method(print,imputation_list_df) S3method(print,imputation_list_single) S3method(print,imputation_single) S3method(print,pool) +S3method(repr,numeric) S3method(validate,analysis) S3method(validate,bmlmi) S3method(validate,bootstrap) diff --git a/NEWS.md b/NEWS.md index f1a23c611..0f5068ff2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,11 @@ * Include vignette on how to obtain frequentist and information-anchored inference with conditional mean imputation using `rbmi` * Added FAQ vignette +* Updates to `lsmeans()` for better consistency with the `emmeans` package (#412) + * Renamed `lsmeans(..., weights = "proportional")` to `lsmeans(..., weights = "counterfactual")`to more accurately reflect the weights used in the calculation. + * Added `lsmeans(..., weights = "proportional_em")` which provides consistent results with `emmeans(..., weights = "proportional")` + * `lsmeans(..., weights = "proportional")` has been left in the package for backwards compatibility and is an alias for `lsmeans(..., weights = "counterfactual")` but now gives + a message prompting users to use either "proptional_em" or "counterfactual" instead. # rbmi 1.2.6 diff --git a/R/ancova.R b/R/ancova.R index 57a08af44..caec031f6 100644 --- a/R/ancova.R +++ b/R/ancova.R @@ -12,9 +12,10 @@ #' fit the ancova model at. If `NULL`, a separate ancova model will be fit to the #' outcomes for each visit (as determined by `unique(data[[vars$visit]])`). #' See details. -#' @param weights Character, either `"proportional"` (default) or `"equal"`. Specifies the -#' weighting strategy to be used for categorical covariates when calculating the lsmeans. -#' See details. +#' @param weights Character, either `"counterfactual"` (default), `"equal"`, +#' `"proportional_em"` or `"proportional"`. +#' Specifies the weighting strategy to be used when calculating the lsmeans. +#' See the weighting section for more details. #' #' @details #' The function works as follows: @@ -49,29 +50,18 @@ #' by providing them to the `covariates` argument of [set_vars()] #' e.g. `set_vars(covariates = c("sex*age"))`. #' -#' -#' ## Weighting -#' -#' `"proportional"` is the default scheme that is used. This is equivalent to standardization, -#' i.e. the lsmeans in -#' each group are equal to the predicted mean outcome from the ancova model for -#' that group based on baseline characteristics of all subjects regardless of -#' their assigned group. The alternative weighting scheme, `"equal"`, creates hypothetical -#' patients by expanding out all combinations of the models categorical covariates. The -#' lsmeans are then calculated as the average of -#' the predicted mean outcome for these hypothetical patients assuming they come from each -#' group in turn. -#' -#' In short: -#' - `"proportional"` weights categorical covariates based upon their frequency of occurrence -#' in the data. -#' - `"equal"` weights categorical covariates equally across all theoretical combinations. +#' @inheritSection lsmeans Weighting #' #' @seealso [analyse()] #' @seealso [stats::lm()] #' @seealso [set_vars()] #' @export -ancova <- function(data, vars, visits = NULL, weights = c("proportional", "equal")) { +ancova <- function( + data, + vars, + visits = NULL, + weights = c("counterfactual", "equal", "proportional_em", "proportional") +) { outcome <- vars[["outcome"]] group <- vars[["group"]] @@ -154,13 +144,13 @@ ancova <- function(data, vars, visits = NULL, weights = c("proportional", "equal #' @description #' Performance analysis of covariance. See [ancova()] for full details. #' -#' @param data The `data.frame` containing all of the data required for the model. #' @param outcome Character, the name of the outcome variable in `data`. #' @param group Character, the name of the group variable in `data`. #' @param covariates Character vector containing the name of any additional covariates #' to be included in the model as well as any interaction terms. -#' @param weights Character, specifies whether to use "proportional" or "equal" weighting for each -#' categorical covariate combination when calculating the lsmeans. +#' +#' @inheritParams ancova +#' @inheritSection lsmeans Weighting #' #' @details #' - `group` must be a factor variable with only 2 levels. @@ -174,7 +164,13 @@ ancova <- function(data, vars, visits = NULL, weights = c("proportional", "equal #' } #' @seealso [ancova()] #' @importFrom stats lm coef vcov df.residual -ancova_single <- function(data, outcome, group, covariates, weights = c("proportional", "equal")) { +ancova_single <- function( + data, + outcome, + group, + covariates, + weights = c("counterfactual", "equal", "proportional_em", "proportional") +) { weights <- match.arg(weights) assert_that( diff --git a/R/dataclasses.R b/R/dataclasses.R index dd4a1cd8c..7b8cc1022 100644 --- a/R/dataclasses.R +++ b/R/dataclasses.R @@ -285,6 +285,7 @@ repr <- function(x, ...) { UseMethod("repr") } +#' @export repr.numeric <- function(x, ...) { paste0("c(", paste0(round(x, 2), collapse = ", "), ")") } diff --git a/R/lsmeans.R b/R/lsmeans.R index 622686c25..4cc57f4d0 100644 --- a/R/lsmeans.R +++ b/R/lsmeans.R @@ -2,37 +2,73 @@ #' Least Square Means #' #' -#' Estimates the least square means from a linear model. This is done by -#' generating a prediction from the model using an hypothetical observation -#' that is constructed by averaging the data. See details for more information. +#' Estimates the least square means from a linear model. The exact implementation +#' / interpretation depends on the weighting scheme; see the weighting section for more +#' information. #' -#' @details +#' +#' @section Weighting: #' -#' The lsmeans are obtained by calculating hypothetical patients -#' and predicting their expected values. These hypothetical patients -#' are constructed by expanding out all possible combinations of each -#' categorical covariate and by setting any numerical covariates equal -#' to the mean. +#' ### Counterfactual #' -#' A final lsmean value is calculated by averaging these hypothetical -#' patients. If `.weights` equals `"proportional"` then the values are weighted -#' by the frequency in which they occur in the full dataset. If `.weights` -#' equals `"equal"` then each hypothetical patient is given an equal weight -#' regardless of what actually occurs in the dataset. +#' For `weights = "counterfactual"` (the default) the lsmeans are obtained by +#' taking the average of the predicted values for each patient after assigning all patients +#' to each arm in turn. +#' This approach is equivalent to standardization or g-computation. +#' In comparison to `emmeans` this approach is equivalent to: +#' ``` +#' emmeans::emmeans(model, specs = "", counterfactual = "") +#' ``` +#' Note that to ensure backwards compatibility with previous versions of rbmi +#' `weights = "proportional"` is an alias for `weights = "counterfactual"`. +#' To get results consistent with `emmeans`'s `weights = "proportional"` +#' please use `weights = "proportional_em"`. #' -#' Use the `...` argument to fix specific variables to specific values. +#' ### Equal #' -#' See the references for identical implementations as done in SAS and -#' in R via the `emmeans` package. This function attempts to re-implement the -#' `emmeans` derivation for standard linear models but without having to include -#' all of it's dependencies. +#' For `weights = "equal"` the lsmeans are obtained by taking the model fitted +#' value of a hypothetical patient whose covariates are defined as follows: +#' - Continuous covariates are set to `mean(X)` +#' - Dummy categorical variables are set to `1/N` where `N` is the number of levels +#' - Continuous * continuous interactions are set to `mean(X) * mean(Y)` +#' - Continuous * categorical interactions are set to `mean(X) * 1/N` +#' - Dummy categorical * categorical interactions are set to `1/N * 1/M` +#' +#' In comparison to `emmeans` this approach is equivalent to: +#' ``` +#' emmeans::emmeans(model, specs = "", weights = "equal") +#' ``` +#' +#' ### Proportional +#' +#' For `weights = "proportional_em"` the lsmeans are obtained as per `weights = "equal"` +#' except instead of weighting each observation equally they are weighted by the proportion +#' in which the given combination of categorical values occurred in the data. +#' In comparison to `emmeans` this approach is equivalent to: +#' ``` +#' emmeans::emmeans(model, specs = "", weights = "proportional") +#' ``` +#' Note that this is not to be confused with `weights = "proportional"` which is an alias +#' for `weights = "counterfactual"`. +#' +#' @section Fixing: +#' +#' Regardless of the weighting scheme any named arguments passed via `...` will +#' fix the value of the covariate to the specified value. +#' For example, `lsmeans(model, trt = "A")` will fix the dummy variable `trtA` to 1 +#' for all patients (real or hypothetical) when calculating the lsmeans. +#' +#' See the references for similar implementations as done in SAS and +#' in R via the `emmeans` package. #' #' @param model A model created by `lm`. #' @param ... Fixes specific variables to specific values i.e. #' `trt = 1` or `age = 50`. The name of the argument must be the name #' of the variable within the dataset. -#' @param .weights Character, specifies whether to use "proportional" or "equal" weighting for each -#' categorical covariate combination when calculating the lsmeans. +#' @param .weights Character, either `"counterfactual"` (default), `"equal"`, +#' `"proportional_em"` or `"proportional"`. +#' Specifies the weighting strategy to be used when calculating the lsmeans. +#' See the weighting section for more details. #' #' @references \url{https://CRAN.R-project.org/package=emmeans} #' @references \url{https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.3/statug/statug_glm_details41.htm} @@ -45,7 +81,11 @@ #' lsmeans(mod, Species = "versicolor", Petal.Length = 1) #' } #' @importFrom stats model.matrix terms reformulate -lsmeans <- function(model, ..., .weights = c("proportional", "equal")) { +lsmeans <- function( + model, + ..., + .weights = c("counterfactual", "equal", "proportional_em", "proportional") +) { .weights <- match.arg(arg = .weights) fix <- list(...) @@ -62,12 +102,25 @@ lsmeans <- function(model, ..., .weights = c("proportional", "equal")) { ) } + if (.weights == "proportional") { + message( + paste( + "NOTE: The `proportional` weighting scheme is an alias for `counterfactual`", + "and will be deprecated in the future. Please use `proportional_em` or", + "`counterfactual` as appropriate instead.", + sep = " " + ) + ) + } + design_fun <- switch(.weights, + counterfactual = ls_design_counterfactual, + proportional = ls_design_counterfactual, equal = ls_design_equal, - proportional = ls_design_proportional + proportional_em = ls_design_proportional ) - design <- design_fun(data, frm, covars, fix) + design <- design_fun(data, frm, fix) beta <- matrix(coef(model), nrow = 1) list( est = as.vector(beta %*% design), @@ -77,6 +130,27 @@ lsmeans <- function(model, ..., .weights = c("proportional", "equal")) { } + +collapse_values <- function(x) { + UseMethod("collapse_values") +} + +#' @export +collapse_values.factor <- function(x) { + return(factor(levels(x), levels(x))) +} + +#' @export +collapse_values.numeric <- function(x) { + return(mean(x)) +} + +#' @export +collapse_values.default <- function(x) { + stop("invalid data type") +} + + #' Calculate design vector for the lsmeans #' #' Calculates the design vector as required to generate the lsmean @@ -88,41 +162,42 @@ lsmeans <- function(model, ..., .weights = c("proportional", "equal")) { #' #' @param data A data.frame #' @param frm Formula used to fit the original model -#' @param covars a character vector of variables names that exist in -#' `data` which should be extracted (`ls_design_equal` only) #' @param fix A named list of variables with fixed values #' @name ls_design -ls_design_equal <- function(data, frm, covars, fix) { - collection <- list() - for (var in covars) { - values <- data[[var]] - if (is.factor(values)) { - lvls <- levels(values) - if (var %in% names(fix)) { - collection[[var]] <- factor(fix[[var]], levels = lvls) - } else { - collection[[var]] <- factor(lvls, levels = lvls) - } - } else if (is.numeric(values)) { - if (var %in% names(fix)) { - collection[[var]] <- fix[[var]] - } else { - collection[[var]] <- mean(values) - } - } else { - stop("invalid data type") +ls_design_equal <- function(data, frm, fix) { + assert_that( + inherits(frm, "formula"), + is.data.frame(data), + is.list(fix), + all(names(fix) %in% colnames(data)), + all(vapply(fix, length, numeric(1)) == 1) + ) + frm2 <- update(frm, NULL ~ .) + data2 <- model.frame(frm2, data) + collection <- lapply(as.list(data2), collapse_values) + + for (var in names(fix)) { + if (is.numeric(data2[[var]])) { + collection[[var]] <- fix[[var]] + } else if (is.factor(data2[[var]])) { + collection[[var]] <- factor(fix[[var]], levels = levels(data2[[var]])) } } - design <- matrix( - apply(model.matrix(frm, expand.grid(collection)), 2, mean), + + all_combinations <- expand.grid(collection) + + design_matrix <- model.matrix(frm2, all_combinations) + + result <- matrix( + apply(design_matrix, 2, mean), ncol = 1 ) - return(design) + return(result) } #' @rdname ls_design -ls_design_proportional <- function(data, frm, covars, fix) { +ls_design_counterfactual <- function(data, frm, fix) { data2 <- data for (var in names(fix)) { value <- data[[var]] @@ -135,3 +210,50 @@ ls_design_proportional <- function(data, frm, covars, fix) { design <- colMeans(model.matrix(frm, data = data2)) return(design) } + + +#' @rdname ls_design +ls_design_proportional <- function(data, frm, fix) { + assert_that( + inherits(frm, "formula"), + is.data.frame(data), + is.list(fix), + all(names(fix) %in% colnames(data)), + all(vapply(fix, length, numeric(1)) == 1) + ) + frm2 <- update(frm, NULL ~ .) + dat2 <- model.frame(frm2, data) + collection <- lapply(as.list(dat2), collapse_values) + + for (var in names(fix)) { + if (is.numeric(dat2[[var]])) { + dat2[[var]] <- fix[[var]] + collection[[var]] <- fix[[var]] + } else if (is.factor(dat2[[var]])) { + dat2[[var]] <- factor(fix[[var]], levels = levels(dat2[[var]])) + collection[[var]] <- factor(fix[[var]], levels = levels(dat2[[var]])) + } + } + + all_combinations <- expand.grid(collection) + design_matrix <- model.matrix(frm2, all_combinations) + + categorical_vars <- dat2 |> + vapply(\(x) is.character(x) || is.factor(x), logical(1)) |> + which() |> + names() + + wgts <- dat2[, categorical_vars[[1]]] |> + aggregate( + as.list(dat2[, categorical_vars]), + length + ) + assert_that( + all.equal(wgts[, categorical_vars], all_combinations[, categorical_vars]) + ) + + wgts_scaled <- wgts[["x"]] / sum(wgts[["x"]]) + + design <- apply(design_matrix, 2, \(x) sum(x * wgts_scaled)) + return(design) +} diff --git a/man/ancova.Rd b/man/ancova.Rd index 886bd48cf..1cbe56b3a 100644 --- a/man/ancova.Rd +++ b/man/ancova.Rd @@ -4,7 +4,12 @@ \alias{ancova} \title{Analysis of Covariance} \usage{ -ancova(data, vars, visits = NULL, weights = c("proportional", "equal")) +ancova( + data, + vars, + visits = NULL, + weights = c("counterfactual", "equal", "proportional_em", "proportional") +) } \arguments{ \item{data}{A \code{data.frame} containing the data to be used in the model.} @@ -17,9 +22,10 @@ fit the ancova model at. If \code{NULL}, a separate ancova model will be fit to outcomes for each visit (as determined by \code{unique(data[[vars$visit]])}). See details.} -\item{weights}{Character, either \code{"proportional"} (default) or \code{"equal"}. Specifies the -weighting strategy to be used for categorical covariates when calculating the lsmeans. -See details.} +\item{weights}{Character, either \code{"counterfactual"} (default), \code{"equal"}, +\code{"proportional_em"} or \code{"proportional"}. +Specifies the weighting strategy to be used when calculating the lsmeans. +See the weighting section for more details.} } \description{ Performs an analysis of covariance between two groups returning the estimated @@ -60,26 +66,59 @@ coincide with the control arm. Analogously, "alt" refers to the second factor le If you want to include interaction terms in your model this can be done by providing them to the \code{covariates} argument of \code{\link[=set_vars]{set_vars()}} e.g. \code{set_vars(covariates = c("sex*age"))}. -\subsection{Weighting}{ - -\code{"proportional"} is the default scheme that is used. This is equivalent to standardization, -i.e. the lsmeans in -each group are equal to the predicted mean outcome from the ancova model for -that group based on baseline characteristics of all subjects regardless of -their assigned group. The alternative weighting scheme, \code{"equal"}, creates hypothetical -patients by expanding out all combinations of the models categorical covariates. The -lsmeans are then calculated as the average of -the predicted mean outcome for these hypothetical patients assuming they come from each -group in turn. - -In short: +} +\section{Weighting}{ + +\subsection{Counterfactual}{ + +For \code{weights = "counterfactual"} (the default) the lsmeans are obtained by +taking the average of the predicted values for each patient after assigning all patients +to each arm in turn. +This approach is equivalent to standardization or g-computation. +In comparison to \code{emmeans} this approach is equivalent to: + +\if{html}{\out{
}}\preformatted{emmeans::emmeans(model, specs = "", counterfactual = "") +}\if{html}{\out{
}} + +Note that to ensure backwards compatibility with previous versions of rbmi +\code{weights = "proportional"} is an alias for \code{weights = "counterfactual"}. +To get results consistent with \code{emmeans}'s \code{weights = "proportional"} +please use \code{weights = "proportional_em"}. +} + +\subsection{Equal}{ + +For \code{weights = "equal"} the lsmeans are obtained by taking the model fitted +value of a hypothetical patient whose covariates are defined as follows: \itemize{ -\item \code{"proportional"} weights categorical covariates based upon their frequency of occurrence -in the data. -\item \code{"equal"} weights categorical covariates equally across all theoretical combinations. +\item Continuous covariates are set to \code{mean(X)} +\item Dummy categorical variables are set to \code{1/N} where \code{N} is the number of levels +\item Continuous * continuous interactions are set to \code{mean(X) * mean(Y)} +\item Continuous * categorical interactions are set to \code{mean(X) * 1/N} +\item Dummy categorical * categorical interactions are set to \code{1/N * 1/M} } + +In comparison to \code{emmeans} this approach is equivalent to: + +\if{html}{\out{
}}\preformatted{emmeans::emmeans(model, specs = "", weights = "equal") +}\if{html}{\out{
}} +} + +\subsection{Proportional}{ + +For \code{weights = "proportional_em"} the lsmeans are obtained as per \code{weights = "equal"} +except instead of weighting each observation equally they are weighted by the proportion +in which the given combination of categorical values occurred in the data. +In comparison to \code{emmeans} this approach is equivalent to: + +\if{html}{\out{
}}\preformatted{emmeans::emmeans(model, specs = "", weights = "proportional") +}\if{html}{\out{
}} + +Note that this is not to be confused with \code{weights = "proportional"} which is an alias +for \code{weights = "counterfactual"}. } } + \seealso{ \code{\link[=analyse]{analyse()}} diff --git a/man/ancova_single.Rd b/man/ancova_single.Rd index 464e652aa..774250aeb 100644 --- a/man/ancova_single.Rd +++ b/man/ancova_single.Rd @@ -9,11 +9,11 @@ ancova_single( outcome, group, covariates, - weights = c("proportional", "equal") + weights = c("counterfactual", "equal", "proportional_em", "proportional") ) } \arguments{ -\item{data}{The \code{data.frame} containing all of the data required for the model.} +\item{data}{A \code{data.frame} containing the data to be used in the model.} \item{outcome}{Character, the name of the outcome variable in \code{data}.} @@ -22,8 +22,10 @@ ancova_single( \item{covariates}{Character vector containing the name of any additional covariates to be included in the model as well as any interaction terms.} -\item{weights}{Character, specifies whether to use "proportional" or "equal" weighting for each -categorical covariate combination when calculating the lsmeans.} +\item{weights}{Character, either \code{"counterfactual"} (default), \code{"equal"}, +\code{"proportional_em"} or \code{"proportional"}. +Specifies the weighting strategy to be used when calculating the lsmeans. +See the weighting section for more details.} } \description{ Performance analysis of covariance. See \code{\link[=ancova]{ancova()}} for full details. @@ -34,6 +36,58 @@ Performance analysis of covariance. See \code{\link[=ancova]{ancova()}} for full \item \code{outcome} must be a continuous numeric variable. } } +\section{Weighting}{ + +\subsection{Counterfactual}{ + +For \code{weights = "counterfactual"} (the default) the lsmeans are obtained by +taking the average of the predicted values for each patient after assigning all patients +to each arm in turn. +This approach is equivalent to standardization or g-computation. +In comparison to \code{emmeans} this approach is equivalent to: + +\if{html}{\out{
}}\preformatted{emmeans::emmeans(model, specs = "", counterfactual = "") +}\if{html}{\out{
}} + +Note that to ensure backwards compatibility with previous versions of rbmi +\code{weights = "proportional"} is an alias for \code{weights = "counterfactual"}. +To get results consistent with \code{emmeans}'s \code{weights = "proportional"} +please use \code{weights = "proportional_em"}. +} + +\subsection{Equal}{ + +For \code{weights = "equal"} the lsmeans are obtained by taking the model fitted +value of a hypothetical patient whose covariates are defined as follows: +\itemize{ +\item Continuous covariates are set to \code{mean(X)} +\item Dummy categorical variables are set to \code{1/N} where \code{N} is the number of levels +\item Continuous * continuous interactions are set to \code{mean(X) * mean(Y)} +\item Continuous * categorical interactions are set to \code{mean(X) * 1/N} +\item Dummy categorical * categorical interactions are set to \code{1/N * 1/M} +} + +In comparison to \code{emmeans} this approach is equivalent to: + +\if{html}{\out{
}}\preformatted{emmeans::emmeans(model, specs = "", weights = "equal") +}\if{html}{\out{
}} +} + +\subsection{Proportional}{ + +For \code{weights = "proportional_em"} the lsmeans are obtained as per \code{weights = "equal"} +except instead of weighting each observation equally they are weighted by the proportion +in which the given combination of categorical values occurred in the data. +In comparison to \code{emmeans} this approach is equivalent to: + +\if{html}{\out{
}}\preformatted{emmeans::emmeans(model, specs = "", weights = "proportional") +}\if{html}{\out{
}} + +Note that this is not to be confused with \code{weights = "proportional"} which is an alias +for \code{weights = "counterfactual"}. +} +} + \examples{ \dontrun{ iris2 <- iris[ iris$Species \%in\% c("versicolor", "virginica"), ] diff --git a/man/ls_design.Rd b/man/ls_design.Rd index 6cdba4aab..cb239207c 100644 --- a/man/ls_design.Rd +++ b/man/ls_design.Rd @@ -3,21 +3,21 @@ \name{ls_design} \alias{ls_design} \alias{ls_design_equal} +\alias{ls_design_counterfactual} \alias{ls_design_proportional} \title{Calculate design vector for the lsmeans} \usage{ -ls_design_equal(data, frm, covars, fix) +ls_design_equal(data, frm, fix) -ls_design_proportional(data, frm, covars, fix) +ls_design_counterfactual(data, frm, fix) + +ls_design_proportional(data, frm, fix) } \arguments{ \item{data}{A data.frame} \item{frm}{Formula used to fit the original model} -\item{covars}{a character vector of variables names that exist in -\code{data} which should be extracted (\code{ls_design_equal} only)} - \item{fix}{A named list of variables with fixed values} } \description{ diff --git a/man/lsmeans.Rd b/man/lsmeans.Rd index 24739d6d6..7d59c9348 100644 --- a/man/lsmeans.Rd +++ b/man/lsmeans.Rd @@ -4,7 +4,11 @@ \alias{lsmeans} \title{Least Square Means} \usage{ -lsmeans(model, ..., .weights = c("proportional", "equal")) +lsmeans( + model, + ..., + .weights = c("counterfactual", "equal", "proportional_em", "proportional") +) } \arguments{ \item{model}{A model created by \code{lm}.} @@ -13,34 +17,80 @@ lsmeans(model, ..., .weights = c("proportional", "equal")) \code{trt = 1} or \code{age = 50}. The name of the argument must be the name of the variable within the dataset.} -\item{.weights}{Character, specifies whether to use "proportional" or "equal" weighting for each -categorical covariate combination when calculating the lsmeans.} +\item{.weights}{Character, either \code{"counterfactual"} (default), \code{"equal"}, +\code{"proportional_em"} or \code{"proportional"}. +Specifies the weighting strategy to be used when calculating the lsmeans. +See the weighting section for more details.} } \description{ -Estimates the least square means from a linear model. This is done by -generating a prediction from the model using an hypothetical observation -that is constructed by averaging the data. See details for more information. -} -\details{ -The lsmeans are obtained by calculating hypothetical patients -and predicting their expected values. These hypothetical patients -are constructed by expanding out all possible combinations of each -categorical covariate and by setting any numerical covariates equal -to the mean. - -A final lsmean value is calculated by averaging these hypothetical -patients. If \code{.weights} equals \code{"proportional"} then the values are weighted -by the frequency in which they occur in the full dataset. If \code{.weights} -equals \code{"equal"} then each hypothetical patient is given an equal weight -regardless of what actually occurs in the dataset. - -Use the \code{...} argument to fix specific variables to specific values. - -See the references for identical implementations as done in SAS and -in R via the \code{emmeans} package. This function attempts to re-implement the -\code{emmeans} derivation for standard linear models but without having to include -all of it's dependencies. +Estimates the least square means from a linear model. The exact implementation +/ interpretation depends on the weighting scheme; see the weighting section for more +information. } +\section{Weighting}{ + +\subsection{Counterfactual}{ + +For \code{weights = "counterfactual"} (the default) the lsmeans are obtained by +taking the average of the predicted values for each patient after assigning all patients +to each arm in turn. +This approach is equivalent to standardization or g-computation. +In comparison to \code{emmeans} this approach is equivalent to: + +\if{html}{\out{
}}\preformatted{emmeans::emmeans(model, specs = "", counterfactual = "") +}\if{html}{\out{
}} + +Note that to ensure backwards compatibility with previous versions of rbmi +\code{weights = "proportional"} is an alias for \code{weights = "counterfactual"}. +To get results consistent with \code{emmeans}'s \code{weights = "proportional"} +please use \code{weights = "proportional_em"}. +} + +\subsection{Equal}{ + +For \code{weights = "equal"} the lsmeans are obtained by taking the model fitted +value of a hypothetical patient whose covariates are defined as follows: +\itemize{ +\item Continuous covariates are set to \code{mean(X)} +\item Dummy categorical variables are set to \code{1/N} where \code{N} is the number of levels +\item Continuous * continuous interactions are set to \code{mean(X) * mean(Y)} +\item Continuous * categorical interactions are set to \code{mean(X) * 1/N} +\item Dummy categorical * categorical interactions are set to \code{1/N * 1/M} +} + +In comparison to \code{emmeans} this approach is equivalent to: + +\if{html}{\out{
}}\preformatted{emmeans::emmeans(model, specs = "", weights = "equal") +}\if{html}{\out{
}} +} + +\subsection{Proportional}{ + +For \code{weights = "proportional_em"} the lsmeans are obtained as per \code{weights = "equal"} +except instead of weighting each observation equally they are weighted by the proportion +in which the given combination of categorical values occurred in the data. +In comparison to \code{emmeans} this approach is equivalent to: + +\if{html}{\out{
}}\preformatted{emmeans::emmeans(model, specs = "", weights = "proportional") +}\if{html}{\out{
}} + +Note that this is not to be confused with \code{weights = "proportional"} which is an alias +for \code{weights = "counterfactual"}. +} +} + +\section{Fixing}{ + + +Regardless of the weighting scheme any named arguments passed via \code{...} will +fix the value of the covariate to the specified value. +For example, \code{lsmeans(model, trt = "A")} will fix the dummy variable \code{trtA} to 1 +for all patients (real or hypothetical) when calculating the lsmeans. + +See the references for similar implementations as done in SAS and +in R via the \code{emmeans} package. +} + \examples{ \dontrun{ mod <- lm(Sepal.Length ~ Species + Petal.Length, data = iris) diff --git a/tests/testthat/test-lsmeans.R b/tests/testthat/test-lsmeans.R index 29954b9bf..06f5b9eba 100644 --- a/tests/testthat/test-lsmeans.R +++ b/tests/testthat/test-lsmeans.R @@ -1,7 +1,7 @@ library(dplyr) library(testthat) -test_that("Least square means works as expected - Part 1", { +test_that("Least square means works as expected", { ##### Check that treatment effect is unaffected set.seed(101) @@ -56,114 +56,6 @@ test_that("Least square means works as expected - Part 1", { }) - - - -test_that("Least square means works as expected - Part 2", { - n <- 8000 - - dat <- tibble( - trt = sample(c("C", "T"), size = n, replace = TRUE, prob = c(0.5, 0.5)), - age = rnorm(n), - sex = sample(c("M", "F", "O"), size = n, replace = TRUE, prob = c(0.3, 0.5, 0.2)), - cat = sample(c("A", "B"), size = n, replace = TRUE, prob = c(0.8, 0.2)), - visit = "vis1" - ) %>% - mutate(sex = factor(sex, levels = c("M", "F", "O"))) %>% - mutate(trt = factor(trt, levels = c("C", "T"))) %>% - mutate(cat = factor(cat, levels = c("A", "B"))) - - - outcome_vec <- ( - model.matrix(~ trt * sex * cat + age, dat) %*% - c(10, 5, 4, 3, 9, 4, 2, 8, 1, -5, -1, -2, -3) - ) + rnorm(n, 0, 5) - - dat2 <- dat %>% - mutate(outcome = outcome_vec) - - - mod <- lm(outcome ~ trt * sex * cat + age, data = dat2) - - - ########### - # - # Proportional weighting - # - - d <- suppressMessages({ - as.data.frame(emmeans::emmeans(mod, "trt", weights = "proportional")) - }) - expected <- list( - "est" = d[["emmean"]], - "se" = d[["SE"]], - "df" = d[["df"]] - ) - lsm1 <- lsmeans(mod, trt = "C") - lsm2 <- lsmeans(mod, trt = "T") - actual <- list( - "est" = c(lsm1$est, lsm2$est), - "se" = c(lsm1$se, lsm2$se), - "df" = c(lsm1$df, lsm2$df) - ) - expect_equal(actual, expected) - - result_actual <- ancova( - dat2, - set_vars( - visit = "visit", - covariates = c("age", "sex*cat*trt"), - outcome = "outcome", - group = "trt" - ) - )[c("lsm_ref_vis1", "lsm_alt_vis1")] - result_expected <- list( - "lsm_ref_vis1" = lsm1, - "lsm_alt_vis1" = lsm2 - ) - expect_equal(result_actual, result_expected) - - - ########### - # - # Equal weighting - # - - d <- suppressMessages({ - as.data.frame(emmeans::emmeans(mod, "trt")) - }) - expected <- list( - "est" = d[["emmean"]], - "se" = d[["SE"]], - "df" = d[["df"]] - ) - lsm1 <- lsmeans(mod, trt = "C", .weights = "equal") - lsm2 <- lsmeans(mod, trt = "T", .weights = "equal") - actual <- list( - "est" = c(lsm1$est, lsm2$est), - "se" = c(lsm1$se, lsm2$se), - "df" = c(lsm1$df, lsm2$df) - ) - expect_equal(actual, expected) - - result_actual <- ancova( - dat2, - set_vars( - visit = "visit", - covariates = c("age", "sex*cat*trt"), - outcome = "outcome", - group = "trt" - ), - weights = "equal" - )[c("lsm_ref_vis1", "lsm_alt_vis1")] - result_expected <- list( - "lsm_ref_vis1" = lsm1, - "lsm_alt_vis1" = lsm2 - ) - expect_equal(result_actual, result_expected) -}) - - test_that("LSmeans (proportional) from rbmi equals means of average prediction", { set.seed(124) @@ -216,50 +108,109 @@ test_that("LSmeans (proportional) from rbmi equals means of average prediction", }) +test_that("LSmeans(proportional) returns equivalent results to 'counterfactual'", { + set.seed(2412) + n <- 4000 + dat <- tibble( + v1 = rnorm(n), + v2 = rnorm(n), + v3 = rnorm(n), + c1 = sample(c("A", "B"), size = n, replace = TRUE, prob = c(0.8, 0.2)), + c2 = sample(c("Y", "X"), size = n, replace = TRUE, prob = c(0.6, 0.4)) |> + factor(levels = c("Y", "X")), + c3 = sample(c("L", "K", "J"), size = n, replace = TRUE, prob = c(0.2, 0.5, 0.3)) |> + factor(levels = c("L", "K", "J")), + error = rnorm(n, 0, 4), + outcome = 30 + + 5 * v1 + + 3 * v2 + + 2 * v3 + + 8 * v1 * v2 + + 9 * v1 * v3 + + 10 * v2 * v3 + + 12 * v1 * v2 * v3 + + 4 * (c1 == "B") + + 6 * (c2 == "Y") + + 7 * (c1 == "B" & c2 == "Y") + + 13 * (c1 == "B") * v1 + + 15 * (c3 == "K") + + 16 * (c3 == "J") + + error + ) + mod <- lm(outcome ~ (v1 * v2 * v3) + (c1 * c2) + (v1 * c1) + c3, data = dat) -#### Toy code to experiment with - -# n <- 8000 - -# dat <- tibble( -# trt = sample(c("C", "T"), size = n, replace = TRUE, prob = c(0.5, 0.5)), -# age = rnorm(n), -# sex = sample(c("M", "F", "O"), size = n, replace = TRUE, prob = c(0.3, 0.5, 0.2)), -# cat = sample(c("A", "B"), size = n, replace = TRUE, prob = c(0.8, 0.2)), -# visit = "vis1" -# ) %>% -# mutate(sex = factor(sex, levels = c("M", "F", "O"))) %>% -# mutate(trt = factor(trt, levels = c("C", "T"))) %>% -# mutate(cat = factor(cat, levels = c("A", "B"))) - - -# outcome_vec <- ( -# model.matrix(~ trt * sex * cat + age, dat) %*% -# c(10, 5, 4, 3, 9, 4, 2, 8, 1, -5, -1, -2, -3) -# ) + rnorm(n, 0, 5) - -# dat2 <- dat %>% -# mutate(outcome = outcome_vec) - - -# mod <- lm(outcome ~ trt * sex * cat + age, data = dat2) - - -# as.data.frame(emmeans::emmeans(mod, "trt", weights = "equal")) -# as.data.frame(emmeans::emmeans(mod, "trt", weights = "proportional")) - -# dat_t <- dat2 %>% mutate(trt = factor("T", levels = c("C", "T"))) -# dat_c <- dat2 %>% mutate(trt = factor("C", levels = c("C", "T"))) -# mean(predict(mod, dat_c)) -# mean(predict(mod, dat_t)) - - - - + # + # + # Equal + # + # + emod <- suppressMessages({ + as.data.frame(emmeans::emmeans(mod, "c1", weights = "equal")) + }) + expected <- list( + "est" = emod[["emmean"]], + "se" = emod[["SE"]], + "df" = emod[["df"]] + ) + lsm1 <- lsmeans(mod, c1 = "A", .weights = "equal") + lsm2 <- lsmeans(mod, c1 = "B", .weights = "equal") + actual <- list( + "est" = c(lsm1$est, lsm2$est), + "se" = c(lsm1$se, lsm2$se), + "df" = c(lsm1$df, lsm2$df) + ) + expect_equal(actual, expected) + # + # + # Proportional + # + # + emod <- suppressMessages({ + as.data.frame(emmeans::emmeans(mod, "c1", weights = "proportional")) + }) + expected <- list( + "est" = emod[["emmean"]], + "se" = emod[["SE"]], + "df" = emod[["df"]] + ) + lsm1 <- lsmeans(mod, c1 = "A", .weights = "proportional_em") + lsm2 <- lsmeans(mod, c1 = "B", .weights = "proportional_em") + actual <- list( + "est" = c(lsm1$est, lsm2$est), + "se" = c(lsm1$se, lsm2$se), + "df" = c(lsm1$df, lsm2$df) + ) + expect_equal(actual, expected) + # + # + # Counterfactual + # + # + emod <- suppressMessages({ + as.data.frame(emmeans::emmeans(mod, "c1", counterfactual = "c1")) + }) + expected <- list( + "est" = emod[["emmean"]], + "se" = emod[["SE"]], + "df" = emod[["df"]] + ) + lsm1 <- lsmeans(mod, c1 = "A", .weights = "counterfactual") + lsm2 <- lsmeans(mod, c1 = "B", .weights = "counterfactual") + actual <- list( + "est" = c(lsm1$est, lsm2$est), + "se" = c(lsm1$se, lsm2$se), + "df" = c(lsm1$df, lsm2$df) + ) + expect_equal(actual, expected) + expect_message( + lsmeans(mod, c1 = "A", .weights = "proportional"), + "NOTE: The `proportional` weighting scheme is an alias for `counterfactual`" + ) +})