From ced29bfddb3cb0df5465a3cc808c72b83df108a0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 31 Dec 2024 12:13:59 +0100 Subject: [PATCH] Append results of `demean()` to original df? (#579) * Append results of `demean()` to original df? Fixes #578 * add test --- DESCRIPTION | 2 +- NEWS.md | 6 ++++++ R/demean.R | 36 +++++++++++++++++++++++++++------ man/demean.Rd | 15 ++++++++++---- tests/testthat/_snaps/demean.md | 29 +++++++++++++++++++++++++- tests/testthat/test-demean.R | 28 ++++++++++++++++++++----- 6 files changed, 99 insertions(+), 17 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 034c823ed..b8a6c3134 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.20 +Version: 0.13.0.21 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index f332643fb..a693a4722 100644 --- a/NEWS.md +++ b/NEWS.md @@ -21,6 +21,12 @@ BREAKING CHANGES AND DEPRECATIONS multiple tables is returned. Furthermore, `print_html()` did not work, which was also fixed now. +* `demean()` (and `degroup()`) gets an `append` argument that defaults to `TRUE`, + to append the centered variabled to the original data frame, instead of + returning the de- and group-meaned variables only. Use `append = FALSE` to + for the previous default behaviour (i.e. only returning the newly created + variables). + CHANGES * The `select` argument, which is available in different functions to select diff --git a/R/demean.R b/R/demean.R index b5363edb6..53685628f 100644 --- a/R/demean.R +++ b/R/demean.R @@ -3,10 +3,10 @@ #' @description #' #' `demean()` computes group- and de-meaned versions of a variable that can be -#' used in regression analysis to model the between- and within-subject effect. -#' `degroup()` is more generic in terms of the centering-operation. While -#' `demean()` always uses mean-centering, `degroup()` can also use the mode or -#' median for centering. +#' used in regression analysis to model the between- and within-subject effect +#' (person-mean centering or centering within clusters). `degroup()` is more +#' generic in terms of the centering-operation. While `demean()` always uses +#' mean-centering, `degroup()` can also use the mode or median for centering. #' #' @param x A data frame. #' @param select Character vector (or formula) with names of variables to select @@ -39,6 +39,9 @@ #' names of the group-meaned and de-meaned variables of `x`. By default, #' de-meaned variables will be suffixed with `"_within"` and #' grouped-meaned variables with `"_between"`. +#' @param append Logical, if `TRUE` (default), the group- and de-meaned +#' variables will be appended (column bind) to the original data `x`, +#' thus returning both the original and the de-/group-meaned variables. #' @param add_attributes Logical, if `TRUE`, the returned variables gain #' attributes to indicate the within- and between-effects. This is only #' relevant when printing `model_parameters()` - in such cases, the @@ -283,6 +286,7 @@ demean <- function(x, nested = FALSE, suffix_demean = "_within", suffix_groupmean = "_between", + append = TRUE, add_attributes = TRUE, verbose = TRUE) { degroup( @@ -293,6 +297,7 @@ demean <- function(x, center = "mean", suffix_demean = suffix_demean, suffix_groupmean = suffix_groupmean, + append = append, add_attributes = add_attributes, verbose = verbose ) @@ -308,9 +313,11 @@ degroup <- function(x, center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", + append = TRUE, add_attributes = TRUE, verbose = TRUE) { - # ugly tibbles again... + # ugly tibbles again... but save original data frame + original_data <- x x <- .coerce_to_dataframe(x) center <- match.arg(tolower(center), choices = c("mean", "median", "mode", "min", "max")) @@ -506,7 +513,24 @@ degroup <- function(x, }) } - cbind(group_means, person_means) + # between and within effects + out <- cbind(group_means, person_means) + + # append to original data? + if (isTRUE(append)) { + # check for unique column names + duplicated_columns <- intersect(colnames(out), colnames(original_data)) + if (length(duplicated_columns)) { + insight::format_error(paste0( + "One or more of the centered variables already exist in the orignal data frame: ", # nolint + text_concatenate(duplicated_columns, enclose = "`"), + ". Please rename the affected variable(s) in the original data, or use the arguments `suffix_demean` and `suffix_groupmean` to rename the centered variables." # nolint + )) + } + out <- cbind(original_data, out) + } + + out } diff --git a/man/demean.Rd b/man/demean.Rd index fb4db3a29..d0c280aca 100644 --- a/man/demean.Rd +++ b/man/demean.Rd @@ -13,6 +13,7 @@ demean( nested = FALSE, suffix_demean = "_within", suffix_groupmean = "_between", + append = TRUE, add_attributes = TRUE, verbose = TRUE ) @@ -25,6 +26,7 @@ degroup( center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", + append = TRUE, add_attributes = TRUE, verbose = TRUE ) @@ -37,6 +39,7 @@ detrend( center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", + append = TRUE, add_attributes = TRUE, verbose = TRUE ) @@ -76,6 +79,10 @@ names of the group-meaned and de-meaned variables of \code{x}. By default, de-meaned variables will be suffixed with \code{"_within"} and grouped-meaned variables with \code{"_between"}.} +\item{append}{Logical, if \code{TRUE} (default), the group- and de-meaned +variables will be appended (column bind) to the original data \code{x}, +thus returning both the original and the de-/group-meaned variables.} + \item{add_attributes}{Logical, if \code{TRUE}, the returned variables gain attributes to indicate the within- and between-effects. This is only relevant when printing \code{model_parameters()} - in such cases, the @@ -98,10 +105,10 @@ grouping level, e.g. \code{predictor_L3_between} and \code{predictor_L2_between} } \description{ \code{demean()} computes group- and de-meaned versions of a variable that can be -used in regression analysis to model the between- and within-subject effect. -\code{degroup()} is more generic in terms of the centering-operation. While -\code{demean()} always uses mean-centering, \code{degroup()} can also use the mode or -median for centering. +used in regression analysis to model the between- and within-subject effect +(person-mean centering or centering within clusters). \code{degroup()} is more +generic in terms of the centering-operation. While \code{demean()} always uses +mean-centering, \code{degroup()} can also use the mode or median for centering. } \section{Heterogeneity Bias}{ diff --git a/tests/testthat/_snaps/demean.md b/tests/testthat/_snaps/demean.md index a1c2da4a3..c90d8e87c 100644 --- a/tests/testthat/_snaps/demean.md +++ b/tests/testthat/_snaps/demean.md @@ -52,10 +52,37 @@ 5 -0.2750000 6 -0.4222222 +--- + + Code + head(x) + Output + Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID binary + 1 5.1 3.5 1.4 0.2 setosa 3 0 + 2 4.9 3.0 1.4 0.2 setosa 3 1 + 3 4.7 3.2 1.3 0.2 setosa 3 0 + 4 4.6 3.1 1.5 0.2 setosa 2 1 + 5 5.0 3.6 1.4 0.2 setosa 3 1 + 6 5.4 3.9 1.7 0.4 setosa 2 0 + Sepal.Length_between Petal.Length_between Sepal.Length_within + 1 5.925000 3.527500 -0.8250000 + 2 5.925000 3.527500 -1.0250000 + 3 5.925000 3.527500 -1.2250000 + 4 5.862222 3.951111 -1.2622222 + 5 5.925000 3.527500 -0.9250000 + 6 5.862222 3.951111 -0.4622222 + Petal.Length_within + 1 -2.127500 + 2 -2.127500 + 3 -2.227500 + 4 -2.451111 + 5 -2.127500 + 6 -2.251111 + # demean interaction term Code - demean(dat, select = c("a", "x*y"), by = "ID") + demean(dat, select = c("a", "x*y"), by = "ID", append = FALSE) Output a_between x_y_between a_within x_y_within 1 2.666667 4.666667 -1.6666667 -0.6666667 diff --git a/tests/testthat/test-demean.R b/tests/testthat/test-demean.R index 6e169f9c0..c947285fd 100644 --- a/tests/testthat/test-demean.R +++ b/tests/testthat/test-demean.R @@ -8,13 +8,13 @@ test_that("demean works", { df$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable set.seed(123) - x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID") + x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID", append = FALSE) expect_snapshot(head(x)) set.seed(123) expect_message( { - x <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID") + x <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID", append = FALSE) }, "have been coerced to numeric" ) @@ -23,17 +23,35 @@ test_that("demean works", { set.seed(123) expect_message( { - y <- demean(df, select = ~ Sepal.Length + binary + Species, by = ~ID) + y <- demean(df, select = ~ Sepal.Length + binary + Species, by = ~ID, append = FALSE) }, "have been coerced to numeric" ) expect_message( { - z <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID") + z <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID", append = FALSE) }, "have been coerced to numeric" ) expect_identical(y, z) + + set.seed(123) + x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID") + expect_named( + x, + c( + "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", + "Species", "ID", "binary", "Sepal.Length_between", "Petal.Length_between", + "Sepal.Length_within", "Petal.Length_within" + ) + ) + expect_snapshot(head(x)) + + df$Sepal.Length_within <- df$Sepal.Length + expect_error( + demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID"), + regex = "One or more of" + ) }) test_that("demean interaction term", { @@ -45,7 +63,7 @@ test_that("demean interaction term", { ) set.seed(123) - expect_snapshot(demean(dat, select = c("a", "x*y"), by = "ID")) + expect_snapshot(demean(dat, select = c("a", "x*y"), by = "ID", append = FALSE)) }) test_that("demean shows message if some vars don't exist", {