From ced29bfddb3cb0df5465a3cc808c72b83df108a0 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Tue, 31 Dec 2024 12:13:59 +0100
Subject: [PATCH] Append results of `demean()` to original df? (#579)

* Append results of `demean()` to original df?
Fixes #578

* add test
---
 DESCRIPTION                     |  2 +-
 NEWS.md                         |  6 ++++++
 R/demean.R                      | 36 +++++++++++++++++++++++++++------
 man/demean.Rd                   | 15 ++++++++++----
 tests/testthat/_snaps/demean.md | 29 +++++++++++++++++++++++++-
 tests/testthat/test-demean.R    | 28 ++++++++++++++++++++-----
 6 files changed, 99 insertions(+), 17 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 034c823ed..b8a6c3134 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.20
+Version: 0.13.0.21
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),
diff --git a/NEWS.md b/NEWS.md
index f332643fb..a693a4722 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -21,6 +21,12 @@ BREAKING CHANGES AND DEPRECATIONS
   multiple tables is returned. Furthermore, `print_html()` did not work, which
   was also fixed now.
 
+* `demean()` (and `degroup()`) gets an `append` argument that defaults to `TRUE`,
+  to append the centered variables to the original data frame, instead of
+  returning the de- and group-meaned variables only. Use `append = FALSE` to
+  restore the previous default behaviour (i.e. only returning the newly created
+  variables).
+
 CHANGES
 
 * The `select` argument, which is available in different functions to select
diff --git a/R/demean.R b/R/demean.R
index b5363edb6..53685628f 100644
--- a/R/demean.R
+++ b/R/demean.R
@@ -3,10 +3,10 @@
 #' @description
 #'
 #' `demean()` computes group- and de-meaned versions of a variable that can be
-#' used in regression analysis to model the between- and within-subject effect.
-#' `degroup()` is more generic in terms of the centering-operation. While
-#' `demean()` always uses mean-centering, `degroup()` can also use the mode or
-#' median for centering.
+#' used in regression analysis to model the between- and within-subject effect
+#' (person-mean centering or centering within clusters). `degroup()` is more
+#' generic in terms of the centering-operation. While `demean()` always uses
+#' mean-centering, `degroup()` can also use the mode or median for centering.
 #'
 #' @param x A data frame.
 #' @param select Character vector (or formula) with names of variables to select
@@ -39,6 +39,9 @@
 #'   names of the group-meaned and de-meaned variables of `x`. By default,
 #'   de-meaned variables will be suffixed with `"_within"` and
 #'   grouped-meaned variables with `"_between"`.
+#' @param append Logical, if `TRUE` (default), the group- and de-meaned
+#'   variables will be appended (column bind) to the original data `x`,
+#'   thus returning both the original and the de-/group-meaned variables.
 #' @param add_attributes Logical, if `TRUE`, the returned variables gain
 #'   attributes to indicate the within- and between-effects. This is only
 #'   relevant when printing `model_parameters()` - in such cases, the
@@ -283,6 +286,7 @@ demean <- function(x,
                    nested = FALSE,
                    suffix_demean = "_within",
                    suffix_groupmean = "_between",
+                   append = TRUE,
                    add_attributes = TRUE,
                    verbose = TRUE) {
   degroup(
@@ -293,6 +297,7 @@ demean <- function(x,
     center = "mean",
     suffix_demean = suffix_demean,
     suffix_groupmean = suffix_groupmean,
+    append = append,
     add_attributes = add_attributes,
     verbose = verbose
   )
@@ -308,9 +313,11 @@ degroup <- function(x,
                     center = "mean",
                     suffix_demean = "_within",
                     suffix_groupmean = "_between",
+                    append = TRUE,
                     add_attributes = TRUE,
                     verbose = TRUE) {
-  # ugly tibbles again...
+  # ugly tibbles again... but save original data frame
+  original_data <- x
   x <- .coerce_to_dataframe(x)
 
   center <- match.arg(tolower(center), choices = c("mean", "median", "mode", "min", "max"))
@@ -506,7 +513,24 @@ degroup <- function(x,
     })
   }
 
-  cbind(group_means, person_means)
+  # between and within effects
+  out <- cbind(group_means, person_means)
+
+  # append to original data?
+  if (isTRUE(append)) {
+    # check for unique column names
+    duplicated_columns <- intersect(colnames(out), colnames(original_data))
+    if (length(duplicated_columns)) {
+      insight::format_error(paste0(
+        "One or more of the centered variables already exist in the orignal data frame: ", # nolint
+        text_concatenate(duplicated_columns, enclose = "`"),
+        ". Please rename the affected variable(s) in the original data, or use the arguments `suffix_demean` and `suffix_groupmean` to rename the centered variables." # nolint
+      ))
+    }
+    out <- cbind(original_data, out)
+  }
+
+  out
 }
 
 
diff --git a/man/demean.Rd b/man/demean.Rd
index fb4db3a29..d0c280aca 100644
--- a/man/demean.Rd
+++ b/man/demean.Rd
@@ -13,6 +13,7 @@ demean(
   nested = FALSE,
   suffix_demean = "_within",
   suffix_groupmean = "_between",
+  append = TRUE,
   add_attributes = TRUE,
   verbose = TRUE
 )
@@ -25,6 +26,7 @@ degroup(
   center = "mean",
   suffix_demean = "_within",
   suffix_groupmean = "_between",
+  append = TRUE,
   add_attributes = TRUE,
   verbose = TRUE
 )
@@ -37,6 +39,7 @@ detrend(
   center = "mean",
   suffix_demean = "_within",
   suffix_groupmean = "_between",
+  append = TRUE,
   add_attributes = TRUE,
   verbose = TRUE
 )
@@ -76,6 +79,10 @@ names of the group-meaned and de-meaned variables of \code{x}. By default,
 de-meaned variables will be suffixed with \code{"_within"} and
 grouped-meaned variables with \code{"_between"}.}
 
+\item{append}{Logical, if \code{TRUE} (default), the group- and de-meaned
+variables will be appended (column bind) to the original data \code{x},
+thus returning both the original and the de-/group-meaned variables.}
+
 \item{add_attributes}{Logical, if \code{TRUE}, the returned variables gain
 attributes to indicate the within- and between-effects. This is only
 relevant when printing \code{model_parameters()} - in such cases, the
@@ -98,10 +105,10 @@ grouping level, e.g. \code{predictor_L3_between} and \code{predictor_L2_between}
 }
 \description{
 \code{demean()} computes group- and de-meaned versions of a variable that can be
-used in regression analysis to model the between- and within-subject effect.
-\code{degroup()} is more generic in terms of the centering-operation. While
-\code{demean()} always uses mean-centering, \code{degroup()} can also use the mode or
-median for centering.
+used in regression analysis to model the between- and within-subject effect
+(person-mean centering or centering within clusters). \code{degroup()} is more
+generic in terms of the centering-operation. While \code{demean()} always uses
+mean-centering, \code{degroup()} can also use the mode or median for centering.
 }
 \section{Heterogeneity Bias}{
 
diff --git a/tests/testthat/_snaps/demean.md b/tests/testthat/_snaps/demean.md
index a1c2da4a3..c90d8e87c 100644
--- a/tests/testthat/_snaps/demean.md
+++ b/tests/testthat/_snaps/demean.md
@@ -52,10 +52,37 @@
       5               -0.2750000
       6               -0.4222222
 
+---
+
+    Code
+      head(x)
+    Output
+        Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID binary
+      1          5.1         3.5          1.4         0.2  setosa  3      0
+      2          4.9         3.0          1.4         0.2  setosa  3      1
+      3          4.7         3.2          1.3         0.2  setosa  3      0
+      4          4.6         3.1          1.5         0.2  setosa  2      1
+      5          5.0         3.6          1.4         0.2  setosa  3      1
+      6          5.4         3.9          1.7         0.4  setosa  2      0
+        Sepal.Length_between Petal.Length_between Sepal.Length_within
+      1             5.925000             3.527500          -0.8250000
+      2             5.925000             3.527500          -1.0250000
+      3             5.925000             3.527500          -1.2250000
+      4             5.862222             3.951111          -1.2622222
+      5             5.925000             3.527500          -0.9250000
+      6             5.862222             3.951111          -0.4622222
+        Petal.Length_within
+      1           -2.127500
+      2           -2.127500
+      3           -2.227500
+      4           -2.451111
+      5           -2.127500
+      6           -2.251111
+
 # demean interaction term
 
     Code
-      demean(dat, select = c("a", "x*y"), by = "ID")
+      demean(dat, select = c("a", "x*y"), by = "ID", append = FALSE)
     Output
         a_between x_y_between   a_within x_y_within
       1  2.666667    4.666667 -1.6666667 -0.6666667
diff --git a/tests/testthat/test-demean.R b/tests/testthat/test-demean.R
index 6e169f9c0..c947285fd 100644
--- a/tests/testthat/test-demean.R
+++ b/tests/testthat/test-demean.R
@@ -8,13 +8,13 @@ test_that("demean works", {
   df$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable
 
   set.seed(123)
-  x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID")
+  x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID", append = FALSE)
   expect_snapshot(head(x))
 
   set.seed(123)
   expect_message(
     {
-      x <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID")
+      x <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID", append = FALSE)
     },
     "have been coerced to numeric"
   )
@@ -23,17 +23,35 @@ test_that("demean works", {
   set.seed(123)
   expect_message(
     {
-      y <- demean(df, select = ~ Sepal.Length + binary + Species, by = ~ID)
+      y <- demean(df, select = ~ Sepal.Length + binary + Species, by = ~ID, append = FALSE)
     },
     "have been coerced to numeric"
   )
   expect_message(
     {
-      z <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID")
+      z <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID", append = FALSE)
     },
     "have been coerced to numeric"
   )
   expect_identical(y, z)
+
+  set.seed(123)
+  x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID")
+  expect_named(
+    x,
+    c(
+      "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width",
+      "Species", "ID", "binary", "Sepal.Length_between", "Petal.Length_between",
+      "Sepal.Length_within", "Petal.Length_within"
+    )
+  )
+  expect_snapshot(head(x))
+
+  df$Sepal.Length_within <- df$Sepal.Length
+  expect_error(
+    demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID"),
+    regex = "One or more of"
+  )
 })
 
 test_that("demean interaction term", {
@@ -45,7 +63,7 @@ test_that("demean interaction term", {
   )
 
   set.seed(123)
-  expect_snapshot(demean(dat, select = c("a", "x*y"), by = "ID"))
+  expect_snapshot(demean(dat, select = c("a", "x*y"), by = "ID", append = FALSE))
 })
 
 test_that("demean shows message if some vars don't exist", {