Merge branch 'main' into rc_datawizard_1.0.0

easystats · Dec 31, 2024 · 198e699 · 198e699
2 parents 052adf2 + ced29bf
commit 198e699
Show file tree

Hide file tree

Showing 6 changed files with 99 additions and 16 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
+
 Version: 1.0.0
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",

diff --git a/NEWS.md b/NEWS.md
@@ -21,6 +21,12 @@ BREAKING CHANGES AND DEPRECATIONS
   multiple tables is returned. Furthermore, `print_html()` did not work, which
   was also fixed now.
 
+* `demean()` (and `degroup()`) gets an `append` argument that defaults to `TRUE`,
+  to append the centered variabled to the original data frame, instead of
+  returning the de- and group-meaned variables only. Use `append = FALSE` to
+  for the previous default behaviour (i.e. only returning the newly created
+  variables).
+
 CHANGES
 
 * The `select` argument, which is available in different functions to select

diff --git a/R/demean.R b/R/demean.R
@@ -3,10 +3,10 @@
 #' @description
 #'
 #' `demean()` computes group- and de-meaned versions of a variable that can be
-#' used in regression analysis to model the between- and within-subject effect.
-#' `degroup()` is more generic in terms of the centering-operation. While
-#' `demean()` always uses mean-centering, `degroup()` can also use the mode or
-#' median for centering.
+#' used in regression analysis to model the between- and within-subject effect
+#' (person-mean centering or centering within clusters). `degroup()` is more
+#' generic in terms of the centering-operation. While `demean()` always uses
+#' mean-centering, `degroup()` can also use the mode or median for centering.
 #'
 #' @param x A data frame.
 #' @param select Character vector (or formula) with names of variables to select
@@ -39,6 +39,9 @@
 #'   names of the group-meaned and de-meaned variables of `x`. By default,
 #'   de-meaned variables will be suffixed with `"_within"` and
 #'   grouped-meaned variables with `"_between"`.
+#' @param append Logical, if `TRUE` (default), the group- and de-meaned
+#'   variables will be appended (column bind) to the original data `x`,
+#'   thus returning both the original and the de-/group-meaned variables.
 #' @param add_attributes Logical, if `TRUE`, the returned variables gain
 #'   attributes to indicate the within- and between-effects. This is only
 #'   relevant when printing `model_parameters()` - in such cases, the
@@ -283,6 +286,7 @@ demean <- function(x,
                    nested = FALSE,
                    suffix_demean = "_within",
                    suffix_groupmean = "_between",
+                   append = TRUE,
                    add_attributes = TRUE,
                    verbose = TRUE) {
   degroup(
@@ -293,6 +297,7 @@ demean <- function(x,
     center = "mean",
     suffix_demean = suffix_demean,
     suffix_groupmean = suffix_groupmean,
+    append = append,
     add_attributes = add_attributes,
     verbose = verbose
   )
@@ -308,9 +313,11 @@ degroup <- function(x,
                     center = "mean",
                     suffix_demean = "_within",
                     suffix_groupmean = "_between",
+                    append = TRUE,
                     add_attributes = TRUE,
                     verbose = TRUE) {
-  # ugly tibbles again...
+  # ugly tibbles again... but save original data frame
+  original_data <- x
   x <- .coerce_to_dataframe(x)
 
   center <- match.arg(tolower(center), choices = c("mean", "median", "mode", "min", "max"))
@@ -506,7 +513,24 @@ degroup <- function(x,
     })
   }
 
-  cbind(group_means, person_means)
+  # between and within effects
+  out <- cbind(group_means, person_means)
+
+  # append to original data?
+  if (isTRUE(append)) {
+    # check for unique column names
+    duplicated_columns <- intersect(colnames(out), colnames(original_data))
+    if (length(duplicated_columns)) {
+      insight::format_error(paste0(
+        "One or more of the centered variables already exist in the orignal data frame: ", # nolint
+        text_concatenate(duplicated_columns, enclose = "`"),
+        ". Please rename the affected variable(s) in the original data, or use the arguments `suffix_demean` and `suffix_groupmean` to rename the centered variables." # nolint
+      ))
+    }
+    out <- cbind(original_data, out)
+  }
+
+  out
 }
 
 

diff --git a/man/demean.Rd b/man/demean.Rd
diff --git a/tests/testthat/_snaps/demean.md b/tests/testthat/_snaps/demean.md
@@ -52,10 +52,37 @@
       5               -0.2750000
       6               -0.4222222
 
+---
+
+    Code
+      head(x)
+    Output
+        Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID binary
+      1          5.1         3.5          1.4         0.2  setosa  3      0
+      2          4.9         3.0          1.4         0.2  setosa  3      1
+      3          4.7         3.2          1.3         0.2  setosa  3      0
+      4          4.6         3.1          1.5         0.2  setosa  2      1
+      5          5.0         3.6          1.4         0.2  setosa  3      1
+      6          5.4         3.9          1.7         0.4  setosa  2      0
+        Sepal.Length_between Petal.Length_between Sepal.Length_within
+      1             5.925000             3.527500          -0.8250000
+      2             5.925000             3.527500          -1.0250000
+      3             5.925000             3.527500          -1.2250000
+      4             5.862222             3.951111          -1.2622222
+      5             5.925000             3.527500          -0.9250000
+      6             5.862222             3.951111          -0.4622222
+        Petal.Length_within
+      1           -2.127500
+      2           -2.127500
+      3           -2.227500
+      4           -2.451111
+      5           -2.127500
+      6           -2.251111
+
 # demean interaction term
 
     Code
-      demean(dat, select = c("a", "x*y"), by = "ID")
+      demean(dat, select = c("a", "x*y"), by = "ID", append = FALSE)
     Output
         a_between x_y_between   a_within x_y_within
       1  2.666667    4.666667 -1.6666667 -0.6666667

diff --git a/tests/testthat/test-demean.R b/tests/testthat/test-demean.R
@@ -8,13 +8,13 @@ test_that("demean works", {
   df$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable
 
   set.seed(123)
-  x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID")
+  x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID", append = FALSE)
   expect_snapshot(head(x))
 
   set.seed(123)
   expect_message(
     {
-      x <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID")
+      x <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID", append = FALSE)
     },
     "have been coerced to numeric"
   )
@@ -23,17 +23,35 @@ test_that("demean works", {
   set.seed(123)
   expect_message(
     {
-      y <- demean(df, select = ~ Sepal.Length + binary + Species, by = ~ID)
+      y <- demean(df, select = ~ Sepal.Length + binary + Species, by = ~ID, append = FALSE)
     },
     "have been coerced to numeric"
   )
   expect_message(
     {
-      z <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID")
+      z <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID", append = FALSE)
     },
     "have been coerced to numeric"
   )
   expect_identical(y, z)
+
+  set.seed(123)
+  x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID")
+  expect_named(
+    x,
+    c(
+      "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width",
+      "Species", "ID", "binary", "Sepal.Length_between", "Petal.Length_between",
+      "Sepal.Length_within", "Petal.Length_within"
+    )
+  )
+  expect_snapshot(head(x))
+
+  df$Sepal.Length_within <- df$Sepal.Length
+  expect_error(
+    demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID"),
+    regex = "One or more of"
+  )
 })
 
 test_that("demean interaction term", {
@@ -45,7 +63,7 @@ test_that("demean interaction term", {
   )
 
   set.seed(123)
-  expect_snapshot(demean(dat, select = c("a", "x*y"), by = "ID"))
+  expect_snapshot(demean(dat, select = c("a", "x*y"), by = "ID", append = FALSE))
 })
 
 test_that("demean shows message if some vars don't exist", {