Skip to content

Commit

Permalink
Merge branch 'main' into rc_datawizard_1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
strengejacke authored Dec 31, 2024
2 parents 052adf2 + ced29bf commit 198e699
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 16 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations

Version: 1.0.0
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ BREAKING CHANGES AND DEPRECATIONS
multiple tables is returned. Furthermore, `print_html()` did not work, which
was also fixed now.

* `demean()` (and `degroup()`) gets an `append` argument that defaults to `TRUE`,
to append the centered variabled to the original data frame, instead of
returning the de- and group-meaned variables only. Use `append = FALSE` to
for the previous default behaviour (i.e. only returning the newly created
variables).

CHANGES

* The `select` argument, which is available in different functions to select
Expand Down
36 changes: 30 additions & 6 deletions R/demean.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
#' @description
#'
#' `demean()` computes group- and de-meaned versions of a variable that can be
#' used in regression analysis to model the between- and within-subject effect.
#' `degroup()` is more generic in terms of the centering-operation. While
#' `demean()` always uses mean-centering, `degroup()` can also use the mode or
#' median for centering.
#' used in regression analysis to model the between- and within-subject effect
#' (person-mean centering or centering within clusters). `degroup()` is more
#' generic in terms of the centering-operation. While `demean()` always uses
#' mean-centering, `degroup()` can also use the mode or median for centering.
#'
#' @param x A data frame.
#' @param select Character vector (or formula) with names of variables to select
Expand Down Expand Up @@ -39,6 +39,9 @@
#' names of the group-meaned and de-meaned variables of `x`. By default,
#' de-meaned variables will be suffixed with `"_within"` and
#' grouped-meaned variables with `"_between"`.
#' @param append Logical, if `TRUE` (default), the group- and de-meaned
#' variables will be appended (column bind) to the original data `x`,
#' thus returning both the original and the de-/group-meaned variables.
#' @param add_attributes Logical, if `TRUE`, the returned variables gain
#' attributes to indicate the within- and between-effects. This is only
#' relevant when printing `model_parameters()` - in such cases, the
Expand Down Expand Up @@ -283,6 +286,7 @@ demean <- function(x,
nested = FALSE,
suffix_demean = "_within",
suffix_groupmean = "_between",
append = TRUE,
add_attributes = TRUE,
verbose = TRUE) {
degroup(
Expand All @@ -293,6 +297,7 @@ demean <- function(x,
center = "mean",
suffix_demean = suffix_demean,
suffix_groupmean = suffix_groupmean,
append = append,
add_attributes = add_attributes,
verbose = verbose
)
Expand All @@ -308,9 +313,11 @@ degroup <- function(x,
center = "mean",
suffix_demean = "_within",
suffix_groupmean = "_between",
append = TRUE,
add_attributes = TRUE,
verbose = TRUE) {
# ugly tibbles again...
# ugly tibbles again... but save original data frame
original_data <- x
x <- .coerce_to_dataframe(x)

center <- match.arg(tolower(center), choices = c("mean", "median", "mode", "min", "max"))
Expand Down Expand Up @@ -506,7 +513,24 @@ degroup <- function(x,
})
}

cbind(group_means, person_means)
# between and within effects
out <- cbind(group_means, person_means)

# append to original data?
if (isTRUE(append)) {
# check for unique column names
duplicated_columns <- intersect(colnames(out), colnames(original_data))
if (length(duplicated_columns)) {
insight::format_error(paste0(
"One or more of the centered variables already exist in the orignal data frame: ", # nolint
text_concatenate(duplicated_columns, enclose = "`"),
". Please rename the affected variable(s) in the original data, or use the arguments `suffix_demean` and `suffix_groupmean` to rename the centered variables." # nolint
))
}
out <- cbind(original_data, out)
}

out
}


Expand Down
15 changes: 11 additions & 4 deletions man/demean.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 28 additions & 1 deletion tests/testthat/_snaps/demean.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,37 @@
5 -0.2750000
6 -0.4222222

---

Code
head(x)
Output
Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID binary
1 5.1 3.5 1.4 0.2 setosa 3 0
2 4.9 3.0 1.4 0.2 setosa 3 1
3 4.7 3.2 1.3 0.2 setosa 3 0
4 4.6 3.1 1.5 0.2 setosa 2 1
5 5.0 3.6 1.4 0.2 setosa 3 1
6 5.4 3.9 1.7 0.4 setosa 2 0
Sepal.Length_between Petal.Length_between Sepal.Length_within
1 5.925000 3.527500 -0.8250000
2 5.925000 3.527500 -1.0250000
3 5.925000 3.527500 -1.2250000
4 5.862222 3.951111 -1.2622222
5 5.925000 3.527500 -0.9250000
6 5.862222 3.951111 -0.4622222
Petal.Length_within
1 -2.127500
2 -2.127500
3 -2.227500
4 -2.451111
5 -2.127500
6 -2.251111

# demean interaction term

Code
demean(dat, select = c("a", "x*y"), by = "ID")
demean(dat, select = c("a", "x*y"), by = "ID", append = FALSE)
Output
a_between x_y_between a_within x_y_within
1 2.666667 4.666667 -1.6666667 -0.6666667
Expand Down
28 changes: 23 additions & 5 deletions tests/testthat/test-demean.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ test_that("demean works", {
df$binary <- as.factor(rbinom(150, 1, 0.35)) # binary variable

set.seed(123)
x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID")
x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID", append = FALSE)
expect_snapshot(head(x))

set.seed(123)
expect_message(
{
x <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID")
x <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID", append = FALSE)
},
"have been coerced to numeric"
)
Expand All @@ -23,17 +23,35 @@ test_that("demean works", {
set.seed(123)
expect_message(
{
y <- demean(df, select = ~ Sepal.Length + binary + Species, by = ~ID)
y <- demean(df, select = ~ Sepal.Length + binary + Species, by = ~ID, append = FALSE)
},
"have been coerced to numeric"
)
expect_message(
{
z <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID")
z <- demean(df, select = c("Sepal.Length", "binary", "Species"), by = "ID", append = FALSE)
},
"have been coerced to numeric"
)
expect_identical(y, z)

set.seed(123)
x <- demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID")
expect_named(
x,
c(
"Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width",
"Species", "ID", "binary", "Sepal.Length_between", "Petal.Length_between",
"Sepal.Length_within", "Petal.Length_within"
)
)
expect_snapshot(head(x))

df$Sepal.Length_within <- df$Sepal.Length
expect_error(
demean(df, select = c("Sepal.Length", "Petal.Length"), by = "ID"),
regex = "One or more of"
)
})

test_that("demean interaction term", {
Expand All @@ -45,7 +63,7 @@ test_that("demean interaction term", {
)

set.seed(123)
expect_snapshot(demean(dat, select = c("a", "x*y"), by = "ID"))
expect_snapshot(demean(dat, select = c("a", "x*y"), by = "ID", append = FALSE))
})

test_that("demean shows message if some vars don't exist", {
Expand Down

0 comments on commit 198e699

Please sign in to comment.