diff --git a/R/rescale_weights.R b/R/rescale_weights.R index 88069c225..1c0e2c366 100644 --- a/R/rescale_weights.R +++ b/R/rescale_weights.R @@ -107,7 +107,7 @@ #' \donttest{ #' # compare different methods, using multilevel-Poisson regression #' -#' d <- rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR") +#' d <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") #' result1 <- lme4::glmer( #' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), #' family = poisson(), @@ -123,7 +123,7 @@ #' #' d <- rescale_weights( #' nhanes_sample, -#' probability_weights = "WTINT2YR", +#' "WTINT2YR", #' method = "kish" #' ) #' result3 <- lme4::glmer( @@ -140,8 +140,8 @@ #' } #' @export rescale_weights <- function(data, - by = NULL, probability_weights = NULL, + by = NULL, nest = FALSE, method = "carle") { method <- insight::validate_argument(method, c("carle", "kish")) @@ -205,22 +205,34 @@ rescale_weights <- function(data, ), .misspelled_string(colnames(data_tmp), dont_exist, "Possibly misspelled?") ) + } else { + # if `by` = NULL, we create a dummy group + by <- "tmp_klish_by" + data_tmp[[by]] <- 1 } - p_weights <- data_tmp[[probability_weights]] - # design effect according to Kish - deff <- mean(p_weights^2) / (mean(p_weights)^2) - # rescale weights, so their mean is 1 - z_weights <- p_weights * (1 / mean(p_weights)) - # divide weights by design effect - data$rescaled_weights <- NA_real_ - data$rescaled_weights[weight_non_na] <- z_weights / deff - # restore original order - x <- x[order(x$.bamboozled), ] - x$.bamboozled <- NULL + # split into groups, and calculate weights + out <- lapply(split(data_tmp, data_tmp$by), function(group_data) { + p_weights <- group_data[[probability_weights]] + # design effect according to Kish + deff <- mean(p_weights^2) / (mean(p_weights)^2) + # rescale weights, so their mean is 1 + z_weights <- p_weights * (1 / mean(p_weights)) + # divide weights by design effect + group_data$rescaled_weights <- z_weights / deff + group_data + }) + + # bind data + result <- do.call(rbind, out) + + # restore original order, remove dummy variables + result <- result[order(result$.bamboozled), ] + result$.bamboozled <- NULL + result$tmp_klish_by # return result - data + result } diff --git a/man/rescale_weights.Rd b/man/rescale_weights.Rd index 912f163a9..5c631b90c 100644 --- a/man/rescale_weights.Rd +++ b/man/rescale_weights.Rd @@ -6,8 +6,8 @@ \usage{ rescale_weights( data, - by = NULL, probability_weights = NULL, + by = NULL, nest = FALSE, method = "carle" ) @@ -15,6 +15,9 @@ rescale_weights( \arguments{ \item{data}{A data frame.} +\item{probability_weights}{Variable indicating the probability (design or +sampling) weights of the survey data (level-1-weight).} + \item{by}{Variable names (as character vector, or as formula), indicating the grouping structure (strata) of the survey data (level-2-cluster variable). It is also possible to create weights for multiple group @@ -22,9 +25,6 @@ variables; in such cases, each created weighting variable will be suffixed by the name of the group variable. Argument \code{by} only applies to the default rescaling-method (\code{method = "carle"}), not to \code{method = "kish"}.} -\item{probability_weights}{Variable indicating the probability (design or -sampling) weights of the survey data (level-1-weight).} - \item{nest}{Logical, if \code{TRUE} and \code{by} indicates at least two group variables, then groups are "nested", i.e. groups are now a combination from each group level of the variables in \code{by}.} @@ -118,7 +118,7 @@ head(x) \donttest{ # compare different methods, using multilevel-Poisson regression -d <- rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR") +d <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") result1 <- lme4::glmer( total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), family = poisson(), @@ -134,7 +134,7 @@ result2 <- lme4::glmer( d <- rescale_weights( nhanes_sample, - probability_weights = "WTINT2YR", + "WTINT2YR", method = "kish" ) result3 <- lme4::glmer( diff --git a/tests/testthat/test-rescale_weights.R b/tests/testthat/test-rescale_weights.R index bea16cd7d..c02991809 100644 --- a/tests/testthat/test-rescale_weights.R +++ b/tests/testthat/test-rescale_weights.R @@ -3,16 +3,16 @@ test_that("rescale_weights works as expected", { # convert tibble into data frame, so check-hard GHA works nhanes_sample <- as.data.frame(nhanes_sample) - expect_snapshot(head(rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR"))) + expect_snapshot(head(rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA"))) - expect_snapshot(head(rescale_weights(nhanes_sample, c("SDMVSTRA", "SDMVPSU"), "WTINT2YR"))) + expect_snapshot(head(rescale_weights(nhanes_sample, "WTINT2YR", c("SDMVSTRA", "SDMVPSU")))) expect_snapshot(head(rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish"))) - out <- rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR") + out <- rescale_weights(nhanes_sample, "WTINT2YR", "SDMVSTRA") expect_equal(sum(out$rescaled_weights_a), 2992, tolerance = 1e-3) expect_equal(sum(out$rescaled_weights_b), 2244.71451, tolerance = 1e-3) - out <- rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish") + out <- rescale_weights(nhanes_sample, "WTINT2YR", method = "kish") expect_equal(sum(out$rescaled_weights), 2162.53961, tolerance = 1e-3) })