Skip to content

Commit

Permalink
Warn about complete separation issues (#913)
Browse files Browse the repository at this point in the history
* Warn about complete separation issues

* wording

* minor

* style

* fix

* fixes

* add tests

* lintr

* minor

* lintr, styler

* fix

* fix
  • Loading branch information
strengejacke authored Oct 25, 2023
1 parent cec62fa commit 065bfc4
Show file tree
Hide file tree
Showing 8 changed files with 162 additions and 26 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: parameters
Title: Processing of Model Parameters
Version: 0.21.2.10
Version: 0.21.2.11
Authors@R:
c(person(given = "Daniel",
family = "Lüdecke",
Expand Down
11 changes: 7 additions & 4 deletions R/ci_profile_boot.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
.ci_profiled <- function(model, ci) {
glm_ci <- tryCatch(
{
out <- as.data.frame(stats::confint(model, level = ci), stringsAsFactors = FALSE)
out <- as.data.frame(
suppressWarnings(stats::confint(model, level = ci)),
stringsAsFactors = FALSE
)
names(out) <- c("CI_low", "CI_high")

out$CI <- ci
Expand Down Expand Up @@ -132,9 +135,9 @@
)

param_names <- switch(component,
"conditional" = pars$Parameter,
"zi" = ,
"zero_inflated" = paste0("zi~", pars$Parameter),
conditional = pars$Parameter,
zi = ,
zero_inflated = paste0("zi~", pars$Parameter),
c(
pars$Parameter[pars$Component == "conditional"],
paste0("zi~", pars$Parameter[pars$Component == "zero_inflated"])
Expand Down
46 changes: 36 additions & 10 deletions R/format.R
Original file line number Diff line number Diff line change
Expand Up @@ -306,12 +306,12 @@ format.compare_parameters <- function(x,
ran_pars <- which(x$Effects == "random")

# find all random effect groups
if (!is.null(x$Group)) {
ran_groups <- unique(insight::compact_character(x$Group))
ran_group_rows <- which(nchar(x$Group) > 0)
} else {
if (is.null(x$Group)) {
ran_groups <- NULL
ran_group_rows <- NULL
} else {
ran_groups <- unique(insight::compact_character(x$Group))
ran_group_rows <- which(nchar(x$Group) > 0)
}

for (i in models) {
Expand Down Expand Up @@ -646,10 +646,10 @@ format.parameters_sem <- function(x,
.add_footer_sigma <- function(footer = NULL, digits = 3, sigma = NULL, residual_df = NULL, type = "text") {
if (!is.null(sigma)) {
# format residual df
if (!is.null(residual_df)) {
res_df <- paste0(" (df = ", residual_df, ")")
} else {
if (is.null(residual_df)) {
res_df <- ""
} else {
res_df <- paste0(" (df = ", residual_df, ")")
}

if (type == "text" || type == "markdown") {
Expand Down Expand Up @@ -785,10 +785,10 @@ format.parameters_sem <- function(x,
.add_footer_formula <- function(footer = NULL, model_formula = NULL, n_obs = NULL, type = "text") {
if (!is.null(model_formula)) {
# format n of observations
if (!is.null(n_obs)) {
n <- paste0(" (", n_obs, " Observations)")
} else {
if (is.null(n_obs)) {
n <- ""
} else {
n <- paste0(" (", n_obs, " Observations)")
}

if (type == "text" || type == "markdown") {
Expand Down Expand Up @@ -919,18 +919,44 @@ format.parameters_sem <- function(x,
.print_footer_exp <- function(x) {
if (isTRUE(getOption("parameters_exponentiate", TRUE))) {
msg <- NULL
# we need this to check whether we have extremely large cofficients
if (all(c("Coefficient", "Parameter") %in% colnames(x))) {
spurious_coefficients <- abs(x$Coefficient[!.in_intercepts(x$Parameter)])
} else {
spurious_coefficients <- NULL
}
exponentiate <- .additional_arguments(x, "exponentiate", FALSE)
if (!.is_valid_exponentiate_argument(exponentiate)) {
if (isTRUE(.additional_arguments(x, "log_link", FALSE))) {
msg <- "The model has a log- or logit-link. Consider using `exponentiate = TRUE` to interpret coefficients as ratios." # nolint
# we only check for exp(coef), so exp() here soince coefficients are on logit-scale
spurious_coefficients <- exp(spurious_coefficients)
} else if (isTRUE(.additional_arguments(x, "log_response", FALSE))) {
msg <- "The model has a log-transformed response variable. Consider using `exponentiate = TRUE` to interpret coefficients as ratios." # nolint
# don't show warning about complete separation
spurious_coefficients <- NULL
}
} else if (.is_valid_exponentiate_argument(exponentiate) && isTRUE(.additional_arguments(x, "log_response", FALSE))) { # nolint
msg <- c(
"This model has a log-transformed response variable, and exponentiated parameters are reported.",
"A one-unit increase in the predictor is associated with multiplying the outcome by that predictor's coefficient." # nolint
)
# don't show warning about complete separation
spurious_coefficients <- NULL
}

# following check only for models with logit-link
logit_model <- isTRUE(.additional_arguments(x, "logit_link", FALSE)) ||
isTRUE(attributes(x)$coefficient_name %in% c("Log-Odds", "Odds Ratio"))

# check for complete separation coefficients or possible issues with
# too few data points
if (!is.null(spurious_coefficients) && logit_model) {
if (any(spurious_coefficients > 140)) {
msg <- c(msg, "Note that some coefficients are very large, which may indicate issues with complete separation.") # nolint
} else if (any(spurious_coefficients > 20)) {
msg <- c(msg, "Note that some coefficients are very large, which may indicate issues with (quasi) complete separation. Consider using bias-corrected or penalized regression models.") # nolint
}
}

if (!is.null(msg) && isTRUE(getOption("parameters_warning_exponentiate", TRUE))) {
Expand Down
21 changes: 11 additions & 10 deletions R/utils_model_parameters.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@
attr(params, "ran_pars") <- isFALSE(group_level)
attr(params, "show_summary") <- isTRUE(summary)
attr(params, "log_link") <- isTRUE(grepl("log", info$link_function, fixed = TRUE))
attr(params, "logit_link") <- isTRUE(identical(info$link_function, "logit"))

# add parameters with value and variable
attr(params, "pretty_labels") <- .format_value_labels(params, model)

# use tryCatch, these might fail...
attr(params, "test_statistic") <- .safe(insight::find_statistic(model))
attr(params, "log_response") <- .safe(isTRUE(grepl("log", insight::find_transformation(model), fixed = TRUE)))
attr(params, "log_predictors") <- .safe(any(grepl("log", unlist(insight::find_terms(model)[c("conditional", "zero_inflated", "instruments")]), fixed = TRUE)))
attr(params, "log_predictors") <- .safe(any(grepl("log", unlist(insight::find_terms(model)[c("conditional", "zero_inflated", "instruments")]), fixed = TRUE))) # nolint

# save if model is multivariate response model
if (isTRUE(info$is_multivariate)) {
Expand Down Expand Up @@ -301,13 +302,13 @@
}

# pattern for marginaleffects objects
if (!is.null(attr(params, "coefficient_name"))) {
if (is.null(attr(params, "coefficient_name"))) {
pattern <- "^(Coefficient|Mean|Median|MAP|Std_Coefficient|CI_|Std_CI)"
} else {
pattern <- sprintf(
"^(Coefficient|Mean|Median|MAP|Std_Coefficient|%s|CI_|Std_CI)",
attr(params, "coefficient_name")
)
} else {
pattern <- "^(Coefficient|Mean|Median|MAP|Std_Coefficient|CI_|Std_CI)"
}

columns <- grepl(pattern = pattern, colnames(params))
Expand All @@ -319,10 +320,10 @@
rows <- !tolower(params$Component) %in% c("location", "scale")
} else {
# don't exponentiate dispersion
if (!is.null(params$Component)) {
rows <- !tolower(params$Component) %in% c("dispersion", "residual")
} else {
if (is.null(params$Component)) {
rows <- seq_len(nrow(params))
} else {
rows <- !tolower(params$Component) %in% c("dispersion", "residual")
}
}
params[rows, columns] <- exp(params[rows, columns])
Expand Down Expand Up @@ -362,7 +363,7 @@

#' @keywords internal
.add_anova_attributes <- function(params, model, ci, test = NULL, alternative = NULL, ...) {
dot.arguments <- lapply(match.call(expand.dots = FALSE)$`...`, function(x) x)
dot.arguments <- lapply(match.call(expand.dots = FALSE)$`...`, function(x) x) # nolint

attr(params, "ci") <- ci
attr(params, "model_class") <- class(model)
Expand Down Expand Up @@ -448,8 +449,8 @@
if (verbose) {
not_allowed_string <- datawizard::text_concatenate(not_allowed, enclose = "\"")
insight::format_alert(
sprintf("Following arguments are not supported in `%s()` for models of class `%s` and will be ignored: %s", function_name, model_class, not_allowed_string),
sprintf("Please run `%s()` again without specifying the above mentioned arguments to obtain expected results.", function_name)
sprintf("Following arguments are not supported in `%s()` for models of class `%s` and will be ignored: %s", function_name, model_class, not_allowed_string), # nolint
sprintf("Please run `%s()` again without specifying the above mentioned arguments to obtain expected results.", function_name) # nolint
)
}
dots[not_allowed] <- NULL
Expand Down
62 changes: 62 additions & 0 deletions tests/testthat/_snaps/complete_separation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# print warning about complete separation

Code
print(out)
Output
Parameter | Log-Odds | SE | 95% CI | z | p
------------------------------------------------------------------------------
(Intercept) | -66.10 | 1.83e+05 | [-10644.72, 10512.52] | -3.60e-04 | > .999
x1 | 15.29 | 27362.84 | [ -3122.69, ] | 5.59e-04 | > .999
x2 | 6.24 | 81543.72 | [-12797.28, ] | 7.65e-05 | > .999
Message
Uncertainty intervals (profile-likelihood) and p-values (two-tailed)
computed using a Wald z-distribution approximation.
The model has a log- or logit-link. Consider using `exponentiate =
TRUE` to interpret coefficients as ratios.
Note that some coefficients are very large, which may indicate issues
with complete separation.

---

Code
print(out)
Output
Parameter | Log-Odds | SE | 95% CI | z | p
-------------------------------------------------------------------------
(Intercept) | -83.33 | 15505.03 | [ , 816.56] | -5.37e-03 | 0.996
gear | 21.01 | 3876.26 | [-248.93, ] | 5.42e-03 | 0.996
Message
Uncertainty intervals (profile-likelihood) and p-values (two-tailed)
computed using a Wald z-distribution approximation.
The model has a log- or logit-link. Consider using `exponentiate =
TRUE` to interpret coefficients as ratios.
Note that some coefficients are very large, which may indicate issues
with complete separation.

# print warning about quasi complete separation

Code
print(out)
Output
Parameter | Log-Odds | SE | 95% CI | z | p
------------------------------------------------------------------
(Intercept) | -56.36 | 21.45 | [-114.76, -25.33] | -2.63 | 0.009
qsec | 3.12 | 1.19 | [ 1.40, 6.37] | 2.62 | 0.009
Message
Uncertainty intervals (profile-likelihood) and p-values (two-tailed)
computed using a Wald z-distribution approximation.
The model has a log- or logit-link. Consider using `exponentiate =
TRUE` to interpret coefficients as ratios.
Note that some coefficients are very large, which may indicate issues
with (quasi) complete separation. Consider using bias-corrected or
penalized regression models.

42 changes: 42 additions & 0 deletions tests/testthat/test-complete_separation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
skip_if(getRversion() < "4.0.0")
skip_if_not_installed("withr")

withr::with_options(
list(parameters_warning_exponentiate = TRUE),
{
test_that("print warning about complete separation", {
d_sep <- data.frame(
y = c(0, 0, 0, 0, 1, 1, 1, 1),
x1 = c(1, 2, 3, 3, 5, 6, 10, 11),
x2 = c(3, 2, -1, -1, 2, 4, 1, 0)
)
m_sep <- suppressWarnings(glm(y ~ x1 + x2, data = d_sep, family = binomial))
out <- model_parameters(m_sep)
expect_snapshot(print(out))
})
}
)

withr::with_options(
list(parameters_warning_exponentiate = TRUE),
{
test_that("print warning about complete separation", {
data(mtcars)
m_sep2 <- suppressWarnings(glm(am ~ gear, data = mtcars, family = binomial))
out <- model_parameters(m_sep2)
expect_snapshot(print(out))
})
}
)

withr::with_options(
list(parameters_warning_exponentiate = TRUE),
{
test_that("print warning about quasi complete separation", {
data(mtcars)
m_sep3 <- suppressWarnings(glm(vs ~ qsec, data = mtcars, family = binomial))
out <- model_parameters(m_sep3)
expect_snapshot(print(out))
})
}
)
2 changes: 2 additions & 0 deletions tests/testthat/test-model_parameters.cgam.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ test_that("model_parameters - cgam", {
ran_pars = TRUE,
show_summary = FALSE,
log_link = FALSE,
logit_link = FALSE,
pretty_labels = c(`(Intercept)` = "(Intercept)"),
test_statistic = "t-statistic",
log_response = FALSE,
Expand Down Expand Up @@ -140,6 +141,7 @@ test_that("model_parameters - cgamm", {
ran_pars = TRUE,
show_summary = FALSE,
log_link = FALSE,
logit_link = FALSE,
pretty_labels = c(
`(Intercept)` = "(Intercept)",
`cgam::s.incr(x)` = "cgam::s.incr(x)"
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test-printing.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ skip_if_not_installed("withr")
skip_if(getRversion() < "4.0.0")

withr::with_options(
list("parameters_interaction" = "*"),
list(parameters_interaction = "*"),
{
# Splitting model components ----
test_that("print model with multiple components", {
Expand Down

0 comments on commit 065bfc4

Please sign in to comment.