Warn about complete separation issues (#913)

* Warn about complete separation issues * wording * minor * style * fix * fixes * add tests * lintr * minor * lintr, styler * fix * fix
easystats · Oct 25, 2023 · 065bfc4 · 065bfc4
1 parent cec62fa
commit 065bfc4
Show file tree

Hide file tree

Showing 8 changed files with 162 additions and 26 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: parameters
 Title: Processing of Model Parameters
-Version: 0.21.2.10
+Version: 0.21.2.11
 Authors@R:
     c(person(given = "Daniel",
              family = "Lüdecke",

diff --git a/R/ci_profile_boot.R b/R/ci_profile_boot.R
@@ -1,7 +1,10 @@
 .ci_profiled <- function(model, ci) {
   glm_ci <- tryCatch(
     {
-      out <- as.data.frame(stats::confint(model, level = ci), stringsAsFactors = FALSE)
+      out <- as.data.frame(
+        suppressWarnings(stats::confint(model, level = ci)),
+        stringsAsFactors = FALSE
+      )
       names(out) <- c("CI_low", "CI_high")
 
       out$CI <- ci
@@ -132,9 +135,9 @@
   )
 
   param_names <- switch(component,
-    "conditional" = pars$Parameter,
-    "zi" = ,
-    "zero_inflated" = paste0("zi~", pars$Parameter),
+    conditional = pars$Parameter,
+    zi = ,
+    zero_inflated = paste0("zi~", pars$Parameter),
     c(
       pars$Parameter[pars$Component == "conditional"],
       paste0("zi~", pars$Parameter[pars$Component == "zero_inflated"])

diff --git a/R/format.R b/R/format.R
@@ -306,12 +306,12 @@ format.compare_parameters <- function(x,
   ran_pars <- which(x$Effects == "random")
 
   # find all random effect groups
-  if (!is.null(x$Group)) {
-    ran_groups <- unique(insight::compact_character(x$Group))
-    ran_group_rows <- which(nchar(x$Group) > 0)
-  } else {
+  if (is.null(x$Group)) {
     ran_groups <- NULL
     ran_group_rows <- NULL
+  } else {
+    ran_groups <- unique(insight::compact_character(x$Group))
+    ran_group_rows <- which(nchar(x$Group) > 0)
   }
 
   for (i in models) {
@@ -646,10 +646,10 @@ format.parameters_sem <- function(x,
 .add_footer_sigma <- function(footer = NULL, digits = 3, sigma = NULL, residual_df = NULL, type = "text") {
   if (!is.null(sigma)) {
     # format residual df
-    if (!is.null(residual_df)) {
-      res_df <- paste0(" (df = ", residual_df, ")")
-    } else {
+    if (is.null(residual_df)) {
       res_df <- ""
+    } else {
+      res_df <- paste0(" (df = ", residual_df, ")")
     }
 
     if (type == "text" || type == "markdown") {
@@ -785,10 +785,10 @@ format.parameters_sem <- function(x,
 .add_footer_formula <- function(footer = NULL, model_formula = NULL, n_obs = NULL, type = "text") {
   if (!is.null(model_formula)) {
     # format n of observations
-    if (!is.null(n_obs)) {
-      n <- paste0(" (", n_obs, " Observations)")
-    } else {
+    if (is.null(n_obs)) {
       n <- ""
+    } else {
+      n <- paste0(" (", n_obs, " Observations)")
     }
 
     if (type == "text" || type == "markdown") {
@@ -919,18 +919,44 @@ format.parameters_sem <- function(x,
 .print_footer_exp <- function(x) {
   if (isTRUE(getOption("parameters_exponentiate", TRUE))) {
     msg <- NULL
+    # we need this to check whether we have extremely large cofficients
+    if (all(c("Coefficient", "Parameter") %in% colnames(x))) {
+      spurious_coefficients <- abs(x$Coefficient[!.in_intercepts(x$Parameter)])
+    } else {
+      spurious_coefficients <- NULL
+    }
     exponentiate <- .additional_arguments(x, "exponentiate", FALSE)
     if (!.is_valid_exponentiate_argument(exponentiate)) {
       if (isTRUE(.additional_arguments(x, "log_link", FALSE))) {
         msg <- "The model has a log- or logit-link. Consider using `exponentiate = TRUE` to interpret coefficients as ratios." # nolint
+        # we only check for exp(coef), so exp() here soince coefficients are on logit-scale
+        spurious_coefficients <- exp(spurious_coefficients)
       } else if (isTRUE(.additional_arguments(x, "log_response", FALSE))) {
         msg <- "The model has a log-transformed response variable. Consider using `exponentiate = TRUE` to interpret coefficients as ratios." # nolint
+        # don't show warning about complete separation
+        spurious_coefficients <- NULL
       }
     } else if (.is_valid_exponentiate_argument(exponentiate) && isTRUE(.additional_arguments(x, "log_response", FALSE))) { # nolint
       msg <- c(
         "This model has a log-transformed response variable, and exponentiated parameters are reported.",
         "A one-unit increase in the predictor is associated with multiplying the outcome by that predictor's coefficient." # nolint
       )
+      # don't show warning about complete separation
+      spurious_coefficients <- NULL
+    }
+
+    # following check only for models with logit-link
+    logit_model <- isTRUE(.additional_arguments(x, "logit_link", FALSE)) ||
+      isTRUE(attributes(x)$coefficient_name %in% c("Log-Odds", "Odds Ratio"))
+
+    # check for complete separation coefficients or possible issues with
+    # too few data points
+    if (!is.null(spurious_coefficients) && logit_model) {
+      if (any(spurious_coefficients > 140)) {
+        msg <- c(msg, "Note that some coefficients are very large, which may indicate issues with complete separation.") # nolint
+      } else if (any(spurious_coefficients > 20)) {
+        msg <- c(msg, "Note that some coefficients are very large, which may indicate issues with (quasi) complete separation. Consider using bias-corrected or penalized regression models.") # nolint
+      }
     }
 
     if (!is.null(msg) && isTRUE(getOption("parameters_warning_exponentiate", TRUE))) {

diff --git a/R/utils_model_parameters.R b/R/utils_model_parameters.R
@@ -56,14 +56,15 @@
   attr(params, "ran_pars") <- isFALSE(group_level)
   attr(params, "show_summary") <- isTRUE(summary)
   attr(params, "log_link") <- isTRUE(grepl("log", info$link_function, fixed = TRUE))
+  attr(params, "logit_link") <- isTRUE(identical(info$link_function, "logit"))
 
   # add parameters with value and variable
   attr(params, "pretty_labels") <- .format_value_labels(params, model)
 
   # use tryCatch, these might fail...
   attr(params, "test_statistic") <- .safe(insight::find_statistic(model))
   attr(params, "log_response") <- .safe(isTRUE(grepl("log", insight::find_transformation(model), fixed = TRUE)))
-  attr(params, "log_predictors") <- .safe(any(grepl("log", unlist(insight::find_terms(model)[c("conditional", "zero_inflated", "instruments")]), fixed = TRUE)))
+  attr(params, "log_predictors") <- .safe(any(grepl("log", unlist(insight::find_terms(model)[c("conditional", "zero_inflated", "instruments")]), fixed = TRUE))) # nolint
 
   # save if model is multivariate response model
   if (isTRUE(info$is_multivariate)) {
@@ -301,13 +302,13 @@
   }
 
   # pattern for marginaleffects objects
-  if (!is.null(attr(params, "coefficient_name"))) {
+  if (is.null(attr(params, "coefficient_name"))) {
+    pattern <- "^(Coefficient|Mean|Median|MAP|Std_Coefficient|CI_|Std_CI)"
+  } else {
     pattern <- sprintf(
       "^(Coefficient|Mean|Median|MAP|Std_Coefficient|%s|CI_|Std_CI)",
       attr(params, "coefficient_name")
     )
-  } else {
-    pattern <- "^(Coefficient|Mean|Median|MAP|Std_Coefficient|CI_|Std_CI)"
   }
 
   columns <- grepl(pattern = pattern, colnames(params))
@@ -319,10 +320,10 @@
       rows <- !tolower(params$Component) %in% c("location", "scale")
     } else {
       # don't exponentiate dispersion
-      if (!is.null(params$Component)) {
-        rows <- !tolower(params$Component) %in% c("dispersion", "residual")
-      } else {
+      if (is.null(params$Component)) {
         rows <- seq_len(nrow(params))
+      } else {
+        rows <- !tolower(params$Component) %in% c("dispersion", "residual")
       }
     }
     params[rows, columns] <- exp(params[rows, columns])
@@ -362,7 +363,7 @@
 
 #' @keywords internal
 .add_anova_attributes <- function(params, model, ci, test = NULL, alternative = NULL, ...) {
-  dot.arguments <- lapply(match.call(expand.dots = FALSE)$`...`, function(x) x)
+  dot.arguments <- lapply(match.call(expand.dots = FALSE)$`...`, function(x) x) # nolint
 
   attr(params, "ci") <- ci
   attr(params, "model_class") <- class(model)
@@ -448,8 +449,8 @@
     if (verbose) {
       not_allowed_string <- datawizard::text_concatenate(not_allowed, enclose = "\"")
       insight::format_alert(
-        sprintf("Following arguments are not supported in `%s()` for models of class `%s` and will be ignored: %s", function_name, model_class, not_allowed_string),
-        sprintf("Please run `%s()` again without specifying the above mentioned arguments to obtain expected results.", function_name)
+        sprintf("Following arguments are not supported in `%s()` for models of class `%s` and will be ignored: %s", function_name, model_class, not_allowed_string), # nolint
+        sprintf("Please run `%s()` again without specifying the above mentioned arguments to obtain expected results.", function_name) # nolint
       )
     }
     dots[not_allowed] <- NULL

diff --git a/tests/testthat/_snaps/complete_separation.md b/tests/testthat/_snaps/complete_separation.md
@@ -0,0 +1,62 @@
+# print warning about complete separation
+
+    Code
+      print(out)
+    Output
+      Parameter   | Log-Odds |       SE |                95% CI |         z |      p
+      ------------------------------------------------------------------------------
+      (Intercept) |   -66.10 | 1.83e+05 | [-10644.72, 10512.52] | -3.60e-04 | > .999
+      x1          |    15.29 | 27362.84 | [ -3122.69,         ] |  5.59e-04 | > .999
+      x2          |     6.24 | 81543.72 | [-12797.28,         ] |  7.65e-05 | > .999
+    Message
+      
+      Uncertainty intervals (profile-likelihood) and p-values (two-tailed)
+        computed using a Wald z-distribution approximation.
+      
+      The model has a log- or logit-link. Consider using `exponentiate =
+        TRUE` to interpret coefficients as ratios.
+        
+      Note that some coefficients are very large, which may indicate issues
+        with complete separation.
+
+---
+
+    Code
+      print(out)
+    Output
+      Parameter   | Log-Odds |       SE |            95% CI |         z |     p
+      -------------------------------------------------------------------------
+      (Intercept) |   -83.33 | 15505.03 | [       , 816.56] | -5.37e-03 | 0.996
+      gear        |    21.01 |  3876.26 | [-248.93,       ] |  5.42e-03 | 0.996
+    Message
+      
+      Uncertainty intervals (profile-likelihood) and p-values (two-tailed)
+        computed using a Wald z-distribution approximation.
+      
+      The model has a log- or logit-link. Consider using `exponentiate =
+        TRUE` to interpret coefficients as ratios.
+        
+      Note that some coefficients are very large, which may indicate issues
+        with complete separation.
+
+# print warning about quasi complete separation
+
+    Code
+      print(out)
+    Output
+      Parameter   | Log-Odds |    SE |            95% CI |     z |     p
+      ------------------------------------------------------------------
+      (Intercept) |   -56.36 | 21.45 | [-114.76, -25.33] | -2.63 | 0.009
+      qsec        |     3.12 |  1.19 | [   1.40,   6.37] |  2.62 | 0.009
+    Message
+      
+      Uncertainty intervals (profile-likelihood) and p-values (two-tailed)
+        computed using a Wald z-distribution approximation.
+      
+      The model has a log- or logit-link. Consider using `exponentiate =
+        TRUE` to interpret coefficients as ratios.
+        
+      Note that some coefficients are very large, which may indicate issues
+        with (quasi) complete separation. Consider using bias-corrected or
+        penalized regression models.
+
diff --git a/tests/testthat/test-complete_separation.R b/tests/testthat/test-complete_separation.R
@@ -0,0 +1,42 @@
+skip_if(getRversion() < "4.0.0")
+skip_if_not_installed("withr")
+
+withr::with_options(
+  list(parameters_warning_exponentiate = TRUE),
+  {
+    test_that("print warning about complete separation", {
+      d_sep <- data.frame(
+        y = c(0, 0, 0, 0, 1, 1, 1, 1),
+        x1 = c(1, 2, 3, 3, 5, 6, 10, 11),
+        x2 = c(3, 2, -1, -1, 2, 4, 1, 0)
+      )
+      m_sep <- suppressWarnings(glm(y ~ x1 + x2, data = d_sep, family = binomial))
+      out <- model_parameters(m_sep)
+      expect_snapshot(print(out))
+    })
+  }
+)
+
+withr::with_options(
+  list(parameters_warning_exponentiate = TRUE),
+  {
+    test_that("print warning about complete separation", {
+      data(mtcars)
+      m_sep2 <- suppressWarnings(glm(am ~ gear, data = mtcars, family = binomial))
+      out <- model_parameters(m_sep2)
+      expect_snapshot(print(out))
+    })
+  }
+)
+
+withr::with_options(
+  list(parameters_warning_exponentiate = TRUE),
+  {
+    test_that("print warning about quasi complete separation", {
+      data(mtcars)
+      m_sep3 <- suppressWarnings(glm(vs ~ qsec, data = mtcars, family = binomial))
+      out <- model_parameters(m_sep3)
+      expect_snapshot(print(out))
+    })
+  }
+)
diff --git a/tests/testthat/test-model_parameters.cgam.R b/tests/testthat/test-model_parameters.cgam.R
@@ -47,6 +47,7 @@ test_that("model_parameters - cgam", {
       ran_pars = TRUE,
       show_summary = FALSE,
       log_link = FALSE,
+      logit_link = FALSE,
       pretty_labels = c(`(Intercept)` = "(Intercept)"),
       test_statistic = "t-statistic",
       log_response = FALSE,
@@ -140,6 +141,7 @@ test_that("model_parameters - cgamm", {
       ran_pars = TRUE,
       show_summary = FALSE,
       log_link = FALSE,
+      logit_link = FALSE,
       pretty_labels = c(
         `(Intercept)` = "(Intercept)",
         `cgam::s.incr(x)` = "cgam::s.incr(x)"

diff --git a/tests/testthat/test-printing.R b/tests/testthat/test-printing.R
@@ -2,7 +2,7 @@ skip_if_not_installed("withr")
 skip_if(getRversion() < "4.0.0")
 
 withr::with_options(
-  list("parameters_interaction" = "*"),
+  list(parameters_interaction = "*"),
   {
     # Splitting model components ----
     test_that("print model with multiple components", {