From f9cc36bb2a34fc02997b833c24e241e97f65984c Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 31 May 2024 08:27:28 +0200 Subject: [PATCH] Check for misspelled colnames in `report_sample()` (#434) --- DESCRIPTION | 3 +- NEWS.md | 1 + R/report_sample.R | 6 +++ R/utils_misspelled_variables.R | 75 +++++++++++++++++++++++++++++ tests/testthat/test-report_sample.R | 2 + 5 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 R/utils_misspelled_variables.R diff --git a/DESCRIPTION b/DESCRIPTION index 7ad0c624..5fbfc3dd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: report Type: Package Title: Automated Reporting of Results and Statistical Models -Version: 0.5.8.3 +Version: 0.5.8.4 Authors@R: c(person(given = "Dominique", family = "Makowski", @@ -148,6 +148,7 @@ Collate: 'report_table.R' 'utils_error_message.R' 'utils_grouped_df.R' + 'utils_misspelled_variables.R' 'zzz.R' Roxygen: list(markdown = TRUE) Remotes: easystats/insight, easystats/datawizard, easystats/parameters, easystats/performance, easystats/modelbased diff --git a/NEWS.md b/NEWS.md index 1d5f9e21..2693e9bf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,7 @@ Minor changes * `report` now supports reporting of Bayesian model comparison with variables of class `brms::loo_compare`. * `report` now supports reporting of BayesFactor objects with variables of class `BFBayesFactor`. +* `report_sample()` now suggests valid column names for misspelled columns in the `select`, `by`, `weights` and `exclude` arguments. 
# report 0.5.8 diff --git a/R/report_sample.R b/R/report_sample.R index 41d6a8f8..83b95115 100644 --- a/R/report_sample.R +++ b/R/report_sample.R @@ -114,6 +114,12 @@ report_sample <- function(data, select <- colnames(data)[select] } + # sanity check for existing columns + .check_spelling(data, select) + .check_spelling(data, exclude) + .check_spelling(data, by) + .check_spelling(data, weights) + # variables to keep if (!is.null(weights)) { select <- unique(c(select, weights)) diff --git a/R/utils_misspelled_variables.R b/R/utils_misspelled_variables.R new file mode 100644 index 00000000..6537cf85 --- /dev/null +++ b/R/utils_misspelled_variables.R @@ -0,0 +1,75 @@ +# Raises an error if any name in "select" is not a column of "data". The name +# of the checked argument is taken from the call via deparse(substitute()). +.check_spelling <- function(data, select) { + wrong_arg <- paste0("specified in `", deparse(substitute(select)), "` ") + if (!is.null(select) && all(nzchar(select)) && !all(select %in% colnames(data))) { + not_found <- select[!select %in% colnames(data)] + insight::format_error( + paste0( + sprintf("The following column(s) %sdon't exist in the dataset: ", wrong_arg), + datawizard::text_concatenate(not_found), "." 
+ ), + .misspelled_string(colnames(data), not_found, "Possibly misspelled?") + ) + } +} + + +#' Fuzzy grep, matches patterns that are close, but not identical +#' @examples +#' colnames(iris) +#' p <- sprintf("(%s){~%i}", "Spela", 2) +#' grep(pattern = p, x = colnames(iris), ignore.case = FALSE) +#' @keywords internal +#' @noRd +.fuzzy_grep <- function(x, pattern, precision = NULL) { + if (is.null(precision)) { + precision <- round(nchar(pattern) / 3) + } + if (precision > nchar(pattern)) { + return(NULL) + } + p <- sprintf("(%s){~%i}", pattern, precision) + grep(pattern = p, x = x, ignore.case = FALSE) +} + + +#' create a message string to tell user about matches that could possibly +#' be the string they were looking for +#' +#' @keywords internal +#' @noRd +.misspelled_string <- function(source, searchterm, default_message = NULL) { + if (is.null(searchterm) || length(searchterm) < 1) { + return(default_message) + } + # used for many matches + more_found <- "" + # init default + msg <- "" + # fuzzy matches for every search term, each suggestion listed only once + possible_strings <- unique(unlist(lapply(searchterm, function(s) { + source[.fuzzy_grep(source, s)] # nolint + }), use.names = FALSE)) + if (length(possible_strings) > 0) { + msg <- "Did you mean " + if (length(possible_strings) > 1) { + # make sure we don't print dozens of alternatives for larger data frames + if (length(possible_strings) > 5) { + more_found <- sprintf( + " We even found %i more possible matches, not shown here.", + length(possible_strings) - 5 + ) + possible_strings <- possible_strings[1:5] + } + msg <- paste0(msg, "one of ", datawizard::text_concatenate(possible_strings, enclose = "\"", last = " or ")) + } else { + msg <- paste0(msg, "\"", possible_strings, "\"") + } + msg <- paste0(msg, "?", more_found) + } else { + msg <- default_message + } + # no double white space + insight::trim_ws(msg) +} diff --git a/tests/testthat/test-report_sample.R b/tests/testthat/test-report_sample.R index a012f969..c3d99e35 100644 --- 
a/tests/testthat/test-report_sample.R +++ b/tests/testthat/test-report_sample.R @@ -51,6 +51,8 @@ test_that("report_sample check input", { data(iris) expect_error(report_sample(lm(Sepal.Length ~ Species, data = iris))) expect_silent(report_sample(iris$Species)) + expect_error(report_sample(iris, by = "Spedies"), regexp = "The following column") + expect_error(report_sample(iris, select = "Spedies"), regexp = "The following column") }) test_that("report_sample default", {