From ea2f16f98927c8265b3302eafbe0298484bc5622 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 16:32:57 +0200 Subject: [PATCH 1/6] Default values of `to_numeric()` (#547) Fixes #544 --- R/to_numeric.R | 18 +++++++++--------- man/datawizard-package.Rd | 10 +++++----- man/to_numeric.Rd | 16 ++++++++-------- tests/testthat/_snaps/data_to_numeric.md | 4 ++-- tests/testthat/test-data_to_numeric.R | 8 ++++---- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/R/to_numeric.R b/R/to_numeric.R index e38e12e80..3e75bccbd 100644 --- a/R/to_numeric.R +++ b/R/to_numeric.R @@ -17,11 +17,11 @@ #' @inheritParams extract_column_names #' @inheritParams categorize #' -#' @note By default, `to_numeric()` converts factors into "binary" dummies, i.e. +#' @note When factors should be converted into multiple "binary" dummies, i.e. #' each factor level is converted into a separate column filled with a binary -#' 0-1 value. If only one column is required, use `dummy_factors = FALSE`. If -#' you want to preserve the original factor levels (in case these represent -#' numeric values), use `preserve_levels = TRUE`. +#' 0-1 value, set `dummy_factors = TRUE`. If you want to preserve the original +#' factor levels (in case these represent numeric values), use +#' `preserve_levels = TRUE`. #' #' @section Selection of variables - `select` argument: #' For most functions that have a `select` argument the complete input data @@ -34,12 +34,12 @@ #' #' @examples #' to_numeric(head(ToothGrowth)) -#' to_numeric(head(ToothGrowth), dummy_factors = FALSE) +#' to_numeric(head(ToothGrowth), dummy_factors = TRUE) #' #' # factors #' x <- as.factor(mtcars$gear) -#' to_numeric(x, dummy_factors = FALSE) -#' to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE) +#' to_numeric(x) +#' to_numeric(x, preserve_levels = TRUE) #' # same as: #' coerce_to_numeric(x) #' @@ -69,7 +69,7 @@ to_numeric.default <- function(x, verbose = TRUE, ...) { to_numeric.data.frame <- function(x, select = NULL, exclude = NULL, - dummy_factors = TRUE, + dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, append = FALSE, @@ -191,7 +191,7 @@ to_numeric.POSIXlt <- to_numeric.Date #' @export to_numeric.factor <- function(x, - dummy_factors = TRUE, + dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, verbose = TRUE, diff --git a/man/datawizard-package.Rd b/man/datawizard-package.Rd index db38bc334..d389df6ac 100644 --- a/man/datawizard-package.Rd +++ b/man/datawizard-package.Rd @@ -33,16 +33,16 @@ Useful links: Authors: \itemize{ - \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID}) (@patilindrajeets) - \item Dominique Makowski \email{dom.makowski@gmail.com} (\href{https://orcid.org/0000-0001-5375-9967}{ORCID}) (@Dom_Makowski) - \item Daniel Lüdecke \email{d.luedecke@uke.de} (\href{https://orcid.org/0000-0002-8895-3206}{ORCID}) (@strengejacke) + \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID}) + \item Dominique Makowski \email{dom.makowski@gmail.com} (\href{https://orcid.org/0000-0001-5375-9967}{ORCID}) + \item Daniel Lüdecke \email{d.luedecke@uke.de} (\href{https://orcid.org/0000-0002-8895-3206}{ORCID}) \item Mattan S. Ben-Shachar \email{matanshm@post.bgu.ac.il} (\href{https://orcid.org/0000-0002-4287-4801}{ORCID}) - \item Brenton M. Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID}) (@bmwiernik) + \item Brenton M. Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID}) } Other contributors: \itemize{ - \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) (@rempsyc) [contributor] + \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) [contributor] \item Thomas J. Faulkenberry \email{faulkenberry@tarleton.edu} [reviewer] \item Robert Garrett \email{rcg4@illinois.edu} [reviewer] } diff --git a/man/to_numeric.Rd b/man/to_numeric.Rd index 7478c9579..634906e4a 100644 --- a/man/to_numeric.Rd +++ b/man/to_numeric.Rd @@ -11,7 +11,7 @@ to_numeric(x, ...) x, select = NULL, exclude = NULL, - dummy_factors = TRUE, + dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, append = FALSE, @@ -107,11 +107,11 @@ either numeric levels or dummy variables. The "counterpart" to convert variables into factors is \code{to_factor()}. } \note{ -By default, \code{to_numeric()} converts factors into "binary" dummies, i.e. +When factors should be converted into multiple "binary" dummies, i.e. each factor level is converted into a separate column filled with a binary -0-1 value. If only one column is required, use \code{dummy_factors = FALSE}. If -you want to preserve the original factor levels (in case these represent -numeric values), use \code{preserve_levels = TRUE}. +0-1 value, set \code{dummy_factors = TRUE}. If you want to preserve the original +factor levels (in case these represent numeric values), use +\code{preserve_levels = TRUE}. } \section{Selection of variables - \code{select} argument}{ @@ -126,12 +126,12 @@ to also include the original variables in the returned data frame. \examples{ to_numeric(head(ToothGrowth)) -to_numeric(head(ToothGrowth), dummy_factors = FALSE) +to_numeric(head(ToothGrowth), dummy_factors = TRUE) # factors x <- as.factor(mtcars$gear) -to_numeric(x, dummy_factors = FALSE) -to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE) +to_numeric(x) +to_numeric(x, preserve_levels = TRUE) # same as: coerce_to_numeric(x) diff --git a/tests/testthat/_snaps/data_to_numeric.md b/tests/testthat/_snaps/data_to_numeric.md index 42cb00b67..e963890a5 100644 --- a/tests/testthat/_snaps/data_to_numeric.md +++ b/tests/testthat/_snaps/data_to_numeric.md @@ -1,7 +1,7 @@ # convert data frame to numeric Code - to_numeric(head(ToothGrowth)) + to_numeric(head(ToothGrowth), dummy_factors = TRUE) Output len supp.OJ supp.VC dose 1 4.2 0 1 0.5 @@ -27,7 +27,7 @@ # convert factor to numeric Code - to_numeric(f) + to_numeric(f, dummy_factors = TRUE) Output a c i s t 1 0 0 0 1 0 diff --git a/tests/testthat/test-data_to_numeric.R b/tests/testthat/test-data_to_numeric.R index 464c35e8d..816591ac0 100644 --- a/tests/testthat/test-data_to_numeric.R +++ b/tests/testthat/test-data_to_numeric.R @@ -1,5 +1,5 @@ test_that("convert data frame to numeric", { - expect_snapshot(to_numeric(head(ToothGrowth))) + expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = TRUE)) expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = FALSE)) }) @@ -41,7 +41,7 @@ test_that("convert character to numeric lowest", { test_that("convert factor to numeric", { f <- factor(substring("statistics", 1:10, 1:10)) - expect_snapshot(to_numeric(f)) + expect_snapshot(to_numeric(f, dummy_factors = TRUE)) }) test_that("convert factor to numeric", { @@ -67,12 +67,12 @@ test_that("convert factor to numeric, dummy factors", { test_that("convert factor to numeric, append", { data(efc) expect_identical( - colnames(to_numeric(efc)), + colnames(to_numeric(efc, dummy_factors = TRUE)), c("c12hour", "e16sex", "e42dep.1", "e42dep.2", "e42dep.3", "e42dep.4", "c172code", "neg_c_7"), ignore_attr = TRUE ) expect_identical( - colnames(to_numeric(efc, append = TRUE)), + colnames(to_numeric(efc, dummy_factors = TRUE, append = TRUE)), c( "c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_n", "e42dep_n.1", "e42dep_n.2", "e42dep_n.3", "e42dep_n.4" From 17b34f9dd40dc3bbd8fe8ff7f21c638141c481ac Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 16:37:58 +0200 Subject: [PATCH 2/6] update news --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 97425d031..202d45425 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,9 @@ BREAKING CHANGES * Removed deprecated arguments `group` and `na.rm` in multiple functions. Use `by` and `remove_na` instead (#546). +* The default value for the argument `dummy_factors` in `to_numeric()` has + changed from `TRUE` to `FALSE` (#544). + CHANGES * The `pattern` argument in `data_rename()` can also be a named vector. In this From 4914b96fd10aed63edecf14d4b814382e108a30f Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 17:58:20 +0200 Subject: [PATCH 3/6] update news --- NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS.md b/NEWS.md index 202d45425..f2ade5883 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,13 @@ CHANGES case, names are used as values for the `replacement` argument (i.e. `pattern` can be a character vector using ` = ""`). +* `categorize()` gains a new `breaks` argument, to decide whether breaks are + inclusive or exclusive (#548). + +* The `labels` argument in `categorize()` gets two new options, `"range"` and + `"observed"`, to use the range of categorized values as labels (i.e. factor + levels) (#548). + * Minor additions to `reshape_ci()` to work with forthcoming changes in the `{bayestestR}` package. From 77d67f4dcfae2929de382687943e80a380272ab0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 18:03:09 +0200 Subject: [PATCH 4/6] categorize() add labels="range" option (#549) * categorize() add labels="range" option Fixes #548 * add test, docs * lintr * add option to decide on exclusive/inclusive breaks * docs --- R/categorize.R | 69 ++++++++++++++++++++++------- man/categorize.Rd | 27 +++++++++-- tests/testthat/_snaps/categorize.md | 47 ++++++++++++++++++++ tests/testthat/test-categorize.R | 24 ++++++++-- 4 files changed, 144 insertions(+), 23 deletions(-) create mode 100644 tests/testthat/_snaps/categorize.md diff --git a/R/categorize.R b/R/categorize.R index a6562ab68..9f8dd7505 100644 --- a/R/categorize.R +++ b/R/categorize.R @@ -31,10 +31,18 @@ #' for numeric variables, the minimum of the original input is preserved. For #' factors, the default minimum is `1`. For `split = "equal_range"`, the #' default minimum is always `1`, unless specified otherwise in `lowest`. +#' @param breaks Character, indicating whether breaks for categorizing data are +#' `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or +#' interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_ +#' group or interval to begin). Use `labels = "range"` to make this behaviour +#' easier to see. #' @param labels Character vector of value labels. If not `NULL`, `categorize()` #' will returns factors instead of numeric variables, with `labels` used -#' for labelling the factor levels. Can also be `"mean"` or `"median"` for a -#' factor with labels as the mean/median of each groups. +#' for labelling the factor levels. Can also be `"mean"`, `"median"`, +#' `"range"` or `"observed"` for a factor with labels as the mean/median, +#' the requested range (even if not all values of that range are present in +#' the data) or observed range (range of the actual recoded values) of each +#' group. See 'Examples'. #' @param append Logical or string. If `TRUE`, recoded or converted variables #' get new column names and are appended (column bind) to `x`, thus returning #' both the original and the recoded variables. The new columns get a suffix, @@ -53,7 +61,7 @@ #' #' # Splits and breaks (cut-off values) #' -#' Breaks are in general _exclusive_, this means that these values indicate +#' Breaks are by default _exclusive_, this means that these values indicate #' the lower bound of the next group or interval to begin. Take a simple #' example, a numeric variable with values from 1 to 9. The median would be 5, #' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9 @@ -63,6 +71,9 @@ #' from 1 to 3 belong to the first interval and are recoded into 1 (because #' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3. #' +#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which +#' case +#' #' # Recoding into groups with equal size or range #' #' `split = "equal_length"` and `split = "equal_range"` try to divide the @@ -119,6 +130,13 @@ #' x <- sample(1:10, size = 30, replace = TRUE) #' categorize(x, "equal_length", n_groups = 3, labels = "mean") #' categorize(x, "equal_length", n_groups = 3, labels = "median") +#' +#' # cut numeric into groups with the requested range as a label name +#' # each category has the same range, and labels indicate this range +#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") +#' # in this example, each category has the same range, but labels only refer +#' # to the ranges of the actual values (present in the data) inside each group +#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") #' @export categorize <- function(x, ...) { UseMethod("categorize") @@ -142,6 +160,7 @@ categorize.numeric <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, verbose = TRUE, ...) { @@ -152,6 +171,9 @@ categorize.numeric <- function(x, if (identical(split, "equal_length")) split <- "length" if (identical(split, "equal_range")) split <- "range" + # check for valid values + breaks <- match.arg(breaks, c("exclusive", "inclusive")) + # save original_x <- x @@ -169,9 +191,9 @@ categorize.numeric <- function(x, } if (is.numeric(split)) { - breaks <- split + category_splits <- split } else { - breaks <- switch(split, + category_splits <- switch(split, median = stats::median(x), mean = mean(x), length = n_groups, @@ -182,15 +204,18 @@ categorize.numeric <- function(x, } # complete ranges, including minimum and maximum - if (!identical(split, "length")) breaks <- unique(c(min(x), breaks, max(x))) + if (!identical(split, "length")) { + category_splits <- unique(c(min(x), category_splits, max(x))) + } # recode into groups out <- droplevels(cut( x, - breaks = breaks, + breaks = category_splits, include.lowest = TRUE, - right = FALSE + right = identical(breaks, "inclusive") )) + cut_result <- out levels(out) <- 1:nlevels(out) # fix lowest value, add back into original vector @@ -201,7 +226,7 @@ categorize.numeric <- function(x, original_x[!is.na(original_x)] <- out # turn into factor? - .original_x_to_factor(original_x, x, labels, out, verbose, ...) + .original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...) } @@ -223,6 +248,7 @@ categorize.data.frame <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -260,6 +286,7 @@ categorize.data.frame <- function(x, n_groups = n_groups, range = range, lowest = lowest, + breaks = breaks, labels = labels, verbose = verbose, ... @@ -276,6 +303,7 @@ categorize.grouped_df <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -319,6 +347,7 @@ categorize.grouped_df <- function(x, n_groups = n_groups, range = range, lowest = lowest, + breaks = breaks, labels = labels, select = select, exclude = exclude, @@ -375,20 +404,26 @@ categorize.grouped_df <- function(x, } -.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) { +.original_x_to_factor <- function(original_x, x, cut_result, labels, out, verbose, ...) { if (!is.null(labels)) { if (length(labels) == length(unique(out))) { original_x <- as.factor(original_x) levels(original_x) <- labels - } else if (length(labels) == 1 && labels %in% c("mean", "median")) { + } else if (length(labels) == 1 && labels %in% c("mean", "median", "range", "observed")) { original_x <- as.factor(original_x) no_na_x <- original_x[!is.na(original_x)] - if (labels == "mean") { - labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x - } else { - labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x - } - levels(original_x) <- insight::format_value(labels, ...) + out <- switch(labels, + mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x, + median = stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x, + # labels basically like what "cut()" returns + range = levels(cut_result), + # range based on the values that are actually present in the data + { + temp <- stats::aggregate(x, list(no_na_x), FUN = range, na.rm = TRUE)$x + apply(temp, 1, function(i) paste0("(", paste(as.vector(i), collapse = "-"), ")")) + } + ) + levels(original_x) <- insight::format_value(out, ...) } else if (isTRUE(verbose)) { insight::format_warning( "Argument `labels` and levels of the recoded variable are not of the same length.", diff --git a/man/categorize.Rd b/man/categorize.Rd index 28f823dd4..ca013ce2b 100644 --- a/man/categorize.Rd +++ b/man/categorize.Rd @@ -14,6 +14,7 @@ categorize(x, ...) n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, verbose = TRUE, ... @@ -27,6 +28,7 @@ categorize(x, ...) n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -67,10 +69,19 @@ for numeric variables, the minimum of the original input is preserved. For factors, the default minimum is \code{1}. For \code{split = "equal_range"}, the default minimum is always \code{1}, unless specified otherwise in \code{lowest}.} +\item{breaks}{Character, indicating whether breaks for categorizing data are +\code{"inclusive"} (values indicate the \emph{upper} bound of the \emph{previous} group or +interval) or \code{"exclusive"} (values indicate the \emph{lower} bound of the \emph{next} +group or interval to begin). Use \code{labels = "range"} to make this behaviour +easier to see.} + \item{labels}{Character vector of value labels. If not \code{NULL}, \code{categorize()} will returns factors instead of numeric variables, with \code{labels} used -for labelling the factor levels. Can also be \code{"mean"} or \code{"median"} for a -factor with labels as the mean/median of each groups.} +for labelling the factor levels. Can also be \code{"mean"}, \code{"median"}, +\code{"range"} or \code{"observed"} for a factor with labels as the mean/median, +the requested range (even if not all values of that range are present in +the data) or observed range (range of the actual recoded values) of each +group. See 'Examples'.} \item{verbose}{Toggle warnings.} @@ -145,7 +156,7 @@ It is basically a wrapper around base R's \code{cut()}, providing a simplified and more accessible way to define the interval breaks (cut-off values). } \section{Splits and breaks (cut-off values)}{ -Breaks are in general \emph{exclusive}, this means that these values indicate +Breaks are by default \emph{exclusive}, this means that these values indicate the lower bound of the next group or interval to begin. Take a simple example, a numeric variable with values from 1 to 9. The median would be 5, thus the first interval ranges from 1-4 and is recoded into 1, while 5-9 @@ -154,6 +165,9 @@ using \code{split = "quantile"} and \code{n_groups = 3} would define breaks at 3 and 6.33 (see \code{quantile(1:9, probs = c(1/3, 2/3))}), which means that values from 1 to 3 belong to the first interval and are recoded into 1 (because the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3. + +The opposite behaviour can be achieved using \code{breaks = "inclusive"}, in which +case } \section{Recoding into groups with equal size or range}{ @@ -217,6 +231,13 @@ categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")) x <- sample(1:10, size = 30, replace = TRUE) categorize(x, "equal_length", n_groups = 3, labels = "mean") categorize(x, "equal_length", n_groups = 3, labels = "median") + +# cut numeric into groups with the requested range as a label name +# each category has the same range, and labels indicate this range +categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") +# in this example, each category has the same range, but labels only refer +# to the ranges of the actual values (present in the data) inside each group +categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") } \seealso{ \itemize{ diff --git a/tests/testthat/_snaps/categorize.md b/tests/testthat/_snaps/categorize.md new file mode 100644 index 000000000..d08c14c4d --- /dev/null +++ b/tests/testthat/_snaps/categorize.md @@ -0,0 +1,47 @@ +# categorize labelling ranged + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5) + Output + [1] 3 3 3 3 2 2 1 3 3 2 2 2 2 2 1 1 1 5 5 5 3 2 2 1 2 4 4 5 2 2 1 3 + +--- + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") + Output + [1] [19.8,24.5) [19.8,24.5) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) + [7] [10.4,15.1) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) [15.1,19.8) + [13] [15.1,19.8) [15.1,19.8) [10.4,15.1) [10.4,15.1) [10.4,15.1) [29.2,33.9] + [19] [29.2,33.9] [29.2,33.9] [19.8,24.5) [15.1,19.8) [15.1,19.8) [10.4,15.1) + [25] [15.1,19.8) [24.5,29.2) [24.5,29.2) [29.2,33.9] [15.1,19.8) [15.1,19.8) + [31] [10.4,15.1) [19.8,24.5) + Levels: [10.4,15.1) [15.1,19.8) [19.8,24.5) [24.5,29.2) [29.2,33.9] + +--- + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") + Output + [1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) + [7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7) + [13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9) + [19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15) + [25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7) + [31] (10.4-15) (21-24.4) + Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) + +--- + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", + breaks = "inclusive") + Output + [1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) + [7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7) + [13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9) + [19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15) + [25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7) + [31] (10.4-15) (21-24.4) + Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) + diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R index 0e0b5d317..9ab8eadde 100644 --- a/tests/testthat/test-categorize.R +++ b/tests/testthat/test-categorize.R @@ -1,5 +1,5 @@ set.seed(123) -d <- sample(1:10, size = 500, replace = TRUE) +d <- sample.int(10, size = 500, replace = TRUE) test_that("recode median", { expect_identical(categorize(d), ifelse(d >= median(d), 2, 1)) @@ -22,7 +22,7 @@ test_that("recode quantile", { }) set.seed(123) -d <- sample(1:100, size = 1000, replace = TRUE) +d <- sample.int(100, size = 1000, replace = TRUE) test_that("recode range", { expect_error(categorize(d, split = "range")) @@ -84,7 +84,7 @@ test_that("recode length", { }) set.seed(123) -x <- sample(1:10, size = 30, replace = TRUE) +x <- sample.int(10, size = 30, replace = TRUE) test_that("recode factor labels", { expect_type(categorize(x, "equal_length", n_groups = 3), "double") expect_s3_class(categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")), "factor") @@ -232,3 +232,21 @@ test_that("categorize regex", { categorize(mtcars, select = "mpg") ) }) + + +# select helpers ------------------------------ +test_that("categorize labelling ranged", { + data(mtcars) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5)) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")) +}) + +test_that("categorize breaks", { + data(mtcars) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", breaks = "inclusive")) + expect_error( + categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"), + regex = "should be one of" + ) +}) From 41aec02ed50f78865d6b9083563c9fa0dc2843d9 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 18:06:36 +0200 Subject: [PATCH 5/6] typo in comment --- tests/testthat/test-categorize.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R index 9ab8eadde..217797893 100644 --- a/tests/testthat/test-categorize.R +++ b/tests/testthat/test-categorize.R @@ -234,7 +234,7 @@ test_that("categorize regex", { }) -# select helpers ------------------------------ +# labelling ranges ------------------------------ test_that("categorize labelling ranged", { data(mtcars) expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5)) From be756b644b5431bf85c969ff69dac4ac4be7cf6a Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 18:09:56 +0200 Subject: [PATCH 6/6] add correct snapshot test --- tests/testthat/_snaps/categorize.md | 19 +++++++++---------- tests/testthat/test-categorize.R | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/testthat/_snaps/categorize.md b/tests/testthat/_snaps/categorize.md index d08c14c4d..9ed3c1115 100644 --- a/tests/testthat/_snaps/categorize.md +++ b/tests/testthat/_snaps/categorize.md @@ -31,17 +31,16 @@ [31] (10.4-15) (21-24.4) Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) ---- +# categorize breaks Code - categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", - breaks = "inclusive") + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive") Output - [1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) - [7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7) - [13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9) - [19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15) - [25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7) - [31] (10.4-15) (21-24.4) - Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) + [1] (19.8,24.5] (19.8,24.5] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8] + [7] [10.4,15.1] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8] (15.1,19.8] + [13] (15.1,19.8] (15.1,19.8] [10.4,15.1] [10.4,15.1] [10.4,15.1] (29.2,33.9] + [19] (29.2,33.9] (29.2,33.9] (19.8,24.5] (15.1,19.8] (15.1,19.8] [10.4,15.1] + [25] (15.1,19.8] (24.5,29.2] (24.5,29.2] (29.2,33.9] (15.1,19.8] (15.1,19.8] + [31] [10.4,15.1] (19.8,24.5] + Levels: [10.4,15.1] (15.1,19.8] (19.8,24.5] (24.5,29.2] (29.2,33.9] diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R index 217797893..30453d9ad 100644 --- a/tests/testthat/test-categorize.R +++ b/tests/testthat/test-categorize.R @@ -244,7 +244,7 @@ test_that("categorize labelling ranged", { test_that("categorize breaks", { data(mtcars) - expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", breaks = "inclusive")) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive")) expect_error( categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"), regex = "should be one of"