From ea2f16f98927c8265b3302eafbe0298484bc5622 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Wed, 2 Oct 2024 16:32:57 +0200
Subject: [PATCH 1/6] Default values of `to_numeric()` (#547)

Fixes #544
---
 R/to_numeric.R                           | 18 +++++++++---------
 man/datawizard-package.Rd                | 10 +++++-----
 man/to_numeric.Rd                        | 16 ++++++++--------
 tests/testthat/_snaps/data_to_numeric.md |  4 ++--
 tests/testthat/test-data_to_numeric.R    |  8 ++++----
 5 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/R/to_numeric.R b/R/to_numeric.R
index e38e12e80..3e75bccbd 100644
--- a/R/to_numeric.R
+++ b/R/to_numeric.R
@@ -17,11 +17,11 @@
 #' @inheritParams extract_column_names
 #' @inheritParams categorize
 #'
-#' @note By default, `to_numeric()` converts factors into "binary" dummies, i.e.
+#' @note When factors should be converted into multiple "binary" dummies, i.e.
 #' each factor level is converted into a separate column filled with a binary
-#' 0-1 value. If only one column is required, use `dummy_factors = FALSE`. If
-#' you want to preserve the original factor levels (in case these represent
-#' numeric values), use `preserve_levels = TRUE`.
+#' 0-1 value, set `dummy_factors = TRUE`. If you want to preserve the original
+#' factor levels (in case these represent numeric values), use
+#' `preserve_levels = TRUE`.
 #'
 #' @section Selection of variables - `select` argument:
 #' For most functions that have a `select` argument the complete input data
@@ -34,12 +34,12 @@
 #'
 #' @examples
 #' to_numeric(head(ToothGrowth))
-#' to_numeric(head(ToothGrowth), dummy_factors = FALSE)
+#' to_numeric(head(ToothGrowth), dummy_factors = TRUE)
 #'
 #' # factors
 #' x <- as.factor(mtcars$gear)
-#' to_numeric(x, dummy_factors = FALSE)
-#' to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE)
+#' to_numeric(x)
+#' to_numeric(x, preserve_levels = TRUE)
 #' # same as:
 #' coerce_to_numeric(x)
 #'
@@ -69,7 +69,7 @@ to_numeric.default <- function(x, verbose = TRUE, ...) {
 to_numeric.data.frame <- function(x,
                                   select = NULL,
                                   exclude = NULL,
-                                  dummy_factors = TRUE,
+                                  dummy_factors = FALSE,
                                   preserve_levels = FALSE,
                                   lowest = NULL,
                                   append = FALSE,
@@ -191,7 +191,7 @@ to_numeric.POSIXlt <- to_numeric.Date
 
 #' @export
 to_numeric.factor <- function(x,
-                              dummy_factors = TRUE,
+                              dummy_factors = FALSE,
                               preserve_levels = FALSE,
                               lowest = NULL,
                               verbose = TRUE,
diff --git a/man/datawizard-package.Rd b/man/datawizard-package.Rd
index db38bc334..d389df6ac 100644
--- a/man/datawizard-package.Rd
+++ b/man/datawizard-package.Rd
@@ -33,16 +33,16 @@ Useful links:
 
 Authors:
 \itemize{
-  \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID}) (@patilindrajeets)
-  \item Dominique Makowski \email{dom.makowski@gmail.com} (\href{https://orcid.org/0000-0001-5375-9967}{ORCID}) (@Dom_Makowski)
-  \item Daniel Lüdecke \email{d.luedecke@uke.de} (\href{https://orcid.org/0000-0002-8895-3206}{ORCID}) (@strengejacke)
+  \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID})
+  \item Dominique Makowski \email{dom.makowski@gmail.com} (\href{https://orcid.org/0000-0001-5375-9967}{ORCID})
+  \item Daniel Lüdecke \email{d.luedecke@uke.de} (\href{https://orcid.org/0000-0002-8895-3206}{ORCID})
   \item Mattan S. Ben-Shachar \email{matanshm@post.bgu.ac.il} (\href{https://orcid.org/0000-0002-4287-4801}{ORCID})
-  \item Brenton M. Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID}) (@bmwiernik)
+  \item Brenton M. Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID})
 }
 
 Other contributors:
 \itemize{
-  \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) (@rempsyc) [contributor]
+  \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) [contributor]
   \item Thomas J. Faulkenberry \email{faulkenberry@tarleton.edu} [reviewer]
   \item Robert Garrett \email{rcg4@illinois.edu} [reviewer]
 }
diff --git a/man/to_numeric.Rd b/man/to_numeric.Rd
index 7478c9579..634906e4a 100644
--- a/man/to_numeric.Rd
+++ b/man/to_numeric.Rd
@@ -11,7 +11,7 @@ to_numeric(x, ...)
   x,
   select = NULL,
   exclude = NULL,
-  dummy_factors = TRUE,
+  dummy_factors = FALSE,
   preserve_levels = FALSE,
   lowest = NULL,
   append = FALSE,
@@ -107,11 +107,11 @@ either numeric levels or dummy variables. The "counterpart" to convert
 variables into factors is \code{to_factor()}.
 }
 \note{
-By default, \code{to_numeric()} converts factors into "binary" dummies, i.e.
+When factors should be converted into multiple "binary" dummies, i.e.
 each factor level is converted into a separate column filled with a binary
-0-1 value. If only one column is required, use \code{dummy_factors = FALSE}. If
-you want to preserve the original factor levels (in case these represent
-numeric values), use \code{preserve_levels = TRUE}.
+0-1 value, set \code{dummy_factors = TRUE}. If you want to preserve the original
+factor levels (in case these represent numeric values), use
+\code{preserve_levels = TRUE}.
 }
 \section{Selection of variables - \code{select} argument}{
 
@@ -126,12 +126,12 @@ to also include the original variables in the returned data frame.
 
 \examples{
 to_numeric(head(ToothGrowth))
-to_numeric(head(ToothGrowth), dummy_factors = FALSE)
+to_numeric(head(ToothGrowth), dummy_factors = TRUE)
 
 # factors
 x <- as.factor(mtcars$gear)
-to_numeric(x, dummy_factors = FALSE)
-to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE)
+to_numeric(x)
+to_numeric(x, preserve_levels = TRUE)
 # same as:
 coerce_to_numeric(x)
 
diff --git a/tests/testthat/_snaps/data_to_numeric.md b/tests/testthat/_snaps/data_to_numeric.md
index 42cb00b67..e963890a5 100644
--- a/tests/testthat/_snaps/data_to_numeric.md
+++ b/tests/testthat/_snaps/data_to_numeric.md
@@ -1,7 +1,7 @@
 # convert data frame to numeric
 
     Code
-      to_numeric(head(ToothGrowth))
+      to_numeric(head(ToothGrowth), dummy_factors = TRUE)
     Output
          len supp.OJ supp.VC dose
       1  4.2       0       1  0.5
@@ -27,7 +27,7 @@
 # convert factor to numeric
 
     Code
-      to_numeric(f)
+      to_numeric(f, dummy_factors = TRUE)
     Output
          a c i s t
       1  0 0 0 1 0
diff --git a/tests/testthat/test-data_to_numeric.R b/tests/testthat/test-data_to_numeric.R
index 464c35e8d..816591ac0 100644
--- a/tests/testthat/test-data_to_numeric.R
+++ b/tests/testthat/test-data_to_numeric.R
@@ -1,5 +1,5 @@
 test_that("convert data frame to numeric", {
-  expect_snapshot(to_numeric(head(ToothGrowth)))
+  expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = TRUE))
   expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = FALSE))
 })
 
@@ -41,7 +41,7 @@ test_that("convert character to numeric lowest", {
 
 test_that("convert factor to numeric", {
   f <- factor(substring("statistics", 1:10, 1:10))
-  expect_snapshot(to_numeric(f))
+  expect_snapshot(to_numeric(f, dummy_factors = TRUE))
 })
 
 test_that("convert factor to numeric", {
@@ -67,12 +67,12 @@ test_that("convert factor to numeric, dummy factors", {
 test_that("convert factor to numeric, append", {
   data(efc)
   expect_identical(
-    colnames(to_numeric(efc)),
+    colnames(to_numeric(efc, dummy_factors = TRUE)),
     c("c12hour", "e16sex", "e42dep.1", "e42dep.2", "e42dep.3", "e42dep.4", "c172code", "neg_c_7"),
     ignore_attr = TRUE
   )
   expect_identical(
-    colnames(to_numeric(efc, append = TRUE)),
+    colnames(to_numeric(efc, dummy_factors = TRUE, append = TRUE)),
     c(
       "c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_n",
       "e42dep_n.1", "e42dep_n.2", "e42dep_n.3", "e42dep_n.4"

From 17b34f9dd40dc3bbd8fe8ff7f21c638141c481ac Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Wed, 2 Oct 2024 16:37:58 +0200
Subject: [PATCH 2/6] update news

---
 NEWS.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 97425d031..202d45425 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,9 @@ BREAKING CHANGES
 
 * Removed deprecated arguments `group` and `na.rm` in multiple functions. Use `by` and `remove_na` instead (#546).
 
+* The default value for the argument `dummy_factors` in `to_numeric()` has
+  changed from `TRUE` to `FALSE` (#544).
+
 CHANGES
 
 * The `pattern` argument in `data_rename()` can also be a named vector. In this

From 4914b96fd10aed63edecf14d4b814382e108a30f Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Wed, 2 Oct 2024 17:58:20 +0200
Subject: [PATCH 3/6] update news

---
 NEWS.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 202d45425..f2ade5883 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -18,6 +18,13 @@ CHANGES
   case, names are used as values for the `replacement` argument (i.e. `pattern`
   can be a character vector using `<new name> = "<old name>"`).
 
+* `categorize()` gains a new `breaks` argument, to decide whether breaks are
+  inclusive or exclusive (#548).
+
+* The `labels` argument in `categorize()` gets two new options, `"range"` and
+  `"observed"`, to use the range of categorized values as labels (i.e. factor
+  levels) (#548).
+
 * Minor additions to `reshape_ci()` to work with forthcoming changes in the
   `{bayestestR}` package.
 

From 77d67f4dcfae2929de382687943e80a380272ab0 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Wed, 2 Oct 2024 18:03:09 +0200
Subject: [PATCH 4/6] categorize() add labels="range" option (#549)

* categorize() add labels="range" option
Fixes #548

* add test, docs

* lintr

* add option to decide on exclusive/inclusive breaks

* docs
---
 R/categorize.R                      | 69 ++++++++++++++++++++++-------
 man/categorize.Rd                   | 27 +++++++++--
 tests/testthat/_snaps/categorize.md | 47 ++++++++++++++++++++
 tests/testthat/test-categorize.R    | 24 ++++++++--
 4 files changed, 144 insertions(+), 23 deletions(-)
 create mode 100644 tests/testthat/_snaps/categorize.md

diff --git a/R/categorize.R b/R/categorize.R
index a6562ab68..9f8dd7505 100644
--- a/R/categorize.R
+++ b/R/categorize.R
@@ -31,10 +31,18 @@
 #'   for numeric variables, the minimum of the original input is preserved. For
 #'   factors, the default minimum is `1`. For `split = "equal_range"`, the
 #'   default minimum is always `1`, unless specified otherwise in `lowest`.
+#' @param breaks Character, indicating whether breaks for categorizing data are
+#'   `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or
+#'   interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_
+#'   group or interval to begin). Use `labels = "range"` to make this behaviour
+#'   easier to see.
 #' @param labels Character vector of value labels. If not `NULL`, `categorize()`
 #'   will returns factors instead of numeric variables, with `labels` used
-#'   for labelling the factor levels. Can also be `"mean"` or `"median"` for a
-#'   factor with labels as the mean/median of each groups.
+#'   for labelling the factor levels. Can also be `"mean"`, `"median"`,
+#'   `"range"` or `"observed"` for a factor with labels as the mean/median,
+#'   the requested range (even if not all values of that range are present in
+#'   the data) or observed range (range of the actual recoded values) of each
+#'   group. See 'Examples'.
 #' @param append Logical or string. If `TRUE`, recoded or converted variables
 #'   get new column names and are appended (column bind) to `x`, thus returning
 #'   both the original and the recoded variables. The new columns get a suffix,
@@ -53,7 +61,7 @@
 #'
 #' # Splits and breaks (cut-off values)
 #'
-#' Breaks are in general _exclusive_, this means that these values indicate
+#' Breaks are by default _exclusive_, this means that these values indicate
 #' the lower bound of the next group or interval to begin. Take a simple
 #' example, a numeric variable with values from 1 to 9. The median would be 5,
 #' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
@@ -63,6 +71,9 @@
 #' from 1 to 3 belong to the first interval and are recoded into 1 (because
 #' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
 #'
+#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which
+#' case
+#'
 #' # Recoding into groups with equal size or range
 #'
 #' `split = "equal_length"` and `split = "equal_range"` try to divide the
@@ -119,6 +130,13 @@
 #' x <- sample(1:10, size = 30, replace = TRUE)
 #' categorize(x, "equal_length", n_groups = 3, labels = "mean")
 #' categorize(x, "equal_length", n_groups = 3, labels = "median")
+#'
+#' # cut numeric into groups with the requested range as a label name
+#' # each category has the same range, and labels indicate this range
+#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
+#' # in this example, each category has the same range, but labels only refer
+#' # to the ranges of the actual values (present in the data) inside each group
+#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
 #' @export
 categorize <- function(x, ...) {
   UseMethod("categorize")
@@ -142,6 +160,7 @@ categorize.numeric <- function(x,
                                n_groups = NULL,
                                range = NULL,
                                lowest = 1,
+                               breaks = "exclusive",
                                labels = NULL,
                                verbose = TRUE,
                                ...) {
@@ -152,6 +171,9 @@ categorize.numeric <- function(x,
   if (identical(split, "equal_length")) split <- "length"
   if (identical(split, "equal_range")) split <- "range"
 
+  # check for valid values
+  breaks <- match.arg(breaks, c("exclusive", "inclusive"))
+
   # save
   original_x <- x
 
@@ -169,9 +191,9 @@ categorize.numeric <- function(x,
   }
 
   if (is.numeric(split)) {
-    breaks <- split
+    category_splits <- split
   } else {
-    breaks <- switch(split,
+    category_splits <- switch(split,
       median = stats::median(x),
       mean = mean(x),
       length = n_groups,
@@ -182,15 +204,18 @@ categorize.numeric <- function(x,
   }
 
   # complete ranges, including minimum and maximum
-  if (!identical(split, "length")) breaks <- unique(c(min(x), breaks, max(x)))
+  if (!identical(split, "length")) {
+    category_splits <- unique(c(min(x), category_splits, max(x)))
+  }
 
   # recode into groups
   out <- droplevels(cut(
     x,
-    breaks = breaks,
+    breaks = category_splits,
     include.lowest = TRUE,
-    right = FALSE
+    right = identical(breaks, "inclusive")
   ))
+  cut_result <- out
   levels(out) <- 1:nlevels(out)
 
   # fix lowest value, add back into original vector
@@ -201,7 +226,7 @@ categorize.numeric <- function(x,
   original_x[!is.na(original_x)] <- out
 
   # turn into factor?
-  .original_x_to_factor(original_x, x, labels, out, verbose, ...)
+  .original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...)
 }
 
 
@@ -223,6 +248,7 @@ categorize.data.frame <- function(x,
                                   n_groups = NULL,
                                   range = NULL,
                                   lowest = 1,
+                                  breaks = "exclusive",
                                   labels = NULL,
                                   append = FALSE,
                                   ignore_case = FALSE,
@@ -260,6 +286,7 @@ categorize.data.frame <- function(x,
     n_groups = n_groups,
     range = range,
     lowest = lowest,
+    breaks = breaks,
     labels = labels,
     verbose = verbose,
     ...
@@ -276,6 +303,7 @@ categorize.grouped_df <- function(x,
                                   n_groups = NULL,
                                   range = NULL,
                                   lowest = 1,
+                                  breaks = "exclusive",
                                   labels = NULL,
                                   append = FALSE,
                                   ignore_case = FALSE,
@@ -319,6 +347,7 @@ categorize.grouped_df <- function(x,
       n_groups = n_groups,
       range = range,
       lowest = lowest,
+      breaks = breaks,
       labels = labels,
       select = select,
       exclude = exclude,
@@ -375,20 +404,26 @@ categorize.grouped_df <- function(x,
 }
 
 
-.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) {
+.original_x_to_factor <- function(original_x, x, cut_result, labels, out, verbose, ...) {
   if (!is.null(labels)) {
     if (length(labels) == length(unique(out))) {
       original_x <- as.factor(original_x)
       levels(original_x) <- labels
-    } else if (length(labels) == 1 && labels %in% c("mean", "median")) {
+    } else if (length(labels) == 1 && labels %in% c("mean", "median", "range", "observed")) {
       original_x <- as.factor(original_x)
       no_na_x <- original_x[!is.na(original_x)]
-      if (labels == "mean") {
-        labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x
-      } else {
-        labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x
-      }
-      levels(original_x) <- insight::format_value(labels, ...)
+      out <- switch(labels,
+        mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x,
+        median = stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x,
+        # labels basically like what "cut()" returns
+        range = levels(cut_result),
+        # range based on the values that are actually present in the data
+        {
+          temp <- stats::aggregate(x, list(no_na_x), FUN = range, na.rm = TRUE)$x
+          apply(temp, 1, function(i) paste0("(", paste(as.vector(i), collapse = "-"), ")"))
+        }
+      )
+      levels(original_x) <- insight::format_value(out, ...)
     } else if (isTRUE(verbose)) {
       insight::format_warning(
         "Argument `labels` and levels of the recoded variable are not of the same length.",
diff --git a/man/categorize.Rd b/man/categorize.Rd
index 28f823dd4..ca013ce2b 100644
--- a/man/categorize.Rd
+++ b/man/categorize.Rd
@@ -14,6 +14,7 @@ categorize(x, ...)
   n_groups = NULL,
   range = NULL,
   lowest = 1,
+  breaks = "exclusive",
   labels = NULL,
   verbose = TRUE,
   ...
@@ -27,6 +28,7 @@ categorize(x, ...)
   n_groups = NULL,
   range = NULL,
   lowest = 1,
+  breaks = "exclusive",
   labels = NULL,
   append = FALSE,
   ignore_case = FALSE,
@@ -67,10 +69,19 @@ for numeric variables, the minimum of the original input is preserved. For
 factors, the default minimum is \code{1}. For \code{split = "equal_range"}, the
 default minimum is always \code{1}, unless specified otherwise in \code{lowest}.}
 
+\item{breaks}{Character, indicating whether breaks for categorizing data are
+\code{"inclusive"} (values indicate the \emph{upper} bound of the \emph{previous} group or
+interval) or \code{"exclusive"} (values indicate the \emph{lower} bound of the \emph{next}
+group or interval to begin). Use \code{labels = "range"} to make this behaviour
+easier to see.}
+
 \item{labels}{Character vector of value labels. If not \code{NULL}, \code{categorize()}
 will returns factors instead of numeric variables, with \code{labels} used
-for labelling the factor levels. Can also be \code{"mean"} or \code{"median"} for a
-factor with labels as the mean/median of each groups.}
+for labelling the factor levels. Can also be \code{"mean"}, \code{"median"},
+\code{"range"} or \code{"observed"} for a factor with labels as the mean/median,
+the requested range (even if not all values of that range are present in
+the data) or observed range (range of the actual recoded values) of each
+group. See 'Examples'.}
 
 \item{verbose}{Toggle warnings.}
 
@@ -145,7 +156,7 @@ It is basically a wrapper around base R's \code{cut()}, providing a simplified
 and more accessible way to define the interval breaks (cut-off values).
 }
 \section{Splits and breaks (cut-off values)}{
-Breaks are in general \emph{exclusive}, this means that these values indicate
+Breaks are by default \emph{exclusive}, this means that these values indicate
 the lower bound of the next group or interval to begin. Take a simple
 example, a numeric variable with values from 1 to 9. The median would be 5,
 thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
@@ -154,6 +165,9 @@ using \code{split = "quantile"} and \code{n_groups = 3} would define breaks at 3
 and 6.33 (see \code{quantile(1:9, probs = c(1/3, 2/3))}), which means that values
 from 1 to 3 belong to the first interval and are recoded into 1 (because
 the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
+
+The opposite behaviour can be achieved using \code{breaks = "inclusive"}, in which
+case
 }
 
 \section{Recoding into groups with equal size or range}{
@@ -217,6 +231,13 @@ categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high"))
 x <- sample(1:10, size = 30, replace = TRUE)
 categorize(x, "equal_length", n_groups = 3, labels = "mean")
 categorize(x, "equal_length", n_groups = 3, labels = "median")
+
+# cut numeric into groups with the requested range as a label name
+# each category has the same range, and labels indicate this range
+categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
+# in this example, each category has the same range, but labels only refer
+# to the ranges of the actual values (present in the data) inside each group
+categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
 }
 \seealso{
 \itemize{
diff --git a/tests/testthat/_snaps/categorize.md b/tests/testthat/_snaps/categorize.md
new file mode 100644
index 000000000..d08c14c4d
--- /dev/null
+++ b/tests/testthat/_snaps/categorize.md
@@ -0,0 +1,47 @@
+# categorize labelling ranged
+
+    Code
+      categorize(mtcars$mpg, "equal_length", n_groups = 5)
+    Output
+       [1] 3 3 3 3 2 2 1 3 3 2 2 2 2 2 1 1 1 5 5 5 3 2 2 1 2 4 4 5 2 2 1 3
+
+---
+
+    Code
+      categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
+    Output
+       [1] [19.8,24.5) [19.8,24.5) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8)
+       [7] [10.4,15.1) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) [15.1,19.8)
+      [13] [15.1,19.8) [15.1,19.8) [10.4,15.1) [10.4,15.1) [10.4,15.1) [29.2,33.9]
+      [19] [29.2,33.9] [29.2,33.9] [19.8,24.5) [15.1,19.8) [15.1,19.8) [10.4,15.1)
+      [25] [15.1,19.8) [24.5,29.2) [24.5,29.2) [29.2,33.9] [15.1,19.8) [15.1,19.8)
+      [31] [10.4,15.1) [19.8,24.5)
+      Levels: [10.4,15.1) [15.1,19.8) [19.8,24.5) [24.5,29.2) [29.2,33.9]
+
+---
+
+    Code
+      categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
+    Output
+       [1] (21-24.4)   (21-24.4)   (21-24.4)   (21-24.4)   (15.2-19.7) (15.2-19.7)
+       [7] (10.4-15)   (21-24.4)   (21-24.4)   (15.2-19.7) (15.2-19.7) (15.2-19.7)
+      [13] (15.2-19.7) (15.2-19.7) (10.4-15)   (10.4-15)   (10.4-15)   (30.4-33.9)
+      [19] (30.4-33.9) (30.4-33.9) (21-24.4)   (15.2-19.7) (15.2-19.7) (10.4-15)  
+      [25] (15.2-19.7) (26-27.3)   (26-27.3)   (30.4-33.9) (15.2-19.7) (15.2-19.7)
+      [31] (10.4-15)   (21-24.4)  
+      Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9)
+
+---
+
+    Code
+      categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed",
+      breaks = "inclusive")
+    Output
+       [1] (21-24.4)   (21-24.4)   (21-24.4)   (21-24.4)   (15.2-19.7) (15.2-19.7)
+       [7] (10.4-15)   (21-24.4)   (21-24.4)   (15.2-19.7) (15.2-19.7) (15.2-19.7)
+      [13] (15.2-19.7) (15.2-19.7) (10.4-15)   (10.4-15)   (10.4-15)   (30.4-33.9)
+      [19] (30.4-33.9) (30.4-33.9) (21-24.4)   (15.2-19.7) (15.2-19.7) (10.4-15)  
+      [25] (15.2-19.7) (26-27.3)   (26-27.3)   (30.4-33.9) (15.2-19.7) (15.2-19.7)
+      [31] (10.4-15)   (21-24.4)  
+      Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9)
+
diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R
index 0e0b5d317..9ab8eadde 100644
--- a/tests/testthat/test-categorize.R
+++ b/tests/testthat/test-categorize.R
@@ -1,5 +1,5 @@
 set.seed(123)
-d <- sample(1:10, size = 500, replace = TRUE)
+d <- sample.int(10, size = 500, replace = TRUE)
 
 test_that("recode median", {
   expect_identical(categorize(d), ifelse(d >= median(d), 2, 1))
@@ -22,7 +22,7 @@ test_that("recode quantile", {
 })
 
 set.seed(123)
-d <- sample(1:100, size = 1000, replace = TRUE)
+d <- sample.int(100, size = 1000, replace = TRUE)
 
 test_that("recode range", {
   expect_error(categorize(d, split = "range"))
@@ -84,7 +84,7 @@ test_that("recode length", {
 })
 
 set.seed(123)
-x <- sample(1:10, size = 30, replace = TRUE)
+x <- sample.int(10, size = 30, replace = TRUE)
 test_that("recode factor labels", {
   expect_type(categorize(x, "equal_length", n_groups = 3), "double")
   expect_s3_class(categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")), "factor")
@@ -232,3 +232,21 @@ test_that("categorize regex", {
     categorize(mtcars, select = "mpg")
   )
 })
+
+
+# select helpers ------------------------------
+test_that("categorize labelling ranged", {
+  data(mtcars)
+  expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5))
+  expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range"))
+  expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed"))
+})
+
+test_that("categorize breaks", {
+  data(mtcars)
+  expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", breaks = "inclusive"))
+  expect_error(
+    categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"),
+    regex = "should be one of"
+  )
+})

From 41aec02ed50f78865d6b9083563c9fa0dc2843d9 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Wed, 2 Oct 2024 18:06:36 +0200
Subject: [PATCH 5/6] typo in comment

---
 tests/testthat/test-categorize.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R
index 9ab8eadde..217797893 100644
--- a/tests/testthat/test-categorize.R
+++ b/tests/testthat/test-categorize.R
@@ -234,7 +234,7 @@ test_that("categorize regex", {
 })
 
 
-# select helpers ------------------------------
+# labelling ranges ------------------------------
 test_that("categorize labelling ranged", {
   data(mtcars)
   expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5))

From be756b644b5431bf85c969ff69dac4ac4be7cf6a Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Wed, 2 Oct 2024 18:09:56 +0200
Subject: [PATCH 6/6] add correct snapshot test

---
 tests/testthat/_snaps/categorize.md | 19 +++++++++----------
 tests/testthat/test-categorize.R    |  2 +-
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/tests/testthat/_snaps/categorize.md b/tests/testthat/_snaps/categorize.md
index d08c14c4d..9ed3c1115 100644
--- a/tests/testthat/_snaps/categorize.md
+++ b/tests/testthat/_snaps/categorize.md
@@ -31,17 +31,16 @@
       [31] (10.4-15)   (21-24.4)  
       Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9)
 
----
+# categorize breaks
 
     Code
-      categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed",
-      breaks = "inclusive")
+      categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive")
     Output
-       [1] (21-24.4)   (21-24.4)   (21-24.4)   (21-24.4)   (15.2-19.7) (15.2-19.7)
-       [7] (10.4-15)   (21-24.4)   (21-24.4)   (15.2-19.7) (15.2-19.7) (15.2-19.7)
-      [13] (15.2-19.7) (15.2-19.7) (10.4-15)   (10.4-15)   (10.4-15)   (30.4-33.9)
-      [19] (30.4-33.9) (30.4-33.9) (21-24.4)   (15.2-19.7) (15.2-19.7) (10.4-15)  
-      [25] (15.2-19.7) (26-27.3)   (26-27.3)   (30.4-33.9) (15.2-19.7) (15.2-19.7)
-      [31] (10.4-15)   (21-24.4)  
-      Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9)
+       [1] (19.8,24.5] (19.8,24.5] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8]
+       [7] [10.4,15.1] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8] (15.1,19.8]
+      [13] (15.1,19.8] (15.1,19.8] [10.4,15.1] [10.4,15.1] [10.4,15.1] (29.2,33.9]
+      [19] (29.2,33.9] (29.2,33.9] (19.8,24.5] (15.1,19.8] (15.1,19.8] [10.4,15.1]
+      [25] (15.1,19.8] (24.5,29.2] (24.5,29.2] (29.2,33.9] (15.1,19.8] (15.1,19.8]
+      [31] [10.4,15.1] (19.8,24.5]
+      Levels: [10.4,15.1] (15.1,19.8] (19.8,24.5] (24.5,29.2] (29.2,33.9]
 
diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R
index 217797893..30453d9ad 100644
--- a/tests/testthat/test-categorize.R
+++ b/tests/testthat/test-categorize.R
@@ -244,7 +244,7 @@ test_that("categorize labelling ranged", {
 
 test_that("categorize breaks", {
   data(mtcars)
-  expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", breaks = "inclusive"))
+  expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive"))
   expect_error(
     categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"),
     regex = "should be one of"