From d605707de1df3450593400fce4c5580d6725c394 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Fri, 20 Oct 2023 11:14:23 +0200 Subject: [PATCH] split quant_approx into two arguments --- NEWS.md | 2 +- R/hstats.R | 97 ++++++------------- R/partial_dep.R | 2 +- R/utils_calculate.R | 59 +++++++++++ R/utils_input.R | 16 --- man/hstats.Rd | 35 ++++--- packaging.R | 1 - .../{test_utils.R => test_calculate.R} | 53 +++------- tests/testthat/test_hstats.R | 9 +- tests/testthat/test_statistics.R | 38 ++++++++ 10 files changed, 168 insertions(+), 144 deletions(-) rename tests/testthat/{test_utils.R => test_calculate.R} (81%) diff --git a/NEWS.md b/NEWS.md index 409c78e3..b0c5a53e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ ## Major changes -- `hstats()` has received an argument `quant_approx` to speed-up calculations by quantile binning. Dense numeric variables are replaced by midpoints of `quant_approx + 1` uniform quantiles. By default, the value is `NULL` (no approximation). Even relatively high values like 50 will bring a massive speed-up for dense features, mainly for the one-way calculations. Use this option when calculations are slow, or when you want to increase `n_max`. +- Quantile approximation: `hstats()` now has the option `approx = FALSE`. Set to `TRUE` to replace values of dense numeric columns by `grid_size = 50` quantile midpoints. This will bring a massive speed-up for one-way calculations. Use this option when one-way calculations are slow, or when you want to increase `n_max`. - `hstats()`: `n_max` has been increased from 300 to 500 rows. This will make estimates of H statistics more stable at the price of longer run time. Reduce to 300 for the old behaviour. - `hstats()`: Three-way interactions are not anymore calculated by default. Set `threeway_m` to 5 for the old behaviour. - Revised plots: The colors and color palettes have changed and can now also be controlled via global options. For instance, to change the fill color of all bars, set `options(hstats.fill = new value)`. Value labels are more clear, and there are more options. Varying color/fill scales now use viridis (inferno). This can be modified on the fly or via `options(hstats.viridis_args = list(...))`. diff --git a/R/hstats.R b/R/hstats.R index 5d554eab..58c2c1e7 100644 --- a/R/hstats.R +++ b/R/hstats.R @@ -41,14 +41,16 @@ #' @param threeway_m Like `pairwise_m`, but controls the feature count for #' three-way interactions. Cannot be larger than `pairwise_m`. #' To save computation time, the default is 0. -#' @param quant_approx Integer. Dense numeric variables in `X` are replaced by midpoints -#' of `quant_approx + 1` uniform quantiles. By default, the value is `NULL` -#' (no approximation). Even relatively high values like 50 will bring a massive -#' speed-up for dense features, mainly for one-way statistics. -#' Note that the quantiles are calculated after subsampling to `n_max` rows. -#' @param eps Threshold below which numerator values are set to 0. Default is 1e-10. +#' @param approx Should quantile approximation be applied to dense numeric features? +#' The default is `FALSE`. Setting this option to `TRUE` brings a massive speed-up +#' for one-way calculations. It can, e.g., be used when the number of features is +#' very large. +#' @param grid_size Integer controlling the number of quantile midpoints used to +#' approximate dense numerics. The quantile midpoints are calculated after +#' subampling via `n_max`. Only relevant if `approx = TRUE`. #' @param n_max If `X` has more than `n_max` rows, a random sample of `n_max` rows is #' selected from `X`. In this case, set a random seed for reproducibility. +#' @param eps Threshold below which numerator values are set to 0. Default is 1e-10. #' @param w Optional vector of case weights. Can also be a column name of `X`. #' @param verbose Should a progress bar be shown? The default is `TRUE`. #' @param ... Additional arguments passed to `pred_fun(object, X, ...)`, @@ -141,8 +143,9 @@ hstats <- function(object, ...) { hstats.default <- function(object, X, v = NULL, pred_fun = stats::predict, pairwise_m = 5L, threeway_m = 0L, - quant_approx = NULL, eps = 1e-10, - n_max = 500L, w = NULL, verbose = TRUE, ...) { + approx = FALSE, grid_size = 50L, + n_max = 500L, eps = 1e-10, + w = NULL, verbose = TRUE, ...) { stopifnot( is.matrix(X) || is.data.frame(X), is.function(pred_fun) @@ -180,8 +183,8 @@ hstats.default <- function(object, X, v = NULL, } # Quantile approximation to speedup things for dense features - if (!is.null(quant_approx)) { - X <- approx_matrix_or_df(X = X, v = v, m = quant_approx) + if (isTRUE(approx)) { + X <- approx_matrix_or_df(X = X, v = v, m = grid_size) } # Predictions ("F" in Friedman and Popescu) always calculated (cheap) @@ -277,8 +280,9 @@ hstats.default <- function(object, X, v = NULL, hstats.ranger <- function(object, X, v = NULL, pred_fun = function(m, X, ...) stats::predict(m, X, ...)$predictions, pairwise_m = 5L, threeway_m = 0L, - quant_approx = NULL, eps = 1e-10, - n_max = 500L, w = NULL, verbose = TRUE, ...) { + approx = FALSE, grid_size = 50L, + n_max = 500L, eps = 1e-10, + w = NULL, verbose = TRUE, ...) { hstats.default( object = object, X = X, @@ -286,9 +290,10 @@ hstats.ranger <- function(object, X, v = NULL, pred_fun = pred_fun, pairwise_m = pairwise_m, threeway_m = threeway_m, - quant_approx = quant_approx, - eps = eps, + approx = approx, + grid_size = grid_size, n_max = n_max, + eps = eps, w = w, verbose = verbose, ... @@ -300,8 +305,9 @@ hstats.ranger <- function(object, X, v = NULL, hstats.Learner <- function(object, X, v = NULL, pred_fun = NULL, pairwise_m = 5L, threeway_m = 0L, - quant_approx = NULL, eps = 1e-10, - n_max = 500L, w = NULL, verbose = TRUE, ...) { + approx = FALSE, grid_size = 50L, + n_max = 500L, eps = 1e-10, + w = NULL, verbose = TRUE, ...) { if (is.null(pred_fun)) { pred_fun <- mlr3_pred_fun(object, X = X) } @@ -312,9 +318,10 @@ hstats.Learner <- function(object, X, v = NULL, pred_fun = pred_fun, pairwise_m = pairwise_m, threeway_m = threeway_m, - quant_approx = quant_approx, - eps = eps, + approx = approx, + grid_size = grid_size, n_max = n_max, + eps = eps, w = w, verbose = verbose, ... @@ -327,9 +334,9 @@ hstats.explainer <- function(object, X = object[["data"]], v = NULL, pred_fun = object[["predict_function"]], pairwise_m = 5L, threeway_m = 0L, - quant_approx = NULL, eps = 1e-10, - n_max = 500L, w = object[["weights"]], - verbose = TRUE, ...) { + approx = FALSE, grid_size = 50L, + n_max = 500L, eps = 1e-10, + w = object[["weights"]], verbose = TRUE, ...) { hstats.default( object = object[["model"]], X = X, @@ -337,9 +344,10 @@ hstats.explainer <- function(object, X = object[["data"]], pred_fun = pred_fun, pairwise_m = pairwise_m, threeway_m = threeway_m, - quant_approx = quant_approx, - eps = eps, + approx = approx, + grid_size = grid_size, n_max = n_max, + eps = eps, w = w, verbose = verbose, ... @@ -548,46 +556,3 @@ get_v <- function(H, m) { } v[v %in% v_cand] } - -#' Approximate Vector -#' -#' Internal function. Approximates values by the average of the two closest quantiles. -#' -#' @noRd -#' @keywords internal -#' -#' @param x A vector or factor. -#' @param m Number of unique values. -#' @returns An approximation of `x` (or `x` if non-numeric or discrete). -approx_vector <- function(x, m = 25L) { - if (!is.numeric(x) || length(unique(x)) <= m) { - return(x) - } - p <- seq(0, 1, length.out = m + 1L) - q <- unique(stats::quantile(x, probs = p, names = FALSE, na.rm = TRUE)) - mids <- (q[-length(q)] + q[-1L]) / 2 - return(mids[findInterval(x, q, rightmost.closed = TRUE)]) -} - -#' Approximate df or Matrix -#' -#' Internal function. Calls `approx_vector()` to each column in matrix or data.frame. -#' -#' @noRd -#' @keywords internal -#' -#' @param X A matrix or data.frame. -#' @param m Number of unique values. -#' @returns An approximation of `X` (or `X` if non-numeric or discrete). -approx_matrix_or_df <- function(X, v = colnames(X), m = 25L) { - stopifnot( - m >= 2L, - is.data.frame(X) || is.matrix(X) - ) - if (is.data.frame(X)) { - X[v] <- lapply(X[v], FUN = approx_vector, m = m) - } else { # Matrix - X[, v] <- apply(X[, v, drop = FALSE], MARGIN = 2L, FUN = approx_vector, m = m) - } - return(X) -} diff --git a/R/partial_dep.R b/R/partial_dep.R index 0ed60f89..73b4b5f0 100644 --- a/R/partial_dep.R +++ b/R/partial_dep.R @@ -29,8 +29,8 @@ #' A partial dependence plot (PDP) plots the values of \eqn{\hat F_s(\mathbf{x}_s)} #' over a grid of evaluation points \eqn{\mathbf{x}_s}. #' -#' @inheritParams hstats #' @inheritParams multivariate_grid +#' @inheritParams hstats #' @param v One or more column names over which you want to calculate the partial #' dependence. #' @param grid Evaluation grid. A vector (if `length(v) == 1L`), or a matrix/data.frame diff --git a/R/utils_calculate.R b/R/utils_calculate.R index d15b8933..1ea70a54 100644 --- a/R/utils_calculate.R +++ b/R/utils_calculate.R @@ -112,3 +112,62 @@ wcenter <- function(x, w = NULL) { # sweep(x, MARGIN = 2L, STATS = wcolMeans(x, w = w)) # Slower x - matrix(wcolMeans(x, w = w), nrow = nrow(x), ncol = ncol(x), byrow = TRUE) } + +#' Bin into Quantiles +#' +#' Internal function. Applies [cut()] to quantile breaks. +#' +#' @noRd +#' @keywords internal +#' +#' @param x A numeric vector. +#' @param m Number of intervals. +#' @returns A factor, representing binned `x`. +qcut <- function(x, m) { + p <- seq(0, 1, length.out = m + 1L) + g <- stats::quantile(x, probs = p, names = FALSE, type = 1L, na.rm = TRUE) + cut(x, breaks = unique(g), include.lowest = TRUE) +} + +#' Approximate Vector +#' +#' Internal function. Approximates values by the average of the two closest quantiles. +#' +#' @noRd +#' @keywords internal +#' +#' @param x A vector or factor. +#' @param m Number of unique values. +#' @returns An approximation of `x` (or `x` if non-numeric or discrete). +approx_vector <- function(x, m = 50L) { + if (!is.numeric(x) || length(unique(x)) <= m) { + return(x) + } + p <- seq(0, 1, length.out = m + 1L) + q <- unique(stats::quantile(x, probs = p, names = FALSE, na.rm = TRUE)) + mids <- (q[-length(q)] + q[-1L]) / 2 + return(mids[findInterval(x, q, rightmost.closed = TRUE)]) +} + +#' Approximate df or Matrix +#' +#' Internal function. Calls `approx_vector()` to each column in matrix or data.frame. +#' +#' @noRd +#' @keywords internal +#' +#' @param X A matrix or data.frame. +#' @param m Number of unique values. +#' @returns An approximation of `X` (or `X` if non-numeric or discrete). +approx_matrix_or_df <- function(X, v = colnames(X), m = 50L) { + stopifnot( + m >= 2L, + is.data.frame(X) || is.matrix(X) + ) + if (is.data.frame(X)) { + X[v] <- lapply(X[v], FUN = approx_vector, m = m) + } else { # Matrix + X[, v] <- apply(X[, v, drop = FALSE], MARGIN = 2L, FUN = approx_vector, m = m) + } + return(X) +} diff --git a/R/utils_input.R b/R/utils_input.R index ac671a9c..c59d5027 100644 --- a/R/utils_input.R +++ b/R/utils_input.R @@ -1,19 +1,3 @@ -#' Bin into Quantiles -#' -#' Internal function. Applies [cut()] to quantile breaks. -#' -#' @noRd -#' @keywords internal -#' -#' @param x A numeric vector. -#' @param m Number of intervals. -#' @returns A factor, representing binned `x`. -qcut <- function(x, m) { - p <- seq(0, 1, length.out = m + 1L) - g <- stats::quantile(x, probs = p, names = FALSE, type = 1L, na.rm = TRUE) - cut(x, breaks = unique(g), include.lowest = TRUE) -} - #' Prepares Group BY Variable #' #' Internal function that prepares a BY variable or BY column name. diff --git a/man/hstats.Rd b/man/hstats.Rd index a2bebb79..10fb6507 100644 --- a/man/hstats.Rd +++ b/man/hstats.Rd @@ -17,9 +17,10 @@ hstats(object, ...) pred_fun = stats::predict, pairwise_m = 5L, threeway_m = 0L, - quant_approx = NULL, - eps = 1e-10, + approx = FALSE, + grid_size = 50L, n_max = 500L, + eps = 1e-10, w = NULL, verbose = TRUE, ... @@ -32,9 +33,10 @@ hstats(object, ...) pred_fun = function(m, X, ...) stats::predict(m, X, ...)$predictions, pairwise_m = 5L, threeway_m = 0L, - quant_approx = NULL, - eps = 1e-10, + approx = FALSE, + grid_size = 50L, n_max = 500L, + eps = 1e-10, w = NULL, verbose = TRUE, ... @@ -47,9 +49,10 @@ hstats(object, ...) pred_fun = NULL, pairwise_m = 5L, threeway_m = 0L, - quant_approx = NULL, - eps = 1e-10, + approx = FALSE, + grid_size = 50L, n_max = 500L, + eps = 1e-10, w = NULL, verbose = TRUE, ... @@ -62,9 +65,10 @@ hstats(object, ...) pred_fun = object[["predict_function"]], pairwise_m = 5L, threeway_m = 0L, - quant_approx = NULL, - eps = 1e-10, + approx = FALSE, + grid_size = 50L, n_max = 500L, + eps = 1e-10, w = object[["weights"]], verbose = TRUE, ... @@ -99,17 +103,20 @@ strongest variable names is taken. This can lead to very long run-times.} three-way interactions. Cannot be larger than \code{pairwise_m}. To save computation time, the default is 0.} -\item{quant_approx}{Integer. Dense numeric variables in \code{X} are replaced by midpoints -of \code{quant_approx + 1} uniform quantiles. By default, the value is \code{NULL} -(no approximation). Even relatively high values like 50 will bring a massive -speed-up for dense features, mainly for one-way statistics. -Note that the quantiles are calculated after subsampling to \code{n_max} rows.} +\item{approx}{Should quantile approximation be applied to dense numeric features? +The default is \code{FALSE}. Setting this option to \code{TRUE} brings a massive speed-up +for one-way calculations. It can, e.g., be used when the number of features is +very large.} -\item{eps}{Threshold below which numerator values are set to 0. Default is 1e-10.} +\item{grid_size}{Integer controlling the number of quantile midpoints used to +approximate dense numerics. The quantile midpoints are calculated after +subampling via \code{n_max}. Only relevant if \code{approx = TRUE}.} \item{n_max}{If \code{X} has more than \code{n_max} rows, a random sample of \code{n_max} rows is selected from \code{X}. In this case, set a random seed for reproducibility.} +\item{eps}{Threshold below which numerator values are set to 0. Default is 1e-10.} + \item{w}{Optional vector of case weights. Can also be a column name of \code{X}.} \item{verbose}{Should a progress bar be shown? The default is \code{TRUE}.} diff --git a/packaging.R b/packaging.R index b7be2b63..18f66c91 100644 --- a/packaging.R +++ b/packaging.R @@ -83,7 +83,6 @@ build() # build(binary = TRUE) install(upgrade = FALSE) - # Run only if package is public(!) and should go to CRAN if (FALSE) { check_win_devel() diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_calculate.R similarity index 81% rename from tests/testthat/test_utils.R rename to tests/testthat/test_calculate.R index b8c2e7b8..3f0e26aa 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_calculate.R @@ -122,43 +122,6 @@ test_that("wcenter() works for vectors", { expect_equal(wcenter(x, w = w), xpected) }) -test_that("poor_man_stack() works (test could be improved", { - y <- c("a", "b", "c") - z <- c("aa", "bb", "cc") - X <- data.frame(x = 1:3, y = y, z = z) - out <- poor_man_stack(X, to_stack = c("y", "z")) - xpected <- data.frame( - x = rep(1:3, times = 2L), - varying_ = factor(rep(c("y", "z"), each = 3L)), - value_ = c(y, z) - ) - expect_equal(out, xpected) - - expect_error(poor_man_stack(cbind(a = 1:3, b = 2:4), to_stack = "b")) -}) - -test_that("mat2df() works (test could be improved)", { - mat <- cbind(y = 1:2, z = c(0.5, 0.5)) - rownames(mat) <- letters[seq_len(nrow(mat))] - out <- mat2df(mat) - rownames(out) <- NULL - xpected <- data.frame( - id_ = "Overall", - variable_ = factor(c("a", "b", "a", "b")), - varying_ = factor(c("y", "y", "z", "z")), - value_ = c(1, 2, 0.5, 0.5), - stringsAsFactors = FALSE - ) - expect_equal(out, xpected) - - mat_no_names <- mat - colnames(mat_no_names) <- NULL - expect_equal(unique(mat2df(mat_no_names)$varying_), factor(c("y1", "y2"))) - - expect_error(mat2df(head(iris))) - expect_error(mat2df(1:4)) -}) - test_that("qcut() works (test should be improved)", { x <- 1:100 expect_equal(levels(qcut(x, m = 2)), c("[1,50]", "(50,100]")) @@ -169,11 +132,6 @@ test_that("qcut() works with missings", { expect_true(is.na(qcut(c(NA, 1:9), m = 2)[1L])) }) -test_that("approx_vector() works with missings", { - expect_equal(approx_vector(c(NA, "A", "B"), m = 2), c(NA, "A", "B")) - expect_true(is.na(approx_vector(c(NA, 1:9), m = 2)[1L])) -}) - test_that("approx_matrix_or_df works as expected", { expect_equal(approx_matrix_or_df(iris, m = 200L), iris) expect_false(identical(r <- approx_matrix_or_df(iris, m = 5L), iris)) @@ -184,4 +142,15 @@ test_that("approx_matrix_or_df works as expected", { expect_equal(approx_matrix_or_df(ir, m = 200L), ir) expect_false(identical(approx_matrix_or_df(ir, m = 5L), ir)) expect_equal(length(unique(r[, "Sepal.Width"])), 5L) + + X <- cbind(dense = 1:20, discrete = rep(1:2, each = 10)) + expect_equal( + apply(approx_matrix_or_df(X, m = 5L), 2L, function(x) length(unique(x))), + c(dense = 5L, discrete = 2L) + ) +}) + +test_that("approx_vector() works with missings", { + expect_equal(approx_vector(c(NA, "A", "B"), m = 2), c(NA, "A", "B")) + expect_true(is.na(approx_vector(c(NA, 1:9), m = 2)[1L])) }) diff --git a/tests/testthat/test_hstats.R b/tests/testthat/test_hstats.R index 1c10a1a4..bdc35a4b 100644 --- a/tests/testthat/test_hstats.R +++ b/tests/testthat/test_hstats.R @@ -20,7 +20,9 @@ test_that("Additive models show 0 interactions (univariate)", { expect_message(plot(h2_overall(s, zero = FALSE))) # With quantile approximation - s <- hstats(fit, X = iris[-1L], verbose = FALSE, threeway_m = 5L, quant_approx = 5L) + s <- hstats( + fit, X = iris[-1L], verbose = FALSE, threeway_m = 5L, approx = TRUE, grid_size = 5L, + ) expect_null(h2_pairwise(s, zero = FALSE)$M) }) @@ -72,7 +74,7 @@ test_that("Non-additive models show interactions > 0 (one interaction)", { expect_null(h2_threeway(s, zero = FALSE)$M) # With quantile approximation - s <- hstats(fit, X = iris[-1L], verbose = FALSE, quant_approx = 5L) + s <- hstats(fit, X = iris[-1L], verbose = FALSE, approx = TRUE, grid_size = 5L) expect_true(h2(s)$M > 0) }) @@ -312,7 +314,8 @@ test_that("matrix case works as well", { v = colnames(iris[2:4]), pred_fun = pred_fun, verbose = FALSE, - quant_approx = 5L + approx = TRUE, + grid_size = 20L ) expect_equal(c(h2_overall(s)$M), c(0, 0, 0)) }) diff --git a/tests/testthat/test_statistics.R b/tests/testthat/test_statistics.R index a43d4232..06662b8c 100644 --- a/tests/testthat/test_statistics.R +++ b/tests/testthat/test_statistics.R @@ -1,3 +1,41 @@ + +test_that("poor_man_stack() works (test could be improved", { + y <- c("a", "b", "c") + z <- c("aa", "bb", "cc") + X <- data.frame(x = 1:3, y = y, z = z) + out <- poor_man_stack(X, to_stack = c("y", "z")) + xpected <- data.frame( + x = rep(1:3, times = 2L), + varying_ = factor(rep(c("y", "z"), each = 3L)), + value_ = c(y, z) + ) + expect_equal(out, xpected) + + expect_error(poor_man_stack(cbind(a = 1:3, b = 2:4), to_stack = "b")) +}) + +test_that("mat2df() works (test could be improved)", { + mat <- cbind(y = 1:2, z = c(0.5, 0.5)) + rownames(mat) <- letters[seq_len(nrow(mat))] + out <- mat2df(mat) + rownames(out) <- NULL + xpected <- data.frame( + id_ = "Overall", + variable_ = factor(c("a", "b", "a", "b")), + varying_ = factor(c("y", "y", "z", "z")), + value_ = c(1, 2, 0.5, 0.5), + stringsAsFactors = FALSE + ) + expect_equal(out, xpected) + + mat_no_names <- mat + colnames(mat_no_names) <- NULL + expect_equal(unique(mat2df(mat_no_names)$varying_), factor(c("y1", "y2"))) + + expect_error(mat2df(head(iris))) + expect_error(mat2df(1:4)) +}) + test_that("postprocess() works for matrix input", { num <- cbind(a = 1:3, b = c(1, 1, 1)) denom <- cbind(a = 1:3, b = 1:3)