diff --git a/.gitignore b/.gitignore index e6b4b3974..619cbb459 100644 --- a/.gitignore +++ b/.gitignore @@ -5,32 +5,22 @@ Rprof.out src/*.o src/*.so - *.o - *.so - tmp/ - \.RDataTmp - *.out - *.Rout - src-i386/ - src-x64/ - paper_experiments/res/single_res/ - paper_experiments/res/single_res_old/ inst/doc - docs/* doc Meta docs /doc/ /Meta/ -.idea \ No newline at end of file +.idea +.DS_Store diff --git a/DESCRIPTION b/DESCRIPTION index 42e96754a..f7994608d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,8 +32,6 @@ Imports: stats, data.table, Rcpp (>= 0.12.15), - condMVNorm, - mvnfast, Matrix, future.apply Suggests: diff --git a/NAMESPACE b/NAMESPACE index 956c374c8..6537b3bcb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -63,6 +63,8 @@ export(observation_impute_cpp) export(plot_MSEv_eval_crit) export(predict_model) export(prepare_data) +export(prepare_data_copula_cpp) +export(prepare_data_gaussian_cpp) export(rss_cpp) export(setup) export(setup_approach) @@ -96,6 +98,7 @@ importFrom(stats,model.matrix) importFrom(stats,predict) importFrom(stats,pt) importFrom(stats,qt) +importFrom(stats,rnorm) importFrom(stats,sd) importFrom(stats,setNames) importFrom(utils,head) diff --git a/R/RcppExports.R b/R/RcppExports.R index bbe62a76d..1f27325fe 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -80,6 +80,88 @@ aicc_full_cpp <- function(h, X_list, mcov_list, S_scale_dist, y_list, negative) .Call(`_shapr_aicc_full_cpp`, h, X_list, mcov_list, S_scale_dist, y_list, negative) } +#' Compute the quantiles using quantile type seven +#' +#' @details Using quantile type number seven from stats::quantile in R. +#' +#' @param x arma::vec. Numeric vector whose sample quantiles are wanted. +#' @param probs arma::vec. Numeric vector of probabilities with values between zero and one. +#' +#' @return A vector of length `length(probs)` with the quantiles is returned. +#' +#' @keywords internal +#' @author Lars Henry Berge Olsen +quantile_type7_cpp <- function(x, probs) { + .Call(`_shapr_quantile_type7_cpp`, x, probs) +} + +#' Transforms new data to a standardized normal distribution +#' +#' @param z arma::mat. The data are the Gaussian Monte Carlos samples to transform. +#' @param x arma::mat. The data with the original transformation. Used to conduct the transformation of `z`. +#' +#' @return arma::mat of the same dimension as `z` +#' +#' @keywords internal +#' @author Lars Henry Berge Olsen +inv_gaussian_transform_cpp <- function(z, x) { + .Call(`_shapr_inv_gaussian_transform_cpp`, z, x) +} + +#' Generate (Gaussian) Copula MC samples +#' +#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the +#' univariate standard normal. +#' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations +#' to explain on the original scale. +#' @param x_explain_gaussian_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the +#' observations to explain after being transformed using the Gaussian transform, i.e., the samples have been +#' transformed to a standardized normal distribution. +#' @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations. +#' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of +#' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones. +#' This is not a problem internally in shapr as the empty and grand coalitions treated differently. +#' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed +#' using the Gaussian transform, i.e., the samples have been transformed to a standardized normal distribution. +#' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance +#' between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been +#' transformed to a standardized normal distribution. +#' +#' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where +#' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian +#' copula MC samples for each explicand and coalition on the original scale. +#' +#' @export +#' @keywords internal +#' @author Lars Henry Berge Olsen +prepare_data_copula_cpp <- function(MC_samples_mat, x_explain_mat, x_explain_gaussian_mat, x_train_mat, S, mu, cov_mat) { + .Call(`_shapr_prepare_data_copula_cpp`, MC_samples_mat, x_explain_mat, x_explain_gaussian_mat, x_train_mat, S, mu, cov_mat) +} + +#' Generate Gaussian MC samples +#' +#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the +#' univariate standard normal. +#' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations +#' to explain. +#' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of +#' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones. +#' This is not a problem internally in shapr as the empty and grand coalitions treated differently. +#' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature. +#' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance +#' between all pairs of features. +#' +#' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where +#' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian +#' MC samples for each explicand and coalition. +#' +#' @export +#' @keywords internal +#' @author Lars Henry Berge Olsen +prepare_data_gaussian_cpp <- function(MC_samples_mat, x_explain_mat, S, mu, cov_mat) { + .Call(`_shapr_prepare_data_gaussian_cpp`, MC_samples_mat, x_explain_mat, S, mu, cov_mat) +} + #' (Generalized) Mahalanobis distance #' #' Used to get the Euclidean distance as well by setting \code{mcov} = \code{diag(m)}. diff --git a/R/approach_copula.R b/R/approach_copula.R index 403d88809..4e7f5e914 100644 --- a/R/approach_copula.R +++ b/R/approach_copula.R @@ -1,12 +1,11 @@ #' @rdname setup_approach -#' #' @inheritParams default_doc_explain -#' #' @export +#' @author Martin Jullum setup_approach.copula <- function(internal, ...) { parameters <- internal$parameters - x_train <- internal$data$x_train - x_explain <- internal$data$x_explain + x_train_mat <- as.matrix(internal$data$x_train) + x_explain_mat <- as.matrix(internal$data$x_explain) # Checking if factor features are present feature_specs <- internal$objects$feature_specs @@ -23,28 +22,21 @@ setup_approach.copula <- function(internal, ...) { } # Prepare transformed data - parameters$copula.mu <- rep(0, ncol(x_train)) - x_train0 <- apply( - X = x_train, - MARGIN = 2, - FUN = gaussian_transform - ) - parameters$copula.cov_mat <- get_cov_mat(x_train0) - + parameters$copula.mu <- rep(0, ncol(x_train_mat)) + x_train_mat0 <- apply(X = x_train_mat, MARGIN = 2, FUN = gaussian_transform) + parameters$copula.cov_mat <- get_cov_mat(x_train_mat0) x_explain_gaussian <- apply( - X = rbind(x_explain, x_train), + X = rbind(x_explain_mat, x_train_mat), MARGIN = 2, FUN = gaussian_transform_separate, - n_y = nrow(x_explain) + n_y = nrow(x_explain_mat) ) + if (is.null(dim(x_explain_gaussian))) x_explain_gaussian <- t(as.matrix(x_explain_gaussian)) - if (is.null(dim(x_explain_gaussian))) { - x_explain_gaussian <- t(as.matrix(x_explain_gaussian)) - } - # TODO: Change to this a data.table for consistency (not speed/memory) - internal$data$copula.x_explain_gaussian <- x_explain_gaussian + # Add objects to internal list internal$parameters <- parameters + internal$data$copula.x_explain_gaussian <- as.data.table(x_explain_gaussian) return(internal) } @@ -52,130 +44,75 @@ setup_approach.copula <- function(internal, ...) { #' @inheritParams default_doc #' @rdname prepare_data #' @export -prepare_data.copula <- function(internal, index_features = NULL, ...) { - x_train <- internal$data$x_train - x_explain <- internal$data$x_explain +#' @author Lars Henry Berge Olsen +prepare_data.copula <- function(internal, index_features, ...) { + # Extract used variables + S <- internal$objects$S[index_features, , drop = FALSE] + feature_names <- internal$parameters$feature_names n_explain <- internal$parameters$n_explain - copula.cov_mat <- internal$parameters$copula.cov_mat n_samples <- internal$parameters$n_samples - copula.mu <- internal$parameters$copula.mu n_features <- internal$parameters$n_features + n_combinations_now <- length(index_features) + x_train_mat <- as.matrix(internal$data$x_train) + x_explain_mat <- as.matrix(internal$data$x_explain) + copula.mu <- internal$parameters$copula.mu + copula.cov_mat <- internal$parameters$copula.cov_mat + copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian) + + # Generate the MC samples from N(0, 1) + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Use C++ to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}), for all coalitions and explicands, + # and then transforming them back to the original scale using the inverse Gaussian transform in C++. + # The object `dt` is a 3D array of dimension (n_samples, n_explain * n_coalitions, n_features). + dt <- prepare_data_copula_cpp( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + x_explain_gaussian_mat = copula.x_explain_gaussian_mat, + x_train_mat = x_train_mat, + S = S, + mu = copula.mu, + cov_mat = copula.cov_mat + ) - copula.x_explain_gaussian <- internal$data$copula.x_explain_gaussian - X <- internal$objects$X - - - x_explain0 <- as.matrix(x_explain) - dt_l <- list() - if (is.null(index_features)) { - features <- X$features - } else { - features <- X$features[index_features] - } - + # Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features). + dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features) - for (i in seq_len(n_explain)) { - l <- lapply( - X = features, - FUN = sample_copula, - n_samples = n_samples, - mu = copula.mu, - cov_mat = copula.cov_mat, - m = n_features, - x_explain = x_explain0[i, , drop = FALSE], - x_train = as.matrix(x_train), - x_explain_gaussian = copula.x_explain_gaussian[i, , drop = FALSE] - ) + # Convert to a data.table and add extra identification columns + dt <- data.table::as.data.table(dt) + data.table::setnames(dt, feature_names) + dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)] + dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))] + dt[, w := 1 / n_samples] + dt[, id_combination := index_features[id_combination]] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) - dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / n_samples] - dt_l[[i]][, id := i] - if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] - } - dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE) return(dt) } -#' Sample conditional variables using the Gaussian copula approach -#' -#' @param index_given Integer vector. The indices of the features to condition upon. Note that -#' `min(index_given) >= 1` and `max(index_given) <= m`. -#' @param m Positive integer. The total number of features. -#' @param x_explain_gaussian Numeric matrix. Contains the observation whose predictions ought -#' to be explained (test data), -#' after quantile-transforming them to standard Gaussian variables. -#' @param x_explain Numeric matrix. Contains the features of the observation whose -#' predictions ought to be explained (test data). -#' -#' @return data.table -#' -#' @keywords internal -#' -#' @author Martin Jullum -sample_copula <- function(index_given, n_samples, mu, cov_mat, m, x_explain_gaussian, x_train, x_explain) { - # Handles the unconditional and full conditional separtely when predicting - if (length(index_given) %in% c(0, m)) { - ret <- matrix(x_explain, ncol = m, nrow = 1) - } else { - dependent_ind <- (seq_len(length(mu)))[-index_given] - - tmp <- condMVNorm::condMVN( - mean = mu, - sigma = cov_mat, - dependent.ind = dependent_ind, - given.ind = index_given, - X.given = x_explain_gaussian[index_given] - ) - - ret0_z <- mvnfast::rmvn(n = n_samples, mu = tmp$condMean, sigma = tmp$condVar) - - ret0_x <- apply( - X = rbind(ret0_z, x_train[, dependent_ind, drop = FALSE]), - MARGIN = 2, - FUN = inv_gaussian_transform, - n_z = n_samples - ) - - ret <- matrix(NA, ncol = m, nrow = n_samples) - ret[, index_given] <- rep(x_explain[index_given], each = n_samples) - ret[, dependent_ind] <- ret0_x - } - colnames(ret) <- colnames(x_explain) - return(as.data.table(ret)) -} - - -#' Transforms new data to a standardized normal distribution +#' Transforms a sample to standardized normal distribution #' -#' @param zx Numeric vector. The first `n_z` items are the Gaussian data, and the last part is -#' the data with the original transformation. -#' @param n_z Positive integer. Number of elements of `zx` that belongs to new data. +#' @param x Numeric vector.The data which should be transformed to a standard normal distribution. #' -#' @return Numeric vector of length `n_z` +#' @return Numeric vector of length `length(x)` #' #' @keywords internal -#' #' @author Martin Jullum -inv_gaussian_transform <- function(zx, n_z) { - if (n_z >= length(zx)) stop("n_z should be less than length(zx)") - ind <- 1:n_z - z <- zx[ind] - x <- zx[-ind] - u <- stats::pnorm(z) - x_new <- stats::quantile(x, probs = u) - return(as.double(x_new)) +gaussian_transform <- function(x) { + u <- rank(x) / (length(x) + 1) + z <- stats::qnorm(u) + return(z) } #' Transforms new data to standardized normal (dimension 1) based on other data transformations #' #' @param yx Numeric vector. The first `n_y` items is the data that is transformed, and last #' part is the data with the original transformation. -#' @param n_y Positive integer. Number of elements of `yx` that belongs to the gaussian data. +#' @param n_y Positive integer. Number of elements of `yx` that belongs to the Gaussian data. #' #' @return Vector of back-transformed Gaussian data #' #' @keywords internal -#' #' @author Martin Jullum gaussian_transform_separate <- function(yx, n_y) { if (n_y >= length(yx)) stop("n_y should be less than length(yx)") @@ -187,18 +124,3 @@ gaussian_transform_separate <- function(yx, n_y) { z_y <- stats::qnorm(u_y) return(z_y) } - -#' Transforms a sample to standardized normal distribution -#' -#' @param x Numeric vector.The data which should be transformed to a standard normal distribution. -#' -#' @return Numeric vector of length `length(x)` -#' -#' @keywords internal -#' -#' @author Martin Jullum -gaussian_transform <- function(x) { - u <- rank(x) / (length(x) + 1) - z <- stats::qnorm(u) - return(z) -} diff --git a/R/approach_gaussian.R b/R/approach_gaussian.R index 8191757a2..23dd34d98 100644 --- a/R/approach_gaussian.R +++ b/R/approach_gaussian.R @@ -45,46 +45,47 @@ setup_approach.gaussian <- function(internal, return(internal) } +#' @inheritParams default_doc #' @rdname prepare_data #' @export -prepare_data.gaussian <- function(internal, index_features = NULL, ...) { - x_train <- internal$data$x_train - x_explain <- internal$data$x_explain +#' @author Lars Henry Berge Olsen +prepare_data.gaussian <- function(internal, index_features, ...) { + # Extract used variables + S <- internal$objects$S[index_features, , drop = FALSE] + feature_names <- internal$parameters$feature_names n_explain <- internal$parameters$n_explain - gaussian.cov_mat <- internal$parameters$gaussian.cov_mat - n_samples <- internal$parameters$n_samples - gaussian.mu <- internal$parameters$gaussian.mu n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + n_combinations_now <- length(index_features) + x_explain_mat <- as.matrix(internal$data$x_explain) + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + + # Generate the MC samples from N(0, 1) + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Use Cpp to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}) for all coalitions and explicands. + # The object `dt` is a 3D array of dimension (n_samples, n_explain * n_coalitions, n_features). + dt <- prepare_data_gaussian_cpp( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + S = S, + mu = mu, + cov_mat = cov_mat + ) - X <- internal$objects$X - - x_explain0 <- as.matrix(x_explain) - dt_l <- list() - - if (is.null(index_features)) { - features <- X$features - } else { - features <- X$features[index_features] - } - - for (i in seq_len(n_explain)) { - l <- lapply( - X = features, - FUN = sample_gaussian, - n_samples = n_samples, - mu = gaussian.mu, - cov_mat = gaussian.cov_mat, - m = n_features, - x_explain = x_explain0[i, , drop = FALSE] - ) + # Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features). + dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features) - dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / n_samples] - dt_l[[i]][, id := i] - if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] - } + # Convert to a data.table and add extra identification columns + dt <- data.table::as.data.table(dt) + data.table::setnames(dt, feature_names) + dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)] + dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))] + dt[, w := 1 / n_samples] + dt[, id_combination := index_features[id_combination]] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) - dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE) return(dt) } @@ -111,47 +112,3 @@ get_cov_mat <- function(x_train, min_eigen_value = 1e-06) { get_mu_vec <- function(x_train) { unname(colMeans(x_train)) } - -#' Sample conditional Gaussian variables -#' -#' @inheritParams sample_copula -#' -#' @return data.table -#' -#' @keywords internal -#' -#' @author Martin Jullum -sample_gaussian <- function(index_given, n_samples, mu, cov_mat, m, x_explain) { - # Check input - stopifnot(is.matrix(x_explain)) - - # Handles the unconditional and full conditional separtely when predicting - cnms <- colnames(x_explain) - if (length(index_given) %in% c(0, m)) { - return(data.table::as.data.table(x_explain)) - } - - dependent_ind <- seq_along(mu)[-index_given] - x_explain_gaussian <- x_explain[index_given] - tmp <- condMVNorm::condMVN( - mean = mu, - sigma = cov_mat, - dependent.ind = dependent_ind, - given.ind = index_given, - X.given = x_explain_gaussian - ) - - # Makes the conditional covariance matrix symmetric in the rare case where numerical instability made it unsymmetric - if (!isSymmetric(tmp[["condVar"]])) { - tmp[["condVar"]] <- Matrix::symmpart(tmp$condVar) - } - - ret0 <- mvnfast::rmvn(n = n_samples, mu = tmp$condMean, sigma = tmp$condVar) - - ret <- matrix(NA, ncol = m, nrow = n_samples) - ret[, index_given] <- rep(x_explain_gaussian, each = n_samples) - ret[, dependent_ind] <- ret0 - - colnames(ret) <- cnms - return(as.data.table(ret)) -} diff --git a/R/shapr-package.R b/R/shapr-package.R index 320619c33..75bb26ba7 100644 --- a/R/shapr-package.R +++ b/R/shapr-package.R @@ -23,6 +23,8 @@ #' #' @importFrom stats sd qt pt #' +#' @importFrom stats rnorm +#' #' @importFrom Rcpp sourceCpp #' #' @keywords internal diff --git a/inst/scripts/compare_copula_in_R_and_C++.R b/inst/scripts/compare_copula_in_R_and_C++.R new file mode 100644 index 000000000..fd6b1cfb4 --- /dev/null +++ b/inst/scripts/compare_copula_in_R_and_C++.R @@ -0,0 +1,1551 @@ +# Libraries ------------------------------------------------------------------------------------------------------- +# library(shapr) +# library(rbenchmark) +library(data.table) +devtools::load_all(".") + +# Old R code ------------------------------------------------------------------------------------------------------ +## R Old version --------------------------------------------------------------------------------------------------- +#' @inheritParams default_doc +#' @rdname prepare_data +#' @export +prepare_data.copula_old <- function(internal, index_features = NULL, ...) { + X <- internal$objects$X + x_train <- internal$data$x_train + x_explain <- internal$data$x_explain + n_explain <- internal$parameters$n_explain + n_samples <- internal$parameters$n_samples + n_features <- internal$parameters$n_features + copula.mu <- internal$parameters$copula.mu + copula.cov_mat <- internal$parameters$copula.cov_mat + copula.x_explain_gaussian <- internal$data$copula.x_explain_gaussian + + x_explain0 <- as.matrix(x_explain) + dt_l <- list() + if (is.null(index_features)) { + features <- X$features + } else { + features <- X$features[index_features] + } + + for (i in seq_len(n_explain)) { + cat(sprintf("%d,", i)) + l <- lapply( + X = features, + FUN = sample_copula_old, + n_samples = n_samples, + mu = copula.mu, + cov_mat = copula.cov_mat, + m = n_features, + x_explain = x_explain0[i, , drop = FALSE], + x_train = as.matrix(x_train), + x_explain_gaussian = as.matrix(copula.x_explain_gaussian)[i, , drop = FALSE] + ) + dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") + dt_l[[i]][, w := 1 / n_samples] + dt_l[[i]][, id := i] + if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] + } + + dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE) + + return(dt) +} + +#' Sample conditional variables using the Gaussian copula approach +#' +#' @param index_given Integer vector. The indices of the features to condition upon. Note that +#' `min(index_given) >= 1` and `max(index_given) <= m`. +#' @param m Positive integer. The total number of features. +#' @param x_explain_gaussian Numeric matrix. Contains the observation whose predictions ought +#' to be explained (test data), +#' after quantile-transforming them to standard Gaussian variables. +#' @param x_explain Numeric matrix. Contains the features of the observation whose +#' predictions ought to be explained (test data). +#' +#' @return data.table +#' +#' @keywords internal +#' +#' @author Martin Jullum +sample_copula_old <- function(index_given, n_samples, mu, cov_mat, m, x_explain_gaussian, x_train, x_explain) { + # Handles the unconditional and full conditional separtely when predicting + if (length(index_given) %in% c(0, m)) { + ret <- matrix(x_explain, ncol = m, nrow = 1) + } else { + dependent_ind <- (seq_len(length(mu)))[-index_given] + + tmp <- condMVNorm::condMVN( + mean = mu, + sigma = cov_mat, + dependent.ind = dependent_ind, + given.ind = index_given, + X.given = x_explain_gaussian[index_given] + ) + + ret0_z <- mvnfast::rmvn(n = n_samples, mu = tmp$condMean, sigma = tmp$condVar) + + ret0_x <- apply( + X = rbind(ret0_z, x_train[, dependent_ind, drop = FALSE]), + MARGIN = 2, + FUN = inv_gaussian_transform_old, + n_z = n_samples, + type = 7 + ) + + ret <- matrix(NA, ncol = m, nrow = n_samples) + ret[, index_given] <- rep(x_explain[index_given], each = n_samples) + ret[, dependent_ind] <- ret0_x + } + colnames(ret) <- colnames(x_explain) + return(as.data.table(ret)) +} + + +#' Transforms new data to a standardized normal distribution +#' +#' @param zx Numeric vector. The first `n_z` items are the Gaussian data, and the last part is +#' the data with the original transformation. +#' @param n_z Positive integer. Number of elements of `zx` that belongs to new data. +#' +#' @return Numeric vector of length `n_z` +#' +#' @keywords internal +#' +#' @author Martin Jullum +inv_gaussian_transform_old <- function(zx, n_z, type) { + if (n_z >= length(zx)) stop("n_z should be less than length(zx)") + ind <- 1:n_z + z <- zx[ind] + x <- zx[-ind] + u <- stats::pnorm(z) + x_new <- stats::quantile(x, probs = u, type = type) + return(as.double(x_new)) +} + +#' Transforms a sample to standardized normal distribution +#' +#' @param x Numeric vector.The data which should be transformed to a standard normal distribution. +#' +#' @return Numeric vector of length `length(x)` +#' +#' @keywords internal +#' @author Martin Jullum +gaussian_transform_old <- function(x) { + u <- rank(x) / (length(x) + 1) + z <- stats::qnorm(u) + return(z) +} + +#' Transforms new data to standardized normal (dimension 1) based on other data transformations +#' +#' @param yx Numeric vector. The first `n_y` items is the data that is transformed, and last +#' part is the data with the original transformation. +#' @param n_y Positive integer. Number of elements of `yx` that belongs to the Gaussian data. +#' +#' @return Vector of back-transformed Gaussian data +#' +#' @keywords internal +#' @author Martin Jullum +gaussian_transform_separate_old <- function(yx, n_y) { + if (n_y >= length(yx)) stop("n_y should be less than length(yx)") + ind <- 1:n_y + x <- yx[-ind] + tmp <- rank(yx)[ind] + tmp <- tmp - rank(tmp) + 0.5 + u_y <- tmp / (length(x) + 1) + z_y <- stats::qnorm(u_y) + return(z_y) +} + + +## C++ arma version ------------------------------------------------------------------------------------------------- +#' @inheritParams default_doc +#' @rdname prepare_data +#' @export +#' @author Lars Henry Berge Olsen +prepare_data.copula_cpp_arma <- function(internal, index_features, ...) { + # Extract used variables + S <- internal$objects$S[index_features, , drop = FALSE] + feature_names <- internal$parameters$feature_names + n_explain <- internal$parameters$n_explain + n_samples <- internal$parameters$n_samples + n_features <- internal$parameters$n_features + n_combinations_now <- length(index_features) + x_train_mat <- as.matrix(internal$data$x_train) + x_explain_mat <- as.matrix(internal$data$x_explain) + copula.mu <- internal$parameters$copula.mu + copula.cov_mat <- internal$parameters$copula.cov_mat + copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian) + + # TODO: Note that `as.matrix` is not needed for `copula.x_explain_gaussian_mat` as it is already defined as a matrix + # in `setup_approach.copula`, however, it seems that Martin plans to make it into a data.table, thus, I include + # `as.matrix` as future safety. DISCUSS WITH MARTIN WHAT HIS PLANS ARE! + + # Generate the MC samples from N(0, 1) + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Use C++ to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}), for all coalitions and explicands, + # and then transforming them back to the original scale using the inverse Gaussian transform in C++. + # The object `dt` is a 3D array of dimension (n_samples, n_explain * n_coalitions, n_features). + dt <- prepare_data_copula_cpp_arma( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + x_explain_gaussian_mat = copula.x_explain_gaussian_mat, + x_train_mat = x_train_mat, + S = S, + mu = copula.mu, + cov_mat = copula.cov_mat + ) + + # Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features). + dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features) + + # Convert to a data.table and add extra identification columns + dt <- data.table::as.data.table(dt) + data.table::setnames(dt, feature_names) + dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)] + dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))] + dt[, w := 1 / n_samples] + dt[, id_combination := index_features[id_combination]] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + return(dt) +} + + + + + +## C++ and R version ---------------------------------------------------------------------------------------------- +#' @inheritParams default_doc +#' @rdname prepare_data +#' @export +#' @author Lars Henry Berge Olsen +prepare_data.copula_cpp_and_R <- function(internal, index_features, ...) { + # Extract used variables + S <- internal$objects$S[index_features, , drop = FALSE] + feature_names <- internal$parameters$feature_names + n_explain <- internal$parameters$n_explain + n_samples <- internal$parameters$n_samples + n_features <- internal$parameters$n_features + n_combinations_now <- length(index_features) + x_train_mat <- as.matrix(internal$data$x_train) + x_explain_mat <- as.matrix(internal$data$x_explain) + copula.mu <- internal$parameters$copula.mu + copula.cov_mat <- internal$parameters$copula.cov_mat + copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian) + + # TODO: Note that `as.matrix` is not needed for `copula.x_explain_gaussian_mat` as it is already defined as a matrix + # in `setup_approach.copula`, however, it seems that Martin plans to make it into a data.table, thus, I include + # `as.matrix` as future safety. DISCUSS WITH MARTIN WHAT HIS PLANS ARE! + + # Generate the MC samples from N(0, 1) + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Use C++ to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}), for all coalitions and explicands, + # and then transforming them back to the original scale using the inverse Gaussian transform in C++. + # The object `dt` is a 3D array of dimension (n_samples, n_explain * n_coalitions, n_features). + dt <- prepare_data_copula_cpp_and_R( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + x_explain_gaussian_mat = copula.x_explain_gaussian_mat, + x_train_mat = x_train_mat, + S = S, + mu = copula.mu, + cov_mat = copula.cov_mat + ) + + # Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features). + dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features) + + # Convert to a data.table and add extra identification columns + dt <- data.table::as.data.table(dt) + data.table::setnames(dt, feature_names) + dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)] + dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))] + dt[, w := 1 / n_samples] + dt[, id_combination := index_features[id_combination]] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + return(dt) +} + +#' Transform data using the inverse Gaussian transformation. +#' +#' @details This function is called from `prepare_data_copula_cpp()` as the this was faster +#' +#' @param z Matrix. The data are the Gaussian Monte Carlos samples to transform. +#' @param x Matrix. The data with the original transformation. Used to conduct the transformation of `z`. +#' +#' @return Matrix of the same dimension as `z`. +#' +#' @keywords internal +#' @author Lars Henry Berge Olsen +inv_gaussian_transform_R <- function(z, x) { + u <- stats::pnorm(z) + x_new <- sapply(seq_len(ncol(u)), function(idx) quantile.type7(x[, idx], probs = u[, idx])) + return(x_new) +} + +#' Compute the quantiles using quantile type seven +#' +#' @details Using quantile type number 7 from stats::quantile. +#' +#' @param x numeric vector whose sample quantiles are wanted. +#' @param probs numeric vector of probabilities with values between zero and one. +#' +#' @return A vector of length length(`probs`) is returned. +#' +#' @keywords internal +#' @author Lars Henry Berge Olsen +quantile.type7 <- function(x, probs) { + n <- length(x) + probs <- pmax(0, pmin(1, probs)) # allow for slight overshoot + index <- 1 + (n - 1) * probs + lo <- floor(index) + hi <- ceiling(index) + x <- sort(x, partial = unique(c(lo, hi))) + qs <- x[lo] + i <- which(index > lo) + h <- (index - lo)[i] + qs[i] <- (1 - h) * qs[i] + h * x[hi[i]] + return(qs) +} + + + +# C++ with sourceRcpp --------------------------------------------------------------------------------------------- +#' @inheritParams default_doc +#' @rdname prepare_data +#' @export +#' @author Lars Henry Berge Olsen +prepare_data.copula_sourceCpp <- function(internal, index_features, ...) { + # Extract used variables + S <- internal$objects$S[index_features, , drop = FALSE] + feature_names <- internal$parameters$feature_names + n_explain <- internal$parameters$n_explain + n_samples <- internal$parameters$n_samples + n_features <- internal$parameters$n_features + n_combinations_now <- length(index_features) + x_train_mat <- as.matrix(internal$data$x_train) + x_explain_mat <- as.matrix(internal$data$x_explain) + copula.mu <- internal$parameters$copula.mu + copula.cov_mat <- internal$parameters$copula.cov_mat + copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian) + + # Generate the MC samples from N(0, 1) + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Use C++ to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}), for all coalitions and explicands, + # and then transforming them back to the original scale using the inverse Gaussian transform in C++. + # The object `dt` is a 3D array of dimension (n_samples, n_explain * n_coalitions, n_features). + dt <- prepare_data_copula_cpp_sourceCpp( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + x_explain_gaussian_mat = copula.x_explain_gaussian_mat, + x_train_mat = x_train_mat, + S = S, + mu = copula.mu, + cov_mat = copula.cov_mat + ) + + # Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features). + dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features) + + # Convert to a data.table and add extra identification columns + dt <- data.table::as.data.table(dt) + data.table::setnames(dt, feature_names) + dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)] + dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))] + dt[, w := 1 / n_samples] + dt[, id_combination := index_features[id_combination]] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + return(dt) +} + +Rcpp::sourceCpp( + code = ' + #include +using namespace Rcpp; + +// [[Rcpp::depends(RcppArmadillo)]] + +// Compute the quantiles using quantile type seven +// +// @details Using quantile type number seven from stats::quantile in R. +// +// @param x arma::vec. Numeric vector whose sample quantiles are wanted. +// @param probs arma::vec. Numeric vector of probabilities with values between zero and one. +// +// @return A vector of length `length(probs)` with the quantiles is returned. +// +// @keywords internal +// @author Lars Henry Berge Olsen +// [[Rcpp::export]] + arma::vec quantile_type7_cpp_sourceCpp(const arma::vec& x, const arma::vec& probs) { + int n = x.n_elem; + int m = probs.n_elem; + + // Initialize output quantile vector + arma::vec qs(m); + + // Calculate indices + arma::vec index = 1 + (n - 1) * probs; + arma::vec lo = arma::floor(index); + arma::vec hi = arma::ceil(index); + + // Sort the data + arma::vec sorted_x = arma::sort(x); + + // Calculate quantiles using quantile type seven + for (int i = 0; i < m; ++i) { + qs(i) = sorted_x(lo(i) - 1); + if (index(i) > lo(i)) { + double h = index(i) - lo(i); + qs(i) = (1 - h) * qs(i) + h * sorted_x(hi(i) - 1); + } + } + + return qs; + } + +// Transforms new data to a standardized normal distribution +// +// @details The function uses `arma::quantile(...)` which corresponds to Rs `stats::quantile(..., type = 5)`. +// +// @param z arma::mat. The data are the Gaussian Monte Carlos samples to transform. +// @param x arma::mat. The data with the original transformation. Used to conduct the transformation of `z`. +// +// @return arma::mat of the same dimension as `z` +// +// @keywords internal +// @author Lars Henry Berge Olsen +// [[Rcpp::export]] + arma::mat inv_gaussian_transform_cpp_sourceCpp(const arma::mat& z, const arma::mat& x) { + int n_features = z.n_cols; + int n_samples = z.n_rows; + arma::mat z_new(n_samples, n_features); + arma::mat u = arma::normcdf(z); + for (int feature_idx = 0; feature_idx < n_features; feature_idx++) { + z_new.col(feature_idx) = quantile_type7_cpp_sourceCpp(x.col(feature_idx), u.col(feature_idx)); + } + return z_new; + } + +// Generate (Gaussian) Copula MC samples +// +// @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the +// univariate standard normal. +// @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations +// to explain on the original scale. +// @param x_explain_gaussian_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the +// observations to explain after being transformed using the Gaussian transform, i.e., the samples have been +// transformed to a standardized normal distribution. +// @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations. +// @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of +// the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones. +// This is not a problem internally in shapr as the empty and grand coalitions treated differently. +// @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed +// using the Gaussian transform, i.e., the samples have been transformed to a standardized normal distribution. +// @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance +// between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been +// transformed to a standardized normal distribution. +// +// @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where +// the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian +// copula MC samples for each explicand and coalition on the original scale. +// +// @export +// @keywords internal +// @author Lars Henry Berge Olsen +// [[Rcpp::export]] + arma::cube prepare_data_copula_cpp_sourceCpp(const arma::mat& MC_samples_mat, + const arma::mat& x_explain_mat, + const arma::mat& x_explain_gaussian_mat, + const arma::mat& x_train_mat, + const arma::mat& S, + const arma::vec& mu, + const arma::mat& cov_mat) { + + int n_explain = x_explain_mat.n_rows; + int n_samples = MC_samples_mat.n_rows; + int n_features = MC_samples_mat.n_cols; + int n_coalitions = S.n_rows; + + // Initialize auxiliary matrix and result cube + arma::mat aux_mat(n_samples, n_features); + arma::cube result_cube(n_samples, n_explain*n_coalitions, n_features); + + // Iterate over the coalitions + for (int S_ind = 0; S_ind < n_coalitions; S_ind++) { + + // Get current coalition S and the indices of the features in coalition S and mask Sbar + arma::mat S_now = S.row(S_ind); + arma::uvec S_now_idx = arma::find(S_now > 0.5); + arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); + + // Extract the features we condition on, both on the original scale and the Gaussian transformed values. + arma::mat x_S_star = x_explain_mat.cols(S_now_idx); + arma::mat x_S_star_gaussian = x_explain_gaussian_mat.cols(S_now_idx); + + // Extract the mean values of the Gaussian transformed features in the two sets + arma::vec mu_S = mu.elem(S_now_idx); + arma::vec mu_Sbar = mu.elem(Sbar_now_idx); + + // Extract the relevant parts of the Gaussian transformed covariance matrix + arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); + arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); + arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); + arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); + + // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); + arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; + + // Ensure that the conditional covariance matrix is symmetric + if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { + cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); + } + + // Compute the conditional mean of Xsbar given Xs = Xs_star_gaussian, i.e., of the Gaussian transformed features + arma::mat x_Sbar_gaussian_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star_gaussian.each_row() - mu_S.t()).t(); + x_Sbar_gaussian_mean.each_col() += mu_Sbar; + + // Transform the samples to be from N(O, Sigma_{Sbar|S}) + arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); + + // Loop over the different explicands and combine the generated values with the values we conditioned on + for (int idx_now = 0; idx_now < n_explain; idx_now++) { + + // Transform the MC samples to be from N(mu_{Sbar|S}, Sigma_{Sbar|S}) for one coalition and one explicand + arma::mat MC_samples_mat_now_now = + MC_samples_mat_now + repmat(trans(x_Sbar_gaussian_mean.col(idx_now)), n_samples, 1); + + // Transform the MC to the original scale using the inverse Gaussian transform + arma::mat MC_samples_mat_now_now_trans = + inv_gaussian_transform_cpp_sourceCpp(MC_samples_mat_now_now, x_train_mat.cols(Sbar_now_idx)); + + // Insert the generate Gaussian copula MC samples and the feature values we condition on into an auxiliary matrix + aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now_now_trans; + aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); + + // Insert the auxiliary matrix into the result cube + result_cube.col(S_ind*n_explain + idx_now) = aux_mat; + } + } + + return result_cube; + } + +// Transforms a sample to standardized normal distribution +// +// @param x Numeric matrix. The data which should be transformed to a standard normal distribution. +// +// @return Numeric matrix of dimension `dim(x)` +// +// @keywords internal +// @author Lars Henry Berge Olsen +// [[Rcpp::export]] +Rcpp::NumericMatrix gaussian_transform_cpp_sourceCpp(const arma::mat& x) { + int n_obs = x.n_rows; + int n_features = x.n_cols; + + // Pre allocate the return matrix + Rcpp::NumericMatrix x_trans(n_obs, n_features); + + // Iterate over the columns, i.e., the features + for (int idx_feature = 0; idx_feature < n_features; ++idx_feature) { + // Compute the rank and transform to standardized normal distribution + arma::vec rank_now = arma::conv_to::from(arma::sort_index(arma::sort_index(x.col(idx_feature)))); + Rcpp::NumericVector u = Rcpp::wrap((rank_now + 1) / (n_obs + 1)); + x_trans(Rcpp::_, idx_feature) = Rcpp::qnorm(u); + } + + return x_trans; +} + +// Transforms new data to standardized normal (column-wise) based on other data transformations +// +// @param y arma::mat. A numeric matrix containing the data that is to be transformed. +// @param x arma::mat. A numeric matrix containing the data of the original transformation. +// +// @return An arma::mat matrix of the same dimension as `y` containing the back-transformed Gaussian data. +// +// @keywords internal +// @author Lars Henry Berge Olsen, Martin Jullum +// [[Rcpp::export]] +Rcpp::NumericMatrix gaussian_transform_separate_cpp_sourceCpp(const arma::mat& y, const arma::mat& x) { + int n_features = x.n_cols; + int n_y_rows = y.n_rows; + int n_x_rows = x.n_rows; + + // Pre allocate the return matrix + Rcpp::NumericMatrix z_y(n_y_rows, n_features); + + // Compute the transformation for each feature at the time + for (int idx_feature = 0; idx_feature < n_features; ++idx_feature) { + arma::vec yx_now = arma::join_cols(y.col(idx_feature), x.col(idx_feature)); + arma::vec rank_now_1 = arma::conv_to::from(arma::sort_index(arma::sort_index(yx_now))).head(n_y_rows); + arma::vec rank_now_2 = arma::conv_to::from(arma::sort_index(arma::sort_index(rank_now_1))); + arma::vec tmp = rank_now_1 - rank_now_2 + 0.5; + Rcpp::NumericVector u_y = Rcpp::wrap(tmp / (n_x_rows + 1)); + z_y(Rcpp::_, idx_feature) = Rcpp::qnorm(u_y); + } + + return z_y; +} + + ' +) + + + +# Old C++ code ---------------------------------------------------------------------------------------------------- +Rcpp::sourceCpp( + code = ' +// [[Rcpp::depends("RcppArmadillo")]] +#include +using namespace Rcpp; + +// Transforms new data to a standardized normal distribution +// +// @details The function uses `arma::quantile(...)` which corresponds to Rs `stats::quantile(..., type = 5)`. +// +// @param z arma::mat. The data are the Gaussian Monte Carlos samples to transform. +// @param x arma::mat. The data with the original transformation. Used to conduct the transformation of `z`. +// +// @return arma::mat of the same dimension as `z` +// +// @keywords internal +// @author Lars Henry Berge Olsen +// [[Rcpp::export]] +arma::mat inv_gaussian_transform_cpp_arma(arma::mat z, arma::mat x) { + int n_features = z.n_cols; + int n_samples = z.n_rows; + arma::mat z_new(n_samples, n_features); + arma::mat u = arma::normcdf(z); + for (int feature_idx = 0; feature_idx < n_features; feature_idx++) { + z_new.col(feature_idx) = arma::quantile(x.col(feature_idx), u.col(feature_idx)); + } + return z_new; +} + +// Generate (Gaussian) Copula MC samples +// +// @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the +// univariate standard normal. +// @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations +// to explain on the original scale. +// @param x_explain_gaussian_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the +// observations to explain after being transformed using the Gaussian transform, i.e., the samples have been +// transformed to a standardized normal distribution. +// @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations. +// @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of +// the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones. +// This is not a problem internally in shapr as the empty and grand coalitions treated differently. +// @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed +// using the Gaussian transform, i.e., the samples have been transformed to a standardized normal distribution. +// @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance +// between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been +// transformed to a standardized normal distribution. +// +// @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where +// the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian +// copula MC samples for each explicand and coalition on the original scale. +// +// @export +// @keywords internal +// @author Lars Henry Berge Olsen +// [[Rcpp::export]] +arma::cube prepare_data_copula_cpp_arma(arma::mat MC_samples_mat, + arma::mat x_explain_mat, + arma::mat x_explain_gaussian_mat, + arma::mat x_train_mat, + arma::mat S, + arma::vec mu, + arma::mat cov_mat) { + + int n_explain = x_explain_mat.n_rows; + int n_samples = MC_samples_mat.n_rows; + int n_features = MC_samples_mat.n_cols; + int n_coalitions = S.n_rows; + + // Initialize auxiliary matrix and result cube + arma::mat aux_mat(n_samples, n_features); + arma::cube result_cube(n_samples, n_explain*n_coalitions, n_features); + + // Iterate over the coalitions + for (int S_ind = 0; S_ind < n_coalitions; S_ind++) { + + // Get current coalition S and the indices of the features in coalition S and mask Sbar + arma::mat S_now = S.row(S_ind); + arma::uvec S_now_idx = arma::find(S_now > 0.5); + arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); + + // Extract the features we condition on, both on the original scale and the Gaussian transformed values. + arma::mat x_S_star = x_explain_mat.cols(S_now_idx); + arma::mat x_S_star_gaussian = x_explain_gaussian_mat.cols(S_now_idx); + + // Extract the mean values of the Gaussian transformed features in the two sets + arma::vec mu_S = mu.elem(S_now_idx); + arma::vec mu_Sbar = mu.elem(Sbar_now_idx); + + // Extract the relevant parts of the Gaussian transformed covariance matrix + arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); + arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); + arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); + arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); + + // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); + arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; + + // Ensure that the conditional covariance matrix is symmetric + if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { + cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); + } + + // Compute the conditional mean of Xsbar given Xs = Xs_star_gaussian, i.e., of the Gaussian transformed features + arma::mat x_Sbar_gaussian_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star_gaussian.each_row() - mu_S.t()).t(); + x_Sbar_gaussian_mean.each_col() += mu_Sbar; + + // Transform the samples to be from N(O, Sigma_{Sbar|S}) + arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); + + // Loop over the different explicands and combine the generated values with the values we conditioned on + for (int idx_now = 0; idx_now < n_explain; idx_now++) { + + // Transform the MC samples to be from N(mu_{Sbar|S}, Sigma_{Sbar|S}) for one coalition and one explicand + arma::mat MC_samples_mat_now_now = + MC_samples_mat_now + repmat(trans(x_Sbar_gaussian_mean.col(idx_now)), n_samples, 1); + + // Transform the MC to the original scale using the inverse Gaussian transform + arma::mat MC_samples_mat_now_now_trans = + inv_gaussian_transform_cpp_arma(MC_samples_mat_now_now, x_train_mat.cols(Sbar_now_idx)); + + // Insert the generate Gaussian copula MC samples and the feature values we condition on into an auxiliary matrix + aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now_now_trans; + aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); + + // Insert the auxiliary matrix into the result cube + result_cube.col(S_ind*n_explain + idx_now) = aux_mat; + } + } + + return result_cube; +} + +// Generate (Gaussian) Copula MC samples +// +// @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the +// univariate standard normal. +// @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations +// to explain on the original scale. +// @param x_explain_gaussian_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the +// observations to explain after being transformed using the Gaussian transform, i.e., the samples have been +// transformed to a standardized normal distribution. +// @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations. +// @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of +// the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones. +// This is not a problem internally in shapr as the empty and grand coalitions treated differently. +// @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed +// using the Gaussian transform, i.e., the samples have been transformed to a standardized normal distribution. +// @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance +// between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been +// transformed to a standardized normal distribution. +// +// @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where +// the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian +// copula MC samples for each explicand and coalition on the original scale. +// +// @export +// @keywords internal +// @author Lars Henry Berge Olsen +// [[Rcpp::export]] +arma::cube prepare_data_copula_cpp_and_R(arma::mat MC_samples_mat, + arma::mat x_explain_mat, + arma::mat x_explain_gaussian_mat, + arma::mat x_train_mat, + arma::mat S, + arma::vec mu, + arma::mat cov_mat) { + + int n_explain = x_explain_mat.n_rows; + int n_samples = MC_samples_mat.n_rows; + int n_features = MC_samples_mat.n_cols; + int n_coalitions = S.n_rows; + + // Get the R functions for computing the inverse gaussian transform + Rcpp::Function inv_gaussian_transform_R("inv_gaussian_transform_R"); + + // Initialize auxiliary matrix and result cube + arma::mat aux_mat(n_samples, n_features); + arma::cube result_cube(n_samples, n_explain*n_coalitions, n_features); + + // Iterate over the coalitions + for (int S_ind = 0; S_ind < n_coalitions; S_ind++) { + + // Get current coalition S and the indices of the features in coalition S and mask Sbar + arma::mat S_now = S.row(S_ind); + arma::uvec S_now_idx = arma::find(S_now > 0.5); + arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); + + // Extract the features we condition on, both on the original scale and the Gaussian transformed values. + arma::mat x_S_star = x_explain_mat.cols(S_now_idx); + arma::mat x_S_star_gaussian = x_explain_gaussian_mat.cols(S_now_idx); + + // Extract the mean values of the Gaussian transformed features in the two sets + arma::vec mu_S = mu.elem(S_now_idx); + arma::vec mu_Sbar = mu.elem(Sbar_now_idx); + + // Extract the relevant parts of the Gaussian transformed covariance matrix + arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); + arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); + arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); + arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); + + // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); + arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; + + // Ensure that the conditional covariance matrix is symmetric + if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { + cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); + } + + // Compute the conditional mean of Xsbar given Xs = Xs_star_gaussian, i.e., of the Gaussian transformed features + arma::mat x_Sbar_gaussian_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star_gaussian.each_row() - mu_S.t()).t(); + x_Sbar_gaussian_mean.each_col() += mu_Sbar; + + // Transform the samples to be from N(O, Sigma_{Sbar|S}) + arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); + + // Loop over the different explicands and combine the generated values with the values we conditioned on + for (int idx_now = 0; idx_now < n_explain; idx_now++) { + + // Transform the MC samples to be from N(mu_{Sbar|S}, Sigma_{Sbar|S}) for one coalition and one explicand + arma::mat MC_samples_mat_now_now = + MC_samples_mat_now + repmat(trans(x_Sbar_gaussian_mean.col(idx_now)), n_samples, 1); + + arma::mat x_train_mat_now = x_train_mat.cols(Sbar_now_idx); + //arma::mat x_train_mat_now = arma::normcdf(x_train_mat.cols(Sbar_now_idx)); + + // Transform the MC to the original scale using the inverse Gaussian transform + arma::mat MC_samples_mat_now_now_trans = + Rcpp::as(inv_gaussian_transform_R(Rcpp::wrap(MC_samples_mat_now_now), + Rcpp::wrap(x_train_mat_now))); + + // Insert the generate Gaussian copula MC samples and the feature values we condition on into an auxiliary matrix + aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now_now_trans; + aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); + + // Insert the auxiliary matrix into the result cube + result_cube.col(S_ind*n_explain + idx_now) = aux_mat; + } + } + + return result_cube; +} +') + + + + + + + + + + +# Setup ----------------------------------------------------------------------------------------------------------- +{ + n_samples <- 1000 + n_train <- 1000 + n_test <- 20 + M <- 8 + rho <- 0.5 + betas <- c(0, rep(1, M)) + + # We use the Gaussian copula approach + approach <- "copula" + + # Mean of the multivariate Gaussian distribution + mu <- rep(0, times = M) + mu <- seq(M) + + # Create the covariance matrix + sigma <- matrix(rho, ncol = M, nrow = M) # Old + for (i in seq(1, M - 1)) { + for (j in seq(i + 1, M)) { + sigma[i, j] <- sigma[j, i] <- rho^abs(i - j) + } + } + diag(sigma) <- 1 + + # Set seed for reproducibility + seed_setup <- 1996 + set.seed(seed_setup) + + # Make Gaussian data + data_train <- data.table(mvtnorm::rmvnorm(n = n_train, mean = mu, sigma = sigma)) + data_test <- data.table(mvtnorm::rmvnorm(n = n_test, mean = mu, sigma = sigma)) + colnames(data_train) <- paste("X", seq(M), sep = "") + colnames(data_test) <- paste("X", seq(M), sep = "") + + # Make the response + response_train <- as.vector(cbind(1, as.matrix(data_train)) %*% betas) + response_test <- as.vector(cbind(1, as.matrix(data_test)) %*% betas) + + # Put together the data + data_train_with_response <- copy(data_train)[, y := response_train] + data_test_with_response <- copy(data_test)[, y := response_test] + + # Fit a LM model + predictive_model <- lm(y ~ ., data = data_train_with_response) + + # Get the prediction zero, i.e., the phi0 Shapley value. + prediction_zero <- mean(response_train) + + model <- predictive_model + x_explain <- data_test + x_train <- data_train + keep_samp_for_vS <- FALSE + predict_model <- NULL + get_model_specs <- NULL + timing <- TRUE + n_combinations <- NULL + group <- NULL + feature_specs <- shapr:::get_feature_specs(get_model_specs, model) + n_batches <- 1 + seed <- 1 + + internal <- setup( + x_train = x_train, + x_explain = x_explain, + approach = approach, + prediction_zero = prediction_zero, + n_combinations = n_combinations, + group = group, + n_samples = n_samples, + n_batches = n_batches, + seed = seed, + feature_specs = feature_specs, + keep_samp_for_vS = keep_samp_for_vS, + predict_model = predict_model, + get_model_specs = get_model_specs, + timing = timing + ) + + # Gets predict_model (if not passed to explain) + predict_model <- shapr:::get_predict_model( + predict_model = predict_model, + model = model + ) + + # Sets up the Shapley (sampling) framework and prepares the + # conditional expectation computation for the chosen approach + # Note: model and predict_model are ONLY used by the AICc-methods of approach empirical to find optimal parameters + internal <- shapr:::setup_computation(internal, model, predict_model) +} + + +# Compare shapr compile and rcpp compile -------------------------------------------------------------------------- +look_at_coalitions <- seq(1, 2^M - 2) +index_features = internal$objects$S_batch$`1`[look_at_coalitions] +S <- internal$objects$S[index_features, , drop = FALSE] +feature_names <- internal$parameters$feature_names +n_explain <- internal$parameters$n_explain +n_samples <- internal$parameters$n_samples +n_features <- internal$parameters$n_features +n_combinations_now <- length(index_features) +x_train_mat <- as.matrix(internal$data$x_train) +x_explain_mat <- as.matrix(internal$data$x_explain) +copula.mu <- internal$parameters$copula.mu +copula.cov_mat <- internal$parameters$copula.cov_mat +copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian) + +# Generate the MC samples from N(0, 1) +MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + +# Use C++ to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}), for all coalitions and explicands, +# and then transforming them back to the original scale using the inverse Gaussian transform in C++. +# The object `dt` is a 3D array of dimension (n_samples, n_explain * n_coalitions, n_features). +time_1 = system.time({dt_1 <- prepare_data_copula_cpp( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + x_explain_gaussian_mat = copula.x_explain_gaussian_mat, + x_train_mat = x_train_mat, + S = S, + mu = copula.mu, + cov_mat = copula.cov_mat +)}) +time_1 + +time_2 = system.time({dt_2 <- shapr:::prepare_data_copula_cpp( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + x_explain_gaussian_mat = copula.x_explain_gaussian_mat, + x_train_mat = x_train_mat, + S = S, + mu = copula.mu, + cov_mat = copula.cov_mat +)}) +time_2 + +time_3 = system.time({dt_3 <- + .Call(`_shapr_prepare_data_copula_cpp`, + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + x_explain_gaussian_mat = copula.x_explain_gaussian_mat, + x_train_mat = x_train_mat, + S = S, + mu = copula.mu, + cov_mat = copula.cov_mat + )}) +time_3 + +Rcpp::sourceCpp("src/Copula.cpp") +time_4 = system.time({dt_4 <- prepare_data_copula_cpp( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + x_explain_gaussian_mat = copula.x_explain_gaussian_mat, + x_train_mat = x_train_mat, + S = S, + mu = copula.mu, + cov_mat = copula.cov_mat +)}) +time_4 + +time_5 = system.time({dt_5 <- shapr::prepare_data_copula_cpp( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + x_explain_gaussian_mat = copula.x_explain_gaussian_mat, + x_train_mat = x_train_mat, + S = S, + mu = copula.mu, + cov_mat = copula.cov_mat +)}) + +rbind(time_1, time_2, time_3, time_4, time_5) +all.equal(dt_1, dt_2) +all.equal(dt_2, dt_3) +all.equal(dt_3, dt_4) +all.equal(dt_4, dt_5) + +# Compare prepare_data.copula ---------------------------------------------------------------------------------------- +set.seed(123) + +# Recall that old version iterate over the observations and then the coalitions. +# While the new version iterate over the coalitions and then the observations. +# The latter lets us reuse the computed conditional distributions for all observations. +look_at_coalitions <- seq(1, 2^M - 2) +# look_at_coalitions <- seq(1, 2^M - 2, 10) +# look_at_coalitions <- seq(1, 2^M - 2, 25) + +# The old R code +time_only_R <- system.time({ + res_only_R <- prepare_data.copula_old( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions] + ) +}) +time_only_R + +# The C++ code with my own quantile function +time_only_cpp <- system.time({ + res_only_cpp <- prepare_data.copula( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions] + ) +}) +data.table::setorderv(res_only_cpp, c("id", "id_combination")) +time_only_cpp + +# The C++ code with my own quantile function +time_only_cpp_sourceCpp <- system.time({ + res_only_cpp_sourceCpp <- prepare_data.copula_sourceCpp( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions] + ) +}) +data.table::setorderv(res_only_cpp_sourceCpp, c("id", "id_combination")) +time_only_cpp_sourceCpp + +# The C++ code with quantile functions from arma +time_only_cpp_arma <- system.time({ + res_only_cpp_arma <- prepare_data.copula_cpp_arma( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions] + ) +}) +data.table::setorderv(res_only_cpp_arma, c("id", "id_combination")) +time_only_cpp_arma + +# The new C++ code with quantile from R +time_cpp_and_R <- system.time({ + res_cpp_and_R <- prepare_data.copula_cpp_and_R( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions] + ) +}) +data.table::setorderv(res_cpp_and_R, c("id", "id_combination")) +time_cpp_and_R + +# Create a table of the times. Less is better +times <- rbind( + time_only_R, + time_only_cpp, + time_only_cpp_sourceCpp, + time_only_cpp_arma, + time_cpp_and_R +) +times + +# TIMES for all coalitions (254), n_samples <- 1000, n_train <- 1000, n_test <- 20, M <- 8 + +# user.self sys.self elapsed user.child sys.child +# time_only_R 65.266 2.142 70.896 0 0 +# time_only_cpp 4.622 0.393 5.212 0 0 +# time_only_cpp_sourceCpp 1.823 0.423 2.279 0 0 +# time_only_cpp_arma 23.874 0.604 27.801 0 0 +# time_cpp_and_R 6.826 1.493 8.683 0 0 + +# Relative speedup of new method +times_relative <- t(sapply(seq_len(nrow(times)), function(idx) times[1, ] / times[idx, ])) +rownames(times_relative) <- paste0(rownames(times), "_rel") +times_relative + +# RELATIVE TIMES for all coalitions, n_samples <- 1000, n_train <- 1000, n_test <- 20, M <- 8 +# user.self sys.self elapsed user.child sys.child +# time_only_R_rel 1.0000 1.0000 1.0000 NaN NaN +# time_only_cpp_rel 14.1207 5.4504 13.6025 NaN NaN +# time_only_cpp_sourceCpp_rel 35.8014 5.0638 31.1084 NaN NaN +# time_only_cpp_arma_rel 2.7338 3.5464 2.5501 NaN NaN +# time_cpp_and_R_rel 9.5614 1.4347 8.1649 NaN NaN + +# Aggregate the MC sample values for each explicand and combination +res_only_R <- res_only_R[, w := NULL] +res_only_cpp <- res_only_cpp[, w := NULL] +res_only_cpp_sourceCpp <- res_only_cpp_sourceCpp[, w := NULL] +res_only_cpp_arma <- res_only_cpp_arma[, w := NULL] +res_cpp_and_R <- res_cpp_and_R[, w := NULL] +res_only_R_agr <- res_only_R[, lapply(.SD, mean), by = c("id", "id_combination")] +res_only_cpp_agr <- res_only_cpp[, lapply(.SD, mean), by = c("id", "id_combination")] +res_only_cpp_sourceCpp_agr <- res_only_cpp_sourceCpp[, lapply(.SD, mean), by = c("id", "id_combination")] +res_only_cpp_arma_agr <- res_only_cpp_arma[, lapply(.SD, mean), by = c("id", "id_combination")] +res_cpp_and_R_agr <- res_cpp_and_R[, lapply(.SD, mean), by = c("id", "id_combination")] + +# Difference +res_only_R_agr - res_only_cpp_agr +res_only_R_agr - res_only_cpp_sourceCpp_agr +res_only_R_agr - res_only_cpp_arma_agr +res_only_R_agr - res_cpp_and_R_agr + +# Max absolute difference +max(abs(res_only_R_agr - res_only_cpp_agr)) +max(abs(res_only_R_agr - res_only_cpp_sourceCpp_agr)) +max(abs(res_only_R_agr - res_only_cpp_arma_agr)) +max(abs(res_only_R_agr - res_cpp_and_R_agr)) + +# Max absolute relative difference +max(abs(res_only_R_agr - res_only_cpp_agr) / res_only_cpp_agr) +max(abs(res_only_R_agr - res_only_cpp_sourceCpp_agr) / res_only_cpp_sourceCpp_agr) +max(abs(res_only_R_agr - res_only_cpp_arma_agr) / res_only_cpp_arma_agr) +max(abs(res_only_R_agr - res_cpp_and_R_agr) / res_cpp_and_R_agr) + +# Compare gaussian_transform -------------------------------------------------------------------------------------- +set.seed(123) +x_temp_rows = 10000 +x_temp_cols = 20 +x_temp = matrix(rnorm(x_temp_rows*x_temp_cols), x_temp_rows, x_temp_cols) + +# Compare for equal values +system.time({gaussian_transform_R_res = apply(X = x_temp, MARGIN = 2, FUN = gaussian_transform_old)}) +system.time({gaussian_transform_cpp_res = gaussian_transform_cpp(x_temp)}) +system.time({gaussian_transform_cpp_sourceCpp_res = gaussian_transform_cpp_sourceCpp(x_temp)}) +all.equal(gaussian_transform_R_res, gaussian_transform_cpp_res) # TRUE +all.equal(gaussian_transform_R_res, gaussian_transform_cpp_sourceCpp_res) # TRUE + +# Compare time (generate new data each time such that the result is not stored in the cache) +rbenchmark::benchmark(R = apply(X = matrix(rnorm(x_temp_rows*x_temp_cols), x_temp_rows, x_temp_cols), + MARGIN = 2, + FUN = gaussian_transform_old), + cpp = gaussian_transform_cpp(matrix(rnorm(x_temp_rows*x_temp_cols), x_temp_rows, x_temp_cols)), + cpp_sourceCpp = gaussian_transform_cpp_sourceCpp(matrix(rnorm(x_temp_rows*x_temp_cols), + x_temp_rows, + x_temp_cols)), + replications = 100) +# test replications elapsed relative user.self sys.self user.child sys.child +# 2 cpp 100 7.604 1.987 7.059 0.294 0 0 +# 3 cpp_sourceCpp 100 3.827 1.000 3.529 0.254 0 0 +# 1 R 100 6.183 1.616 4.899 0.738 0 0 + + +# Compare gaussian_transform_separate ------------------------------------------------------------------------- +set.seed(123) +x_cols = 10 +x_train_rows = 50000 +x_explain_rows = 50000 +x_train_temp = matrix(rnorm(x_train_rows*x_cols), x_train_rows, x_cols) +x_explain_temp = matrix(rnorm(x_explain_rows*x_cols), x_explain_rows, x_cols) +x_explain_train_temp = rbind(x_explain_temp, x_train_temp) + +system.time({r = apply(X = rbind(x_explain_temp, x_train_temp), + MARGIN = 2, + FUN = gaussian_transform_separate_old, + n_y = nrow(x_explain_temp))}) +system.time({cpp = gaussian_transform_separate_cpp(x_explain_temp, x_train_temp)}) +system.time({cpp_sourceCpp = gaussian_transform_separate_cpp_sourceCpp(x_explain_temp, x_train_temp)}) +all.equal(r, cpp) +all.equal(r, cpp_sourceCpp) + +# Rcpp::sourceCpp("src/Copula.cpp") # C++ code is faster when I recompile it? I don't understand. +rbenchmark::benchmark(r = apply(X = rbind(x_explain_temp, x_train_temp), + MARGIN = 2, + FUN = gaussian_transform_separate_old, + n_y = nrow(x_explain_temp)), + cpp = gaussian_transform_separate_cpp(x_explain_temp, x_train_temp), + cpp_sourceCpp = gaussian_transform_separate_cpp_sourceCpp(x_explain_temp, x_train_temp), + replications = 20) +# test replications elapsed relative user.self sys.self user.child sys.child +# 2 cpp 20 10.933 2.352 10.082 0.179 0 0 +# 3 cpp_sourceCpp 20 4.648 1.000 4.389 0.100 0 0 +# 1 r 20 9.787 2.106 8.409 0.797 0 0 + +rbenchmark::benchmark(r = apply(X = rbind(matrix(rnorm(x_explain_rows*x_cols), x_explain_rows, x_cols), + matrix(rnorm(x_train_rows*x_cols), x_train_rows, x_cols)), + MARGIN = 2, + FUN = gaussian_transform_separate_old, + n_y = nrow(matrix(rnorm(x_explain_rows*x_cols), x_explain_rows, x_cols))), + cpp = gaussian_transform_separate_cpp(matrix(rnorm(x_explain_rows*x_cols), + x_explain_rows, + x_cols), + matrix(rnorm(x_train_rows*x_cols), x_train_rows, x_cols)), + cpp2 = .Call(`_shapr_gaussian_transform_separate_cpp`, + matrix(rnorm(x_explain_rows*x_cols), x_explain_rows, x_cols), + matrix(rnorm(x_train_rows*x_cols), x_train_rows, x_cols)), + cpp_cpp_sourceCpp = gaussian_transform_separate_cpp_sourceCpp(matrix(rnorm(x_explain_rows*x_cols), + x_explain_rows, + x_cols), + matrix(rnorm(x_train_rows*x_cols), x_train_rows, x_cols)), + replications = 20) + +# test replications elapsed relative user.self sys.self user.child sys.child +# 2 cpp 20 12.634 2.202 11.275 0.352 0.00 0.00 +# 4 cpp_cpp_sourceCpp 20 5.737 1.000 5.287 0.182 0.00 0.00 +# 3 cpp2 20 11.566 2.016 10.890 0.246 0.01 0.01 +# 1 r 20 11.937 2.081 10.232 1.027 0.00 0.00 + + + + + +# Simple C examples compile issues time -------------------------------------------------------------------------------- +sourceCpp( + code = ' +#include +// [[Rcpp::depends(RcppArmadillo)]] +// [[Rcpp::export]] +Rcpp::NumericVector addVectors(const Rcpp::NumericVector& vec1, const Rcpp::NumericVector& vec2) { + // Check if the input vectors are of the same length + if (vec1.size() != vec2.size()) { + Rcpp::stop("Vectors must be of the same length."); + } + + // Create a result vector of the same length as the input vectors + Rcpp::NumericVector result(vec1.size()); + + // Perform element-wise addition + for (int i = 0; i < vec1.size(); ++i) { + result[i] = vec1[i] + vec2[i]; + } + + return result; +} + +// [[Rcpp::export]] +Rcpp::NumericMatrix addMatrices(const Rcpp::NumericMatrix& mat1, const Rcpp::NumericMatrix& mat2) { + // Check if the input matrices have the same dimensions + if (mat1.nrow() != mat2.nrow() || mat1.ncol() != mat2.ncol()) { + Rcpp::stop("Matrices must have the same dimensions."); + } + + // Create a result matrix of the same dimensions as the input matrices + Rcpp::NumericMatrix result(mat1.nrow(), mat1.ncol()); + + // Perform element-wise addition + for (int i = 0; i < mat1.nrow(); ++i) { + for (int j = 0; j < mat1.ncol(); ++j) { + result(i, j) = mat1(i, j) + mat2(i, j); + } + } + + return result; +} + +// [[Rcpp::export]] +arma::mat addMatricesArmadillo(const arma::mat& mat1, const arma::mat& mat2) { + // Check if the input matrices have the same dimensions + if (mat1.n_rows != mat2.n_rows || mat1.n_cols != mat2.n_cols) { + Rcpp::stop("Matrices must have the same dimensions."); + } + + // Perform element-wise addition using Armadillo + arma::mat result = mat1 + mat2; + + return result; +}') + + +# !!!!!READ!!!!! +# Copy the code above into `src/Copula.cpp` and then build the package with +devtools::load_all(".") + +# Dimension of matrix +n = 1000000 +m = 100 + +# Make matrices +mat1 = matrix(rnorm(n*m), n, m) +mat2 = matrix(rnorm(n*m), n, m) + +# Time when using the compiled code using `devtools::load_all()` +shapr_vec_time = system.time({shapr_vec_res = addVectors(mat1[,1], mat2[,1])}) +shapr_mat_rcpp_time = system.time({shapr_mat_rcpp_res <- addMatrices(mat1, mat2)}) +shapr_mat_arma_time = system.time({shapr_mat_arma_res <- addMatricesArmadillo(mat1, mat2)}) + +# Then we compile with `Rcpp::compileAttributes()` +Rcpp::compileAttributes(pkgdir = ".", verbose = TRUE) +compileAttributes_vec_time = system.time({compileAttributes_vec_res = addVectors(mat1[,1], mat2[,1])}) +compileAttributes_mat_rcpp_time = system.time({compileAttributes_mat_rcpp_res <- addMatrices(mat1, mat2)}) +compileAttributes_mat_arma_time = system.time({compileAttributes_mat_arma_res <- addMatricesArmadillo(mat1, mat2)}) + +# Then we compile with `Rcpp::sourceCpp()` +# Here a shared library is built +Rcpp::sourceCpp("src/Copula.cpp", verbose = TRUE) +sourceCpp_vec_time = system.time({sourceCpp_vec_res = addVectors(mat1[,1], mat2[,1])}) +sourceCpp_mat_rcpp_time = system.time({sourceCpp_mat_rcpp_res <- addMatrices(mat1, mat2)}) +sourceCpp_mat_arma_time = system.time({sourceCpp_mat_arma_res <- addMatricesArmadillo(mat1, mat2)}) + +# Look at the times. See a drastic decrease when using sourceCpp. Half on my mac +rbind(shapr_vec_time, + compileAttributes_vec_time, + sourceCpp_vec_time) +rbind(shapr_mat_rcpp_time, + compileAttributes_mat_rcpp_time, + sourceCpp_mat_rcpp_time) +rbind(shapr_mat_arma_time, + compileAttributes_mat_arma_time, + sourceCpp_mat_arma_time) + +# All equal +all.equal(shapr_vec_res, compileAttributes_vec_res) +all.equal(shapr_vec_res, sourceCpp_vec_res) + +all.equal(shapr_mat_rcpp_res, compileAttributes_mat_rcpp_res) +all.equal(shapr_mat_rcpp_res, sourceCpp_mat_rcpp_res) + +all.equal(shapr_mat_arma_res, compileAttributes_mat_arma_res) +all.equal(shapr_mat_arma_res, sourceCpp_mat_arma_res) + + + + +# Large n_samples equal results ---------------------------------------------------------------------------------------- +{ + n_samples <- 1000000 + n_train <- 1000 + n_test <- 5 + M <- 4 + rho <- 0.5 + betas <- c(0, rep(1, M)) + + # We use the Gaussian copula approach + approach <- "copula" + + # Mean of the multivariate Gaussian distribution + mu <- rep(0, times = M) + mu <- seq(M) + + # Create the covariance matrix + sigma <- matrix(rho, ncol = M, nrow = M) # Old + for (i in seq(1, M - 1)) { + for (j in seq(i + 1, M)) { + sigma[i, j] <- sigma[j, i] <- rho^abs(i - j) + } + } + diag(sigma) <- 1 + + # Set seed for reproducibility + seed_setup <- 1996 + set.seed(seed_setup) + + # Make Gaussian data + data_train <- data.table(mvtnorm::rmvnorm(n = n_train, mean = mu, sigma = sigma)) + data_test <- data.table(mvtnorm::rmvnorm(n = n_test, mean = mu, sigma = sigma)) + colnames(data_train) <- paste("X", seq(M), sep = "") + colnames(data_test) <- paste("X", seq(M), sep = "") + + # Make the response + response_train <- as.vector(cbind(1, as.matrix(data_train)) %*% betas) + response_test <- as.vector(cbind(1, as.matrix(data_test)) %*% betas) + + # Put together the data + data_train_with_response <- copy(data_train)[, y := response_train] + data_test_with_response <- copy(data_test)[, y := response_test] + + # Fit a LM model + predictive_model <- lm(y ~ ., data = data_train_with_response) + + # Get the prediction zero, i.e., the phi0 Shapley value. + prediction_zero <- mean(response_train) + + model <- predictive_model + x_explain <- data_test + x_train <- data_train + keep_samp_for_vS <- FALSE + predict_model <- NULL + get_model_specs <- NULL + timing <- TRUE + n_combinations <- NULL + group <- NULL + feature_specs <- shapr:::get_feature_specs(get_model_specs, model) + n_batches <- 1 + seed <- 1 + + internal <- setup( + x_train = x_train, + x_explain = x_explain, + approach = approach, + prediction_zero = prediction_zero, + n_combinations = n_combinations, + group = group, + n_samples = n_samples, + n_batches = n_batches, + seed = seed, + feature_specs = feature_specs, + keep_samp_for_vS = keep_samp_for_vS, + predict_model = predict_model, + get_model_specs = get_model_specs, + timing = timing + ) + + # Gets predict_model (if not passed to explain) + predict_model <- shapr:::get_predict_model( + predict_model = predict_model, + model = model + ) + + # Sets up the Shapley (sampling) framework and prepares the + # conditional expectation computation for the chosen approach + # Note: model and predict_model are ONLY used by the AICc-methods of approach empirical to find optimal parameters + internal <- shapr:::setup_computation(internal, model, predict_model) +} + +look_at_coalitions <- seq(1, 2^M - 2) +# look_at_coalitions <- seq(1, 2^M - 2, 10) +# look_at_coalitions <- seq(1, 2^M - 2, 25) + +# The old R code +time_only_R <- system.time({ + res_only_R <- prepare_data.copula_old( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions] + ) +}) +time_only_R + +# The C++ code with my own quantile function +time_only_cpp <- system.time({ + res_only_cpp <- prepare_data.copula( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions] + ) +}) +data.table::setorderv(res_only_cpp, c("id", "id_combination")) +time_only_cpp + +# The C++ code with my own quantile function +time_only_cpp_sourceCpp <- system.time({ + res_only_cpp_sourceCpp <- prepare_data.copula_sourceCpp( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions] + ) +}) +data.table::setorderv(res_only_cpp_sourceCpp, c("id", "id_combination")) +time_only_cpp_sourceCpp + +# Look at the differences +# Aggregate the MC sample values for each explicand and combination +# res_only_R <- res_only_R[, w := NULL] +# res_only_cpp <- res_only_cpp[, w := NULL] +# res_only_cpp_sourceCpp <- res_only_cpp_sourceCpp[, w := NULL] +res_only_R_agr <- res_only_R[, lapply(.SD, mean), by = c("id", "id_combination")] +res_only_cpp_agr <- res_only_cpp[, lapply(.SD, mean), by = c("id", "id_combination")] +res_only_cpp_sourceCpp_agr <- res_only_cpp_sourceCpp[, lapply(.SD, mean), by = c("id", "id_combination")] + +# Difference +res_only_R_agr - res_only_cpp_agr +res_only_R_agr - res_only_cpp_sourceCpp_agr + +# Max absolute difference +max(abs(res_only_R_agr - res_only_cpp_agr)) +max(abs(res_only_R_agr - res_only_cpp_sourceCpp_agr)) + +# Look at the difference in Shapley values +temp_shapley_value_func = function(dt, internal, model, predict_model) { + compute_preds( + dt, # Updating dt by reference + feature_names = internal$parameters$feature_names, + predict_model = predict_model, + model = model, + pred_cols = paste0("p_hat", seq_len(internal$parameters$output_size)), + type = internal$parameters$type, + horizon = internal$parameters$horizon, + n_endo = internal$data$n_endo, + explain_idx = internal$parameters$explain_idx, + explain_lags = internal$parameters$explain_lags, + y = internal$data$y, + xreg = internal$data$xreg + ) + dt_vS2 <- compute_MCint(dt, paste0("p_hat", seq_len(internal$parameters$output_size))) + dt_vS <- rbind(t(as.matrix(c(1, rep(prediction_zero, n_test)))), dt_vS2, t(as.matrix(c(2^M, response_test))), + use.names = FALSE) + colnames(dt_vS) = colnames(dt_vS2) + compute_shapley_new(internal, dt_vS) +} + +# Compute the Shapley values +res_shapley_R = temp_shapley_value_func(data.table::copy(res_only_R), internal, model, predict_model) +res_shapley_cpp = temp_shapley_value_func(data.table::copy(res_only_cpp), internal, model, predict_model) +res_shapley_cpp_sourceCpp = temp_shapley_value_func(data.table::copy(res_only_cpp_sourceCpp), + internal, + model, + predict_model) +# Look at the difference +abs(res_shapley_R - res_shapley_cpp) +abs(res_shapley_R - res_shapley_cpp_sourceCpp) +max(abs(res_shapley_R - res_shapley_cpp)) +max(abs(res_shapley_R - res_shapley_cpp_sourceCpp)) + +# When n_samples <- 1000000, n_train <- 1000, n_test <- 5, M <- 4 +# > abs(res_shapley_R - res_shapley_cpp) +# none X1 X2 X3 X4 +# 1: 7.2140e-11 0.00056643 0.00109848 9.5478e-05 0.00043657 +# 2: 4.3903e-10 0.00179695 0.00163158 1.8549e-03 0.00202031 +# 3: 9.3072e-11 0.00142949 0.00087037 1.2457e-03 0.00180482 +# 4: 5.1367e-11 0.00079767 0.00099899 7.2505e-04 0.00052373 +# 5: 3.8260e-10 0.00032232 0.00046644 1.1651e-03 0.00102102 +# > abs(res_shapley_R - res_shapley_cpp_sourceCpp) +# none X1 X2 X3 X4 +# 1: 3.1773e-10 0.00061369 0.00096567 0.00139486 0.00174684 +# 2: 2.1354e-10 0.00164283 0.00139693 0.00051290 0.00075879 +# 3: 1.2370e-10 0.00143125 0.00066145 0.00021455 0.00055524 +# 4: 2.0396e-10 0.00090834 0.00091129 0.00077478 0.00077773 +# 5: 1.3627e-10 0.00038308 0.00033615 0.00031426 0.00026733 +# > max(abs(res_shapley_R - res_shapley_cpp)) +# [1] 0.0020203 +# > max(abs(res_shapley_R - res_shapley_cpp_sourceCpp)) +# [1] 0.0017468 diff --git a/inst/scripts/compare_gaussian_in_R_and_C++.R b/inst/scripts/compare_gaussian_in_R_and_C++.R new file mode 100644 index 000000000..b9ca398aa --- /dev/null +++ b/inst/scripts/compare_gaussian_in_R_and_C++.R @@ -0,0 +1,2735 @@ +# Libraries ------------------------------------------------------------------------------------------------------- +# library(shapr) +# library(rbenchmark) +library(data.table) + + + +# Other functions ------------------------------------------------------------------------------------------------- +#' Sample conditional Gaussian variables +#' +#' @inheritParams sample_copula +#' +#' @return data.table +#' +#' @keywords internal +#' +#' @author Martin Jullum +sample_gaussian <- function(index_given, n_samples, mu, cov_mat, m, x_explain) { + # Check input + stopifnot(is.matrix(x_explain)) + + # Handles the unconditional and full conditional separtely when predicting + cnms <- colnames(x_explain) + if (length(index_given) %in% c(0, m)) { + return(data.table::as.data.table(x_explain)) + } + + dependent_ind <- seq_along(mu)[-index_given] + x_explain_gaussian <- x_explain[index_given] + tmp <- condMVNorm::condMVN( + mean = mu, + sigma = cov_mat, + dependent.ind = dependent_ind, + given.ind = index_given, + X.given = x_explain_gaussian + ) + + # Makes the conditional covariance matrix symmetric in the rare case where numerical instability made it unsymmetric + if (!isSymmetric(tmp[["condVar"]])) { + tmp[["condVar"]] <- Matrix::symmpart(tmp$condVar) + } + + ret0 <- mvnfast::rmvn(n = n_samples, mu = tmp$condMean, sigma = tmp$condVar) + + ret <- matrix(NA, ncol = m, nrow = n_samples) + ret[, index_given] <- rep(x_explain_gaussian, each = n_samples) + ret[, dependent_ind] <- ret0 + + colnames(ret) <- cnms + return(as.data.table(ret)) +} + + +# Cpp functions --------------------------------------------------------------------------------------------------- +# #include +# #include +# using namespace Rcpp; +# +# +# //' Generate Gaussian MC samples +# //' +# //' @param MC_samples_mat matrix. Matrix of dimension `n_samples` times `n_features` containing samples from the +# //' univariate standard normal. +# //' @param x_explain_mat matrix. Matrix of dimension `n_explain` times `n_features` containing the observations +# //' to explain. +# //' @param S matrix. Matrix of dimension `n_combinations` times `n_features` containing binary representations of +# //' the used coalitions. +# //' @param mu vector. Vector of length `n_features` containing the mean of each feature. +# //' @param cov_mat mat. Matrix of dimension `n_features` times `n_features` containing the pariwise covariance between +# //' all features. +# //' +# //' @export +# //' @keywords internal +# //' +# //' @return List of length `n_combinations`*`n_samples`, where each entry is a matrix of dimension `n_samples` times +# //' `n_features` containing the conditional MC samples for each coalition and explicand. +# //' @author Lars Henry Berge Olsen +# // [[Rcpp::export]] +# Rcpp::List prepare_data_gaussian_cpp(arma::mat MC_samples_mat, +# arma::mat x_explain_mat, +# arma::mat S, +# arma::vec mu, +# arma::mat cov_mat) { +# int n_explain = x_explain_mat.n_rows; +# int n_samples = MC_samples_mat.n_rows; +# int n_features = MC_samples_mat.n_cols; +# +# // Pre-allocate result matrix +# arma::mat ret(n_samples, n_features); +# +# // Create a list containing the MC samples for all coalitions and test observations +# Rcpp::List result_list; +# +# // Iterate over the coalitions +# for (int S_ind = 0; S_ind < S.n_rows; S_ind++) { +# +# // TODO: REMOVE IN FINAL VERSION Small printout +# Rcpp::Rcout << S_ind + 1 << ","; +# +# // Get current coalition S and the indices of the features in coalition S and mask Sbar +# arma::mat S_now = S.row(S_ind); +# arma::uvec S_now_idx = arma::find(S_now > 0.5); // må finnes en bedre løsning her +# arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); +# +# // Extract the features we condition on +# arma::mat x_S_star = x_explain_mat.cols(S_now_idx); +# +# // Extract the mean values for the features in the two sets +# arma::vec mu_S = mu.elem(S_now_idx); +# arma::vec mu_Sbar = mu.elem(Sbar_now_idx); +# +# // Extract the relevant parts of the covariance matrix +# arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); +# arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); +# arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); +# arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); +# +# // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix +# arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); +# arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; +# +# // Ensure that the conditional covariance matrix is symmetric +# if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { +# cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); +# } +# +# // Compute the conditional mean of Xsbar given Xs = Xs_star +# arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star.each_row() - mu_S.t()).t(); // Can we speed it up by reducing the number of transposes? +# x_Sbar_mean.each_col() += mu_Sbar; +# +# // Transform the samples to be from N(O, Sigma_Sbar|S) +# arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); +# +# // Loop over the different test observations and combine the generated values with the values we conditioned on +# for (int idx_now = 0; idx_now < n_explain; idx_now++) { +# ret.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); // can using .fill() speed this up? +# ret.cols(Sbar_now_idx) = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1); +# result_list.push_back(ret); +# } +# } +# +# return result_list; +# } +# +# // [[Rcpp::export]] +# Rcpp::List prepare_data_gaussian_cpp_with_wrap(arma::mat MC_samples_mat, +# arma::mat x_explain_mat, +# arma::mat S, +# arma::vec mu, +# arma::mat cov_mat) { +# int n_explain = x_explain_mat.n_rows; +# int n_samples = MC_samples_mat.n_rows; +# int n_features = MC_samples_mat.n_cols; +# +# // Pre-allocate result matrix +# arma::mat ret(n_samples, n_features); +# +# // Create a list containing the MC samples for all coalitions and test observations +# Rcpp::List result_list; +# +# // Iterate over the coalitions +# for (int S_ind = 0; S_ind < S.n_rows; S_ind++) { +# +# // TODO: REMOVE IN FINAL VERSION Small printout +# Rcpp::Rcout << S_ind + 1 << ","; +# +# // Get current coalition S and the indices of the features in coalition S and mask Sbar +# arma::mat S_now = S.row(S_ind); +# arma::uvec S_now_idx = arma::find(S_now > 0.5); // må finnes en bedre løsning her +# arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); +# +# // Extract the features we condition on +# arma::mat x_S_star = x_explain_mat.cols(S_now_idx); +# +# // Extract the mean values for the features in the two sets +# arma::vec mu_S = mu.elem(S_now_idx); +# arma::vec mu_Sbar = mu.elem(Sbar_now_idx); +# +# // Extract the relevant parts of the covariance matrix +# arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); +# arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); +# arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); +# arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); +# +# // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix +# arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); +# arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; +# +# // Ensure that the conditional covariance matrix is symmetric +# if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { +# cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); +# } +# +# // Compute the conditional mean of Xsbar given Xs = Xs_star +# arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star.each_row() - mu_S.t()).t(); // Can we speed it up by reducing the number of transposes? +# x_Sbar_mean.each_col() += mu_Sbar; +# +# // Transform the samples to be from N(O, Sigma_Sbar|S) +# arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); +# +# // Loop over the different test observations and combine the generated values with the values we conditioned on +# for (int idx_now = 0; idx_now < n_explain; idx_now++) { +# ret.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); // can using .fill() speed this up? +# ret.cols(Sbar_now_idx) = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1); +# result_list.push_back(Rcpp::wrap(ret)); +# } +# } +# +# return result_list; +# } +# +# // [[Rcpp::export]] +# Rcpp::List prepare_data_gaussian_cpp_v2(arma::mat MC_samples_mat, +# arma::mat x_explain_mat, +# arma::mat S, +# arma::vec mu, +# arma::mat cov_mat) { +# int n_explain = x_explain_mat.n_rows; +# int n_samples = MC_samples_mat.n_rows; +# int n_features = MC_samples_mat.n_cols; +# +# // Create a list containing the MC samples for all coalitions and test observations +# Rcpp::List result_list; +# +# // Iterate over the coalitions +# for (int S_ind = 0; S_ind < S.n_rows; S_ind++) { +# +# // TODO: REMOVE IN FINAL VERSION Small printout +# Rcpp::Rcout << S_ind + 1 << ","; +# +# // Get current coalition S and the indices of the features in coalition S and mask Sbar +# arma::mat S_now = S.row(S_ind); +# arma::uvec S_now_idx = arma::find(S_now > 0.5); +# arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); +# +# // Extract the features we condition on +# arma::mat x_S_star = x_explain_mat.cols(S_now_idx); +# +# // Extract the mean values for the features in the two sets +# arma::vec mu_S = mu.elem(S_now_idx); +# arma::vec mu_Sbar = mu.elem(Sbar_now_idx); +# +# // Extract the relevant parts of the covariance matrix +# arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); +# arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); +# arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); +# arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); +# +# // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix +# arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); +# arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; +# +# +# // Ensure that the conditional covariance matrix is symmetric +# if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { +# cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); +# } +# +# // Compute the conditional mean of Xsbar given Xs = Xs_star +# arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star.each_row() - mu_S.t()).t(); // Can we speed it up by reducing the number of transposes? +# x_Sbar_mean.each_col() += mu_Sbar; +# +# // Transform the samples to be from N(O, Sigma_Sbar|S) +# arma::mat MC_samples_mat_now = trans(MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S)); +# +# // Loop over the different test observations and Combine the generated values with the values we conditioned on +# for (int idx_now = 0; idx_now < n_explain; idx_now++) { +# arma::mat ret(n_samples, n_features); +# ret.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); +# ret.cols(Sbar_now_idx) = trans(MC_samples_mat_now + repmat(x_Sbar_mean.col(idx_now), 1, n_samples)); +# result_list.push_back(ret); +# } +# } +# +# return result_list; +# } +# +# // [[Rcpp::export]] +# arma::mat prepare_data_gaussian_cpp_fix_large_mat(arma::mat MC_samples_mat, +# arma::mat x_explain_mat, +# arma::mat S, +# arma::vec mu, +# arma::mat cov_mat) { +# int n_explain = x_explain_mat.n_rows; +# int n_samples = MC_samples_mat.n_rows; +# int n_features = MC_samples_mat.n_cols; +# int n_coalitions = S.n_rows; +# +# // Pre-allocate result matrix +# arma::mat return_mat(n_coalitions*n_explain*n_samples, n_features); +# +# // Create a list containing the MC samples for all coalitions and test observations +# std::list result_list; +# // Rcpp::List result_list; +# +# // Iterate over the coalitions +# for (int S_ind = 0; S_ind < n_coalitions; S_ind++) { +# +# // TODO: REMOVE IN FINAL VERSION Small printout +# Rcpp::Rcout << S_ind + 1 << ","; +# +# // Get current coalition S and the indices of the features in coalition S and mask Sbar +# arma::mat S_now = S.row(S_ind); +# arma::uvec S_now_idx = arma::find(S_now > 0.5); // må finnes en bedre løsning her +# arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); +# +# // Extract the features we condition on +# arma::mat x_S_star = x_explain_mat.cols(S_now_idx); +# +# // Extract the mean values for the features in the two sets +# arma::vec mu_S = mu.elem(S_now_idx); +# arma::vec mu_Sbar = mu.elem(Sbar_now_idx); +# +# // Extract the relevant parts of the covariance matrix +# arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); +# arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); +# arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); +# arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); +# +# // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix +# arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); +# arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; +# +# // Ensure that the conditional covariance matrix is symmetric +# if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { +# cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); +# } +# +# // Compute the conditional mean of Xsbar given Xs = Xs_star +# arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star.each_row() - mu_S.t()).t(); // Can we speed it up by reducing the number of transposes? +# x_Sbar_mean.each_col() += mu_Sbar; +# +# // Transform the samples to be from N(O, Sigma_Sbar|S) +# arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); +# +# // Loop over the different test observations and combine the generated values with the values we conditioned on +# for (int idx_now = 0; idx_now < n_explain; idx_now++) { +# // Maybe faster to create vector 0:(n_samples - 1) and then just add n_samples in each loop. +# arma::uvec row_indices_now = arma::linspace(S_ind*n_explain*n_samples + idx_now*n_samples, +# S_ind*n_explain*n_samples + idx_now*n_samples + n_samples - 1, +# n_samples); +# +# return_mat.submat(row_indices_now, S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); +# return_mat.submat(row_indices_now, Sbar_now_idx) = +# MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1); +# } +# } +# +# return return_mat; +# } +# +# // Diff in v2 is where we do the transpose +# // [[Rcpp::export]] +# arma::mat prepare_data_gaussian_cpp_fix_large_mat_v2(arma::mat MC_samples_mat, +# arma::mat x_explain_mat, +# arma::mat S, +# arma::vec mu, +# arma::mat cov_mat) { +# int n_explain = x_explain_mat.n_rows; +# int n_samples = MC_samples_mat.n_rows; +# int n_features = MC_samples_mat.n_cols; +# int n_coalitions = S.n_rows; +# +# // Pre-allocate result matrix +# arma::mat return_mat(n_coalitions*n_explain*n_samples, n_features); +# +# // Create a list containing the MC samples for all coalitions and test observations +# std::list result_list; +# // Rcpp::List result_list; +# +# // Iterate over the coalitions +# for (int S_ind = 0; S_ind < n_coalitions; S_ind++) { +# +# // TODO: REMOVE IN FINAL VERSION Small printout +# Rcpp::Rcout << S_ind + 1 << ","; +# +# // Get current coalition S and the indices of the features in coalition S and mask Sbar +# arma::mat S_now = S.row(S_ind); +# arma::uvec S_now_idx = arma::find(S_now > 0.5); // må finnes en bedre løsning her +# arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); +# +# // Extract the features we condition on +# arma::mat x_S_star = x_explain_mat.cols(S_now_idx); +# +# // Extract the mean values for the features in the two sets +# arma::vec mu_S = mu.elem(S_now_idx); +# arma::vec mu_Sbar = mu.elem(Sbar_now_idx); +# +# // Extract the relevant parts of the covariance matrix +# arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); +# arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); +# arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); +# arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); +# +# // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix +# arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); +# arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; +# +# // Ensure that the conditional covariance matrix is symmetric +# if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { +# cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); +# } +# +# // Compute the conditional mean of Xsbar given Xs = Xs_star +# arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star.each_row() - mu_S.t()).t(); // Can we speed it up by reducing the number of transposes? +# x_Sbar_mean.each_col() += mu_Sbar; +# +# // Transform the samples to be from N(O, Sigma_Sbar|S) +# arma::mat MC_samples_mat_now = trans(MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S)); +# +# // Loop over the different test observations and combine the generated values with the values we conditioned on +# for (int idx_now = 0; idx_now < n_explain; idx_now++) { +# // Maybe faster to create vector 0:(n_samples - 1) and then just add n_samples in each loop. +# arma::uvec row_indices_now = arma::linspace(S_ind*n_explain*n_samples + idx_now*n_samples, +# S_ind*n_explain*n_samples + idx_now*n_samples + n_samples - 1, +# n_samples); +# +# return_mat.submat(row_indices_now, S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); +# return_mat.submat(row_indices_now, Sbar_now_idx) = +# trans(MC_samples_mat_now + repmat(x_Sbar_mean.col(idx_now), 1, n_samples)); +# } +# } +# +# return return_mat; +# } +# +# // [[Rcpp::export]] +# arma::cube prepare_data_gaussian_cpp_fix_cube(arma::mat MC_samples_mat, +# arma::mat x_explain_mat, +# arma::mat S, +# arma::vec mu, +# arma::mat cov_mat) { +# int n_explain = x_explain_mat.n_rows; +# int n_samples = MC_samples_mat.n_rows; +# int n_features = MC_samples_mat.n_cols; +# int n_coalitions = S.n_rows; +# +# // Pre-allocate result matrix +# arma::mat aux_mat(n_samples, n_features); +# arma::cube result_cube(n_samples, n_features, n_explain*n_coalitions); +# +# // Iterate over the coalitions +# for (int S_ind = 0; S_ind < n_coalitions; S_ind++) { +# +# // TODO: REMOVE IN FINAL VERSION Small printout +# Rcpp::Rcout << S_ind + 1 << ","; +# +# // Get current coalition S and the indices of the features in coalition S and mask Sbar +# arma::mat S_now = S.row(S_ind); +# arma::uvec S_now_idx = arma::find(S_now > 0.5); // må finnes en bedre løsning her +# arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); +# +# // Extract the features we condition on +# arma::mat x_S_star = x_explain_mat.cols(S_now_idx); +# +# // Extract the mean values for the features in the two sets +# arma::vec mu_S = mu.elem(S_now_idx); +# arma::vec mu_Sbar = mu.elem(Sbar_now_idx); +# +# // Extract the relevant parts of the covariance matrix +# arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); +# arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); +# arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); +# arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); +# +# // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix +# arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); +# arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; +# +# // Ensure that the conditional covariance matrix is symmetric +# if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { +# cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); +# } +# +# // Compute the conditional mean of Xsbar given Xs = Xs_star +# arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star.each_row() - mu_S.t()).t(); // Can we speed it up by reducing the number of transposes? +# x_Sbar_mean.each_col() += mu_Sbar; +# +# // Transform the samples to be from N(O, Sigma_Sbar|S) +# arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); +# +# // Loop over the different test observations and combine the generated values with the values we conditioned on +# for (int idx_now = 0; idx_now < n_explain; idx_now++) { +# aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); // can using .fill() speed this up? +# aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1); +# result_cube.slice(S_ind*n_explain + idx_now) = aux_mat; +# } +# } +# +# return result_cube; +# } +# +# // [[Rcpp::export]] +# arma::cube prepare_data_gaussian_cpp_fix_cube_v2(arma::mat MC_samples_mat, +# arma::mat x_explain_mat, +# arma::mat S, +# arma::vec mu, +# arma::mat cov_mat) { +# int n_explain = x_explain_mat.n_rows; +# int n_samples = MC_samples_mat.n_rows; +# int n_features = MC_samples_mat.n_cols; +# int n_coalitions = S.n_rows; +# +# // Pre-allocate result matrix +# arma::mat aux_mat(n_samples, n_features); +# arma::cube result_cube(n_samples, n_explain*n_coalitions, n_features); +# +# // Iterate over the coalitions +# for (int S_ind = 0; S_ind < n_coalitions; S_ind++) { +# +# // TODO: REMOVE IN FINAL VERSION Small printout +# Rcpp::Rcout << S_ind + 1 << ","; +# +# // Get current coalition S and the indices of the features in coalition S and mask Sbar +# arma::mat S_now = S.row(S_ind); +# arma::uvec S_now_idx = arma::find(S_now > 0.5); // må finnes en bedre løsning her +# arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); +# +# // Extract the features we condition on +# arma::mat x_S_star = x_explain_mat.cols(S_now_idx); +# +# // Extract the mean values for the features in the two sets +# arma::vec mu_S = mu.elem(S_now_idx); +# arma::vec mu_Sbar = mu.elem(Sbar_now_idx); +# +# // Extract the relevant parts of the covariance matrix +# arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); +# arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); +# arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); +# arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); +# +# // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix +# arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); +# arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; +# +# // Ensure that the conditional covariance matrix is symmetric +# if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { +# cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); +# } +# +# // Compute the conditional mean of Xsbar given Xs = Xs_star +# arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star.each_row() - mu_S.t()).t(); // Can we speed it up by reducing the number of transposes? +# x_Sbar_mean.each_col() += mu_Sbar; +# +# // Transform the samples to be from N(O, Sigma_Sbar|S) +# arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); +# +# // Loop over the different test observations and combine the generated values with the values we conditioned on +# for (int idx_now = 0; idx_now < n_explain; idx_now++) { +# aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); // can using .fill() speed this up? +# aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1); +# result_cube.col(S_ind*n_explain + idx_now) = aux_mat; +# } +# } +# +# return result_cube; +# } +# +# // [[Rcpp::export]] +# Rcpp::List prepare_data_gaussian_cpp_fix_list_of_lists_of_matrices(arma::mat MC_samples_mat, +# arma::mat x_explain_mat, +# arma::mat S, +# arma::vec mu, +# arma::mat cov_mat) { +# int n_explain = x_explain_mat.n_rows; +# int n_samples = MC_samples_mat.n_rows; +# int n_features = MC_samples_mat.n_cols; +# +# // Pre-allocate result matrix +# arma::mat aux_mat(n_samples, n_features); +# +# // Create a list containing lists that contian the MC samples for all coalitions and test observations in each matrix +# Rcpp::List result_list(S.n_rows); +# +# // Iterate over the coalitions +# for (int S_ind = 0; S_ind < S.n_rows; S_ind++) { +# +# Rcpp::List result_list_now(n_explain); +# +# // TODO: REMOVE IN FINAL VERSION Small printout +# Rcpp::Rcout << S_ind + 1 << ","; +# +# // Get current coalition S and the indices of the features in coalition S and mask Sbar +# arma::mat S_now = S.row(S_ind); +# arma::uvec S_now_idx = arma::find(S_now > 0.5); // må finnes en bedre løsning her +# arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); +# +# // Extract the features we condition on +# arma::mat x_S_star = x_explain_mat.cols(S_now_idx); +# +# // Extract the mean values for the features in the two sets +# arma::vec mu_S = mu.elem(S_now_idx); +# arma::vec mu_Sbar = mu.elem(Sbar_now_idx); +# +# // Extract the relevant parts of the covariance matrix +# arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); +# arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); +# arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); +# arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); +# +# // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix +# arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); +# arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; +# +# // Ensure that the conditional covariance matrix is symmetric +# if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { +# cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); +# } +# +# // Compute the conditional mean of Xsbar given Xs = Xs_star +# arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star.each_row() - mu_S.t()).t(); // Can we speed it up by reducing the number of transposes? +# x_Sbar_mean.each_col() += mu_Sbar; +# +# // Transform the samples to be from N(O, Sigma_Sbar|S) +# arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); +# +# // Loop over the different test observations and combine the generated values with the values we conditioned on +# for (int idx_now = 0; idx_now < n_explain; idx_now++) { +# aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); // can using .fill() speed this up? +# aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1); +# result_list_now[idx_now] = aux_mat; +# } +# result_list[S_ind] = result_list_now; +# } +# +# return result_list; +# } +# +# // [[Rcpp::export]] +# std::list prepare_data_gaussian_cpp_fix_std_list(arma::mat MC_samples_mat, +# arma::mat x_explain_mat, +# arma::mat S, +# arma::vec mu, +# arma::mat cov_mat) { +# int n_explain = x_explain_mat.n_rows; +# int n_samples = MC_samples_mat.n_rows; +# int n_features = MC_samples_mat.n_cols; +# +# // Pre-allocate result matrix +# arma::mat aux_mat(n_samples, n_features); +# +# // Create a list containing the MC samples for all coalitions and test observations +# std::list result_list; +# +# // Iterate over the coalitions +# for (int S_ind = 0; S_ind < S.n_rows; S_ind++) { +# +# // TODO: REMOVE IN FINAL VERSION Small printout +# Rcpp::Rcout << S_ind + 1 << ","; +# +# // Get current coalition S and the indices of the features in coalition S and mask Sbar +# arma::mat S_now = S.row(S_ind); +# arma::uvec S_now_idx = arma::find(S_now > 0.5); // må finnes en bedre løsning her +# arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); +# +# // Extract the features we condition on +# arma::mat x_S_star = x_explain_mat.cols(S_now_idx); +# +# // Extract the mean values for the features in the two sets +# arma::vec mu_S = mu.elem(S_now_idx); +# arma::vec mu_Sbar = mu.elem(Sbar_now_idx); +# +# // Extract the relevant parts of the covariance matrix +# arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); +# arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); +# arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); +# arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); +# +# // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix +# arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); +# arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; +# +# // Ensure that the conditional covariance matrix is symmetric +# if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { +# cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); +# } +# +# // Compute the conditional mean of Xsbar given Xs = Xs_star +# arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star.each_row() - mu_S.t()).t(); // Can we speed it up by reducing the number of transposes? +# x_Sbar_mean.each_col() += mu_Sbar; +# +# // Transform the samples to be from N(O, Sigma_Sbar|S) +# arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); +# +# // Loop over the different test observations and combine the generated values with the values we conditioned on +# for (int idx_now = 0; idx_now < n_explain; idx_now++) { +# aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); // can using .fill() speed this up? +# aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1); +# result_list.push_back(aux_mat); +# } +# } +# +# return result_list; +# } + + + +# Old and new version --------------------------------------------------------------------------------------------- +prepare_data_gaussian_old <- function(internal, index_features = NULL, ...) { + x_train <- internal$data$x_train + x_explain <- internal$data$x_explain + n_explain <- internal$parameters$n_explain + gaussian.cov_mat <- internal$parameters$gaussian.cov_mat + n_samples <- internal$parameters$n_samples + gaussian.mu <- internal$parameters$gaussian.mu + n_features <- internal$parameters$n_features + + X <- internal$objects$X + + x_explain0 <- as.matrix(x_explain) + dt_l <- list() + + if (is.null(index_features)) { + features <- X$features + } else { + features <- X$features[index_features] + } + + for (i in seq_len(n_explain)) { + cat(sprintf("%d,", i)) + l <- lapply( + X = features, + FUN = sample_gaussian, #shapr:::sample_gaussian, + n_samples = n_samples, + mu = gaussian.mu, + cov_mat = gaussian.cov_mat, + m = n_features, + x_explain = x_explain0[i, , drop = FALSE] + ) + + dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") + dt_l[[i]][, w := 1 / n_samples] + dt_l[[i]][, id := i] + if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] + } + + dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE) + return(dt) +} + + +# In this version we improve the method by only computing the conditional covariance matrices once. +prepare_data_gaussian_new_v1 <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate a data table containing all Monte Carlo samples for all test observations and coalitions + dt <- data.table::rbindlist( + # Iterate over the coalitions + lapply( + seq_len(nrow(S)), + function(S_ind) { + # This function generates the conditional samples Xsbar | Xs = Xs_star + # and combine those values with the unconditional values. + cat(sprintf("%d,", S_ind)) + + # Get boolean representations if the features are in the S and the Sbar sets + S_now <- as.logical(S[S_ind, ]) + Sbar_now <- !as.logical(S[S_ind, ]) + + # Remove: + # Do not need to treat the empty and grand coalitions different as they will never be present + # if (sum(S_now) %in% c(0, n_features)) { + # return(data.table::as.data.table(cbind("id" = seq(n_explain), x_explain))) + # } + + # Extract the features we condition on + x_S_star <- x_explain_mat[, S_now, drop = FALSE] + + # Extract the mean values for the features in the two sets + mu_S <- mu[S_now] + mu_Sbar <- mu[Sbar_now] + + # Extract the relevant parts of the covariance matrix + cov_mat_SS <- cov_mat[S_now, S_now, drop = FALSE] + cov_mat_SSbar <- cov_mat[S_now, Sbar_now, drop = FALSE] + cov_mat_SbarS <- cov_mat[Sbar_now, S_now, drop = FALSE] + cov_mat_SbarSbar <- cov_mat[Sbar_now, Sbar_now, drop = FALSE] + + # Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + cov_mat_SbarS_cov_mat_SS_inv <- cov_mat_SbarS %*% solve(cov_mat_SS) + cond_cov_mat_Sbar_given_S <- cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv %*% cov_mat_SSbar + + # Ensure that the conditional covariance matrix symmetric in the + # rare case where numerical instability made it unsymmetrical. + if (!isSymmetric(cond_cov_mat_Sbar_given_S)) { + cond_cov_mat_Sbar_given_S <- Matrix::symmpart(cond_cov_mat_Sbar_given_S) + } + + # Compute the conditional mean of Xsbar given Xs = Xs_star + x_Sbar_mean <- mu_Sbar + cov_mat_SbarS_cov_mat_SS_inv %*% (t(x_S_star) - mu_S) + + # Allocate an empty matrix used in mvnfast:::rmvnCpp to store the generated MC samples. + B <- matrix(nrow = n_samples, ncol = sum(Sbar_now)) + class(B) <- "numeric" + + # Create a data.table containing the MC samples for all test observations for one coalition + data.table::rbindlist( + + # Loop over the different test observations + lapply(seq(n_explain), function(idx_now) { + # Sample the MC samples from the conditional Gaussian distribution for one test observation. + .Call("rmvnCpp", + n_ = n_samples, + mu_ = x_Sbar_mean[, idx_now], + sigma_ = cond_cov_mat_Sbar_given_S, + ncores_ = 1, + isChol_ = FALSE, + A_ = B, + PACKAGE = "mvnfast" + ) + # Combine the generated values with the values we conditioned on + ret <- matrix(NA, ncol = n_features, nrow = n_samples) + ret[, S_now] <- rep(c(x_explain_mat[idx_now, S_now]), each = n_samples) + ret[, Sbar_now] <- B + + # Set names of the columns and convert to a data.table + colnames(ret) <- feature_names + as.data.table(ret) + }), + use.names = TRUE, idcol = "id", fill = TRUE + ) + } + ), + idcol = "id_combination" + ) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +# This is similar to v1, but we compute the Cholensky decomposition only once for each coalitions. +# In v1, it is computed n_explain times. +prepare_data_gaussian_new_v2 <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate a data table containing all Monte Carlo samples for all test observations and coalitions + dt <- data.table::rbindlist( + # Iterate over the coalitions + lapply( + seq_len(nrow(S)), + function(S_ind) { + # This function generates the conditional samples Xsbar | Xs = Xs_star + # and combine those values with the unconditional values. + cat(sprintf("%d,", S_ind)) + + # Get boolean representations if the features are in the S and the Sbar sets + S_now <- as.logical(S[S_ind, ]) + Sbar_now <- !as.logical(S[S_ind, ]) + + # Remove: + # Do not need to treat the empty and grand coalitions different as they will never be present + # if (sum(S_now) %in% c(0, n_features)) { + # return(data.table::as.data.table(cbind("id" = seq(n_explain), x_explain))) + # } + + # Extract the features we condition on + x_S_star <- x_explain_mat[, S_now, drop = FALSE] + + # Extract the mean values for the features in the two sets + mu_S <- mu[S_now] + mu_Sbar <- mu[Sbar_now] + + # Extract the relevant parts of the covariance matrix + cov_mat_SS <- cov_mat[S_now, S_now, drop = FALSE] + cov_mat_SSbar <- cov_mat[S_now, Sbar_now, drop = FALSE] + cov_mat_SbarS <- cov_mat[Sbar_now, S_now, drop = FALSE] + cov_mat_SbarSbar <- cov_mat[Sbar_now, Sbar_now, drop = FALSE] + + # Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + cov_mat_SbarS_cov_mat_SS_inv <- cov_mat_SbarS %*% solve(cov_mat_SS) + cond_cov_mat_Sbar_given_S <- cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv %*% cov_mat_SSbar + + # Ensure that the conditional covariance matrix symmetric in the + # rare case where numerical instability made it unsymmetrical. + if (!isSymmetric(cond_cov_mat_Sbar_given_S)) { + cond_cov_mat_Sbar_given_S <- Matrix::symmpart(cond_cov_mat_Sbar_given_S) + } + + # Compute the conditional mean of Xsbar given Xs = Xs_star + x_Sbar_mean <- mu_Sbar + cov_mat_SbarS_cov_mat_SS_inv %*% (t(x_S_star) - mu_S) + + # Allocate an empty matrix used in mvnfast:::rmvnCpp to store the generated MC samples. + B <- matrix(nrow = n_samples, ncol = sum(Sbar_now)) + class(B) <- "numeric" + + # Compute the Cholensky decomposition + cond_cov_mat_Sbar_given_S_chol <- chol(cond_cov_mat_Sbar_given_S) + + # Create a data.table containing the MC samples for all test observations for one coalition + data.table::rbindlist( + + # Loop over the different test observations + lapply(seq(n_explain), function(idx_now) { + # Sample the MC samples from the conditional Gaussian distribution for one test observation. + .Call("rmvnCpp", + n_ = n_samples, + mu_ = x_Sbar_mean[, idx_now], + sigma_ = cond_cov_mat_Sbar_given_S_chol, + ncores_ = 1, + isChol_ = TRUE, + A_ = B, + PACKAGE = "mvnfast" + ) + # Combine the generated values with the values we conditioned on + ret <- matrix(NA, ncol = n_features, nrow = n_samples) + ret[, S_now] <- rep(c(x_explain_mat[idx_now, S_now]), each = n_samples) + ret[, Sbar_now] <- B + + # Set names of the columns and convert to a data.table + colnames(ret) <- feature_names + as.data.table(ret) + }), + use.names = TRUE, idcol = "id", fill = TRUE + ) + } + ), + idcol = "id_combination" + ) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +# Here we improve the method speed by only sampling once per coalition +# and only add the test-observation-dependent mean in a secondary call. +prepare_data_gaussian_new_v3 <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate a data table containing all Monte Carlo samples for all test observations and coalitions + dt <- data.table::rbindlist( + # Iterate over the coalitions + lapply( + seq_len(nrow(S)), + function(S_ind) { + # This function generates the conditional samples Xsbar | Xs = Xs_star + # and combine those values with the unconditional values. + cat(sprintf("%d,", S_ind)) + + # Get boolean representations if the features are in the S and the Sbar sets + S_now <- as.logical(S[S_ind, ]) + Sbar_now <- !as.logical(S[S_ind, ]) + + # Remove: + # Do not need to treat the empty and grand coalitions different as they will never be present + # if (sum(S_now) %in% c(0, n_features)) { + # return(data.table::as.data.table(cbind("id" = seq(n_explain), x_explain))) + # } + + # Extract the features we condition on + x_S_star <- x_explain_mat[, S_now, drop = FALSE] + + # Extract the mean values for the features in the two sets + mu_S <- mu[S_now] + mu_Sbar <- mu[Sbar_now] + + # Extract the relevant parts of the covariance matrix + cov_mat_SS <- cov_mat[S_now, S_now, drop = FALSE] + cov_mat_SSbar <- cov_mat[S_now, Sbar_now, drop = FALSE] + cov_mat_SbarS <- cov_mat[Sbar_now, S_now, drop = FALSE] + cov_mat_SbarSbar <- cov_mat[Sbar_now, Sbar_now, drop = FALSE] + + # Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + cov_mat_SbarS_cov_mat_SS_inv <- cov_mat_SbarS %*% solve(cov_mat_SS) + cond_cov_mat_Sbar_given_S <- cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv %*% cov_mat_SSbar + + # Ensure that the conditional covariance matrix symmetric in the + # rare case where numerical instability made it unsymmetrical. + if (!isSymmetric(cond_cov_mat_Sbar_given_S)) { + cond_cov_mat_Sbar_given_S <- Matrix::symmpart(cond_cov_mat_Sbar_given_S) + } + + # Compute the conditional mean of Xsbar given Xs = Xs_star + x_Sbar_mean <- mu_Sbar + cov_mat_SbarS_cov_mat_SS_inv %*% (t(x_S_star) - mu_S) + + # rbenchmark::benchmark( + # t(sweep(x_S_star, 2, mu_S, FUN = "-")), + # t(x_S_star) - mu_S) + + # Allocate an empty matrix used in mvnfast:::rmvnCpp to store the generated MC samples. + B <- matrix(nrow = n_samples, ncol = sum(Sbar_now)) + class(B) <- "numeric" + + .Call("rmvnCpp", + n_ = n_samples, + mu_ = rep(0, length(mu_Sbar)), + sigma_ = cond_cov_mat_Sbar_given_S, + ncores_ = 1, + isChol_ = FALSE, + A_ = B, + PACKAGE = "mvnfast" + ) + + # Transpose her and untranspose later for faster matrix addition in `t(B + x_Sbar_mean[, idx_now])` + # as it seems to be faster than using `sweep(B, 2, x_Sbar_mean[, idx_now], FUN = "+")` on the + # original B (i.e., not transposed B). + B <- t(B) + + # Create a data.table containing the MC samples for all test observations for one coalition + data.table::rbindlist( + + # Loop over the different test observations + lapply(seq(n_explain), function(idx_now) { + # Combine the generated values with the values we conditioned on + ret <- matrix(NA, ncol = n_features, nrow = n_samples) + ret[, S_now] <- rep(c(x_explain_mat[idx_now, S_now]), each = n_samples) + ret[, Sbar_now] <- t(B + x_Sbar_mean[, idx_now]) + + # Set names of the columns and convert to a data.table + colnames(ret) <- feature_names + as.data.table(ret) + }), + use.names = TRUE, idcol = "id", fill = TRUE + ) + } + ), + idcol = "id_combination" + ) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +# Same as v3, but we now use R to compute Cholensky +prepare_data_gaussian_new_v4 <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate a data table containing all Monte Carlo samples for all test observations and coalitions + dt <- data.table::rbindlist( + # Iterate over the coalitions + lapply( + seq_len(nrow(S)), + function(S_ind) { + # This function generates the conditional samples Xsbar | Xs = Xs_star + # and combine those values with the unconditional values. + cat(sprintf("%d,", S_ind)) + + # Get boolean representations if the features are in the S and the Sbar sets + S_now <- as.logical(S[S_ind, ]) + Sbar_now <- !as.logical(S[S_ind, ]) + + # Remove: + # Do not need to treat the empty and grand coalitions different as they will never be present + # if (sum(S_now) %in% c(0, n_features)) { + # return(data.table::as.data.table(cbind("id" = seq(n_explain), x_explain))) + # } + + # Extract the features we condition on + x_S_star <- x_explain_mat[, S_now, drop = FALSE] + + # Extract the mean values for the features in the two sets + mu_S <- mu[S_now] + mu_Sbar <- mu[Sbar_now] + + # Extract the relevant parts of the covariance matrix + cov_mat_SS <- cov_mat[S_now, S_now, drop = FALSE] + cov_mat_SSbar <- cov_mat[S_now, Sbar_now, drop = FALSE] + cov_mat_SbarS <- cov_mat[Sbar_now, S_now, drop = FALSE] + cov_mat_SbarSbar <- cov_mat[Sbar_now, Sbar_now, drop = FALSE] + + # Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + cov_mat_SbarS_cov_mat_SS_inv <- cov_mat_SbarS %*% solve(cov_mat_SS) + cond_cov_mat_Sbar_given_S <- cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv %*% cov_mat_SSbar + + # Ensure that the conditional covariance matrix symmetric in the + # rare case where numerical instability made it unsymmetrical. + if (!isSymmetric(cond_cov_mat_Sbar_given_S)) { + cond_cov_mat_Sbar_given_S <- Matrix::symmpart(cond_cov_mat_Sbar_given_S) + } + + # Compute the conditional mean of Xsbar given Xs = Xs_star + x_Sbar_mean <- mu_Sbar + cov_mat_SbarS_cov_mat_SS_inv %*% (t(x_S_star) - mu_S) + + # Allocate an empty matrix used in mvnfast:::rmvnCpp to store the generated MC samples. + B <- matrix(nrow = n_samples, ncol = sum(Sbar_now)) + class(B) <- "numeric" + + .Call("rmvnCpp", + n_ = n_samples, + mu_ = rep(0, length(mu_Sbar)), + sigma_ = chol(cond_cov_mat_Sbar_given_S), + ncores_ = 1, + isChol_ = TRUE, + A_ = B, + PACKAGE = "mvnfast" + ) + + # Transpose her and untranspose later for faster matrix addition in `t(B + x_Sbar_mean[, idx_now])` + # as it seems to be faster than using `sweep(B, 2, x_Sbar_mean[, idx_now], FUN = "+")` on the + # original B (i.e., not transposed B). + B <- t(B) + + # Create a data.table containing the MC samples for all test observations for one coalition + data.table::rbindlist( + + # Loop over the different test observations + lapply(seq(n_explain), function(idx_now) { + # Combine the generated values with the values we conditioned on + ret <- matrix(NA, ncol = n_features, nrow = n_samples) + ret[, S_now] <- rep(c(x_explain_mat[idx_now, S_now]), each = n_samples) + ret[, Sbar_now] <- t(B + x_Sbar_mean[, idx_now]) + + # Set names of the columns and convert to a data.table + colnames(ret) <- feature_names + as.data.table(ret) + }), + use.names = TRUE, idcol = "id", fill = TRUE + ) + } + ), + idcol = "id_combination" + ) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +# Here we only want to generate the data once. So we generate n_samples from N(0, I), +# and then use Cholensky to transform to N(O, Sigma_{Sbar|S}), and then add the means. +prepare_data_gaussian_new_v5 <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Allocate an empty matrix used in mvnfast:::rmvnCpp to store the generated MC samples. + B <- matrix(nrow = n_samples, ncol = n_features) + class(B) <- "numeric" + + .Call("rmvnCpp", + n_ = n_samples, + mu_ = rep(0, n_features), + sigma_ = diag(n_features), + ncores_ = 1, + isChol_ = TRUE, + A_ = B, + PACKAGE = "mvnfast" + ) + + # Generate a data table containing all Monte Carlo samples for all test observations and coalitions + dt <- data.table::rbindlist( + # Iterate over the coalitions + lapply( + seq_len(nrow(S)), + function(S_ind) { + # This function generates the conditional samples Xsbar | Xs = Xs_star + # and combine those values with the unconditional values. + cat(sprintf("%d,", S_ind)) + + # Get boolean representations if the features are in the S and the Sbar sets + S_now <- as.logical(S[S_ind, ]) + Sbar_now <- !as.logical(S[S_ind, ]) + + # Remove: + # Do not need to treat the empty and grand coalitions different as they will never be present + # if (sum(S_now) %in% c(0, n_features)) { + # return(data.table::as.data.table(cbind("id" = seq(n_explain), x_explain))) + # } + + # Extract the features we condition on + x_S_star <- x_explain_mat[, S_now, drop = FALSE] + + # Extract the mean values for the features in the two sets + mu_S <- mu[S_now] + mu_Sbar <- mu[Sbar_now] + + # Extract the relevant parts of the covariance matrix + cov_mat_SS <- cov_mat[S_now, S_now, drop = FALSE] + cov_mat_SSbar <- cov_mat[S_now, Sbar_now, drop = FALSE] + cov_mat_SbarS <- cov_mat[Sbar_now, S_now, drop = FALSE] + cov_mat_SbarSbar <- cov_mat[Sbar_now, Sbar_now, drop = FALSE] + + # Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + cov_mat_SbarS_cov_mat_SS_inv <- cov_mat_SbarS %*% solve(cov_mat_SS) + cond_cov_mat_Sbar_given_S <- cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv %*% cov_mat_SSbar + + # Ensure that the conditional covariance matrix symmetric in the + # rare case where numerical instability made it unsymmetrical. + if (!isSymmetric(cond_cov_mat_Sbar_given_S)) { + cond_cov_mat_Sbar_given_S <- Matrix::symmpart(cond_cov_mat_Sbar_given_S) + } + + # Compute the conditional mean of Xsbar given Xs = Xs_star + x_Sbar_mean <- mu_Sbar + cov_mat_SbarS_cov_mat_SS_inv %*% t(sweep(x_S_star, 2, mu_S, FUN = "-")) + + # Transform the samples to be from N(O, Sigma_Sbar|S) + # Transpose her and untranspose later for faster matrix addition in `t(B + x_Sbar_mean[, idx_now])` + # as it seems to be faster than using `sweep(B, 2, x_Sbar_mean[, idx_now], FUN = "+")` on the + # original B (i.e., not transposed B). + B_now <- t(B[, Sbar_now] %*% chol(cond_cov_mat_Sbar_given_S)) + + # Create a data.table containing the MC samples for all test observations for one coalition + data.table::rbindlist( + + # Loop over the different test observations + lapply(seq(n_explain), function(idx_now) { + # Combine the generated values with the values we conditioned on + ret <- matrix(NA, ncol = n_features, nrow = n_samples) + ret[, S_now] <- rep(c(x_explain_mat[idx_now, S_now]), each = n_samples) + ret[, Sbar_now] <- t(B_now + x_Sbar_mean[, idx_now]) + + # Set names of the columns and convert to a data.table + colnames(ret) <- feature_names + as.data.table(ret) + }), + use.names = TRUE, idcol = "id", fill = TRUE + ) + } + ), + idcol = "id_combination" + ) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +prepare_data_gaussian_new_v5_rnorm <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Allocate an empty matrix used in mvnfast:::rmvnCpp to store the generated MC samples. + # B <- matrix(nrow = n_samples, ncol = n_features) + # class(B) <- "numeric" + + # .Call("rmvnCpp", + # n_ = n_samples, + # mu_ = rep(0, n_features), + # sigma_ = diag(n_features), + # ncores_ = 1, + # isChol_ = TRUE, + # A_ = B, + # PACKAGE = "mvnfast" + # ) + + B <- matrix(rnorm(n_samples*n_features),nrow = n_samples, ncol = n_features) + + # Generate a data table containing all Monte Carlo samples for all test observations and coalitions + dt <- data.table::rbindlist( + # Iterate over the coalitions + lapply( + seq_len(nrow(S)), + function(S_ind) { + # This function generates the conditional samples Xsbar | Xs = Xs_star + # and combine those values with the unconditional values. + cat(sprintf("%d,", S_ind)) + + # Get boolean representations if the features are in the S and the Sbar sets + S_now <- as.logical(S[S_ind, ]) + Sbar_now <- !as.logical(S[S_ind, ]) + + # Remove: + # Do not need to treat the empty and grand coalitions different as they will never be present + # if (sum(S_now) %in% c(0, n_features)) { + # return(data.table::as.data.table(cbind("id" = seq(n_explain), x_explain))) + # } + + # Extract the features we condition on + x_S_star <- x_explain_mat[, S_now, drop = FALSE] + + # Extract the mean values for the features in the two sets + mu_S <- mu[S_now] + mu_Sbar <- mu[Sbar_now] + + # Extract the relevant parts of the covariance matrix + cov_mat_SS <- cov_mat[S_now, S_now, drop = FALSE] + cov_mat_SSbar <- cov_mat[S_now, Sbar_now, drop = FALSE] + cov_mat_SbarS <- cov_mat[Sbar_now, S_now, drop = FALSE] + cov_mat_SbarSbar <- cov_mat[Sbar_now, Sbar_now, drop = FALSE] + + # Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + cov_mat_SbarS_cov_mat_SS_inv <- cov_mat_SbarS %*% solve(cov_mat_SS) + cond_cov_mat_Sbar_given_S <- cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv %*% cov_mat_SSbar + + # Ensure that the conditional covariance matrix symmetric in the + # rare case where numerical instability made it unsymmetrical. + if (!isSymmetric(cond_cov_mat_Sbar_given_S)) { + cond_cov_mat_Sbar_given_S <- Matrix::symmpart(cond_cov_mat_Sbar_given_S) + } + + # Compute the conditional mean of Xsbar given Xs = Xs_star + x_Sbar_mean <- mu_Sbar + cov_mat_SbarS_cov_mat_SS_inv %*% t(sweep(x_S_star, 2, mu_S, FUN = "-")) + + + + + + # Transform the samples to be from N(O, Sigma_Sbar|S) + # Transpose her and untranspose later for faster matrix addition in `t(B + x_Sbar_mean[, idx_now])` + # as it seems to be faster than using `sweep(B, 2, x_Sbar_mean[, idx_now], FUN = "+")` on the + # original B (i.e., not transposed B). + B_now <- t(B[, Sbar_now] %*% chol(cond_cov_mat_Sbar_given_S)) + + # Create a data.table containing the MC samples for all test observations for one coalition + data.table::rbindlist( + + # Loop over the different test observations + lapply(seq(n_explain), function(idx_now) { + # Combine the generated values with the values we conditioned on + ret <- matrix(NA, ncol = n_features, nrow = n_samples) + ret[, S_now] <- rep(c(x_explain_mat[idx_now, S_now]), each = n_samples) + ret[, Sbar_now] <- t(B_now + x_Sbar_mean[, idx_now]) + + # Set names of the columns and convert to a data.table + colnames(ret) <- feature_names + as.data.table(ret) + }), + use.names = TRUE, idcol = "id", fill = TRUE + ) + } + ), + idcol = "id_combination" + ) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +prepare_data_gaussian_new_v5_rnorm_v2 <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Allocate an empty matrix used in mvnfast:::rmvnCpp to store the generated MC samples. + # B <- matrix(nrow = n_samples, ncol = n_features) + # class(B) <- "numeric" + + # .Call("rmvnCpp", + # n_ = n_samples, + # mu_ = rep(0, n_features), + # sigma_ = diag(n_features), + # ncores_ = 1, + # isChol_ = TRUE, + # A_ = B, + # PACKAGE = "mvnfast" + # ) + + B <- matrix(rnorm(n_samples*n_features),nrow = n_samples, ncol = n_features) + + # Generate a data table containing all Monte Carlo samples for all test observations and coalitions + dt <- data.table::rbindlist( + # Iterate over the coalitions + lapply( + seq_len(nrow(S)), + function(S_ind) { + # This function generates the conditional samples Xsbar | Xs = Xs_star + # and combine those values with the unconditional values. + cat(sprintf("%d,", S_ind)) + + # Get boolean representations if the features are in the S and the Sbar sets + S_now <- as.logical(S[S_ind, ]) + Sbar_now <- !as.logical(S[S_ind, ]) + + # Remove: + # Do not need to treat the empty and grand coalitions different as they will never be present + # if (sum(S_now) %in% c(0, n_features)) { + # return(data.table::as.data.table(cbind("id" = seq(n_explain), x_explain))) + # } + + # Extract the features we condition on + x_S_star <- x_explain_mat[, S_now, drop = FALSE] + + # Extract the mean values for the features in the two sets + mu_S <- mu[S_now] + mu_Sbar <- mu[Sbar_now] + + # Extract the relevant parts of the covariance matrix + cov_mat_SS <- cov_mat[S_now, S_now, drop = FALSE] + cov_mat_SSbar <- cov_mat[S_now, Sbar_now, drop = FALSE] + cov_mat_SbarS <- cov_mat[Sbar_now, S_now, drop = FALSE] + cov_mat_SbarSbar <- cov_mat[Sbar_now, Sbar_now, drop = FALSE] + + # Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + cov_mat_SbarS_cov_mat_SS_inv <- cov_mat_SbarS %*% solve(cov_mat_SS) + cond_cov_mat_Sbar_given_S <- cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv %*% cov_mat_SSbar + + # Ensure that the conditional covariance matrix symmetric in the + # rare case where numerical instability made it unsymmetrical. + if (!isSymmetric(cond_cov_mat_Sbar_given_S)) { + cond_cov_mat_Sbar_given_S <- Matrix::symmpart(cond_cov_mat_Sbar_given_S) + } + + # Compute the conditional mean of Xsbar given Xs = Xs_star + x_Sbar_mean <- mu_Sbar + cov_mat_SbarS_cov_mat_SS_inv %*% (t(x_S_star) - mu_S) + + + # Transform the samples to be from N(O, Sigma_Sbar|S) + # Transpose her and untranspose later for faster matrix addition in `t(B + x_Sbar_mean[, idx_now])` + # as it seems to be faster than using `sweep(B, 2, x_Sbar_mean[, idx_now], FUN = "+")` on the + # original B (i.e., not transposed B). + B_now <- t(B[, Sbar_now] %*% chol(cond_cov_mat_Sbar_given_S)) + + # Create a data.table containing the MC samples for all test observations for one coalition + data.table::rbindlist( + + # Loop over the different test observations + lapply(seq(n_explain), function(idx_now) { + # Combine the generated values with the values we conditioned on + ret <- matrix(NA, ncol = n_features, nrow = n_samples) + ret[, S_now] <- rep(c(x_S_star[idx_now,]), each = n_samples) + ret[, Sbar_now] <- t(B_now + x_Sbar_mean[, idx_now]) + + # Set names of the columns and convert to a data.table + colnames(ret) <- feature_names + as.data.table(ret) + }), + use.names = TRUE, idcol = "id", fill = TRUE + ) + } + ), + idcol = "id_combination" + ) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + + + +prepare_data_gaussian_new_v5_rnorm_cpp <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate the MC samples + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Call cpp + result_list <- prepare_data_gaussian_cpp( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + S = S, + mu = mu, + cov_mat = cov_mat) + + dt = as.data.table(do.call(rbind, result_list)) + setnames(dt, feature_names) + dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)] + dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +prepare_data_gaussian_new_v5_rnorm_cpp_with_wrap <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate the MC samples + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Call cpp + result_list <- prepare_data_gaussian_cpp_with_wrap( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + S = S, + mu = mu, + cov_mat = cov_mat) + + dt = as.data.table(do.call(rbind, result_list)) + setnames(dt, feature_names) + dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)] + dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + + +prepare_data_gaussian_new_v5_rnorm_cpp_v2 <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate the MC samples + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Call cpp + result_list <- prepare_data_gaussian_cpp_v2( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + S = S, + mu = mu, + cov_mat = cov_mat) + + dt = as.data.table(do.call(rbind, result_list)) + setnames(dt, feature_names) + dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)] + dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +prepare_data_gaussian_new_v5_rnorm_cpp_fix_large_mat <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate the MC samples from N(0, 1) + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Call cpp to create the data table with the MC samples for all explicands and coalitions + dt <- as.data.table( + prepare_data_gaussian_cpp_fix_large_mat( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + S = S, + mu = mu, + cov_mat = cov_mat) + ) + setnames(dt, feature_names) + dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)] + dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +prepare_data_gaussian_new_v5_rnorm_cpp_fix_large_mat_v2 <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate the MC samples from N(0, 1) + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Call cpp to create the data table with the MC samples for all explicands and coalitions + dt <- as.data.table( + prepare_data_gaussian_cpp_fix_large_mat_v2( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + S = S, + mu = mu, + cov_mat = cov_mat) + ) + setnames(dt, feature_names) + dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)] + dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +prepare_data_gaussian_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate the MC samples + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Call cpp + result_list <- prepare_data_gaussian_cpp_fix_list_of_lists_of_matrices( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + S = S, + mu = mu, + cov_mat = cov_mat) + + # Here we first put the inner list together and then the whole thing. Maybe exist another faster way! + dt = as.data.table(do.call(rbind, lapply(result_list, function(inner_list) do.call(rbind, inner_list)))) + setnames(dt, feature_names) + dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)] + dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +prepare_data_gaussian_new_v5_rnorm_cpp_fix_cube <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate the MC samples + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Call cpp + result_cube <- prepare_data_gaussian_cpp_fix_cube( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + S = S, + mu = mu, + cov_mat = cov_mat) + + # Reshape the 3D array to 2D + # This is slower + # dt = as.data.table(matrix(aperm(result_cube, c(1, 3, 2)), + # nrow = prod(dim(result_cube)[-2]), + # ncol = dim(result_cube)[2])) + dims = dim(result_cube) + result_cube = aperm(result_cube, c(1, 3, 2)) + dim(result_cube) <- c(prod(dims[-2]), dims[2]) + dt = as.data.table(result_cube) + setnames(dt, feature_names) + dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)] + dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +prepare_data_gaussian_new_v5_rnorm_cpp_fix_cube_v2 <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + n_combinations_now <- length(index_features) + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate the MC samples + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Call cpp + dt <- prepare_data_gaussian_cpp_fix_cube_v2( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + S = S, + mu = mu, + cov_mat = cov_mat) + + # Reshape and convert to data.table + dim(dt) = c(n_combinations_now*n_explain*n_samples, n_features) + print(system.time({dt = as.data.table(dt)}, gcFirst = FALSE)) + setnames(dt, feature_names) + dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)] + dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +prepare_data_gaussian_new_v5_rnorm_cpp_fix_std_list <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + + # Generate the MC samples + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Call cpp + result_list <- prepare_data_gaussian_cpp_fix_std_list( + MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + S = S, + mu = mu, + cov_mat = cov_mat) + + # FIND A BETTER WAY TO DO THIS + for (i in seq(length(result_list))) { + dim(result_list[[i]]) = c(n_samples, n_features) + } + + # Here we first put the inner list together and then the whole thing. Maybe exist another faster way! + dt = as.data.table(do.call(rbind, result_list)) + setnames(dt, feature_names) + dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)] + dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +# Here we only want to generate the data once. So we generate n_samples*n_batches from N(0, I), +# and then use Cholensky to transform to N(O, Sigma_{Sbar|S}), and then add the means. +prepare_data_gaussian_new_v6 <- function(internal, index_features, ...) { + # This function assumes that index_features will never include the empty and + # grand coalitions. This is valid 21/11/23 as `batch_prepare_vS()` removes the + # grand coalition before calling the `prepare_data()` function and the empty + # coalition is never included in the `internal$objects$S_batch` list. + + # Extract objects that we are going to use + x_explain <- internal$data$x_explain + S <- internal$objects$S + mu <- internal$parameters$gaussian.mu + cov_mat <- internal$parameters$gaussian.cov_mat + x_explain_mat <- as.matrix(internal$data$x_explain) + n_explain <- internal$parameters$n_explain + n_features <- internal$parameters$n_features + n_samples <- internal$parameters$n_samples + feature_names <- internal$parameters$feature_names + n_combinations <- internal$parameters$n_combinations + + # Extract the relevant coalitions specified in `index_features` from `S`. + # This will always be called as `index_features` is never NULL. + S <- if (!is.null(index_features)) S[index_features, , drop = FALSE] + n_combinations_in_this_batch <- nrow(S) + + # Allocate an empty matrix used in mvnfast:::rmvnCpp to store the generated MC samples. + B <- matrix(nrow = n_samples * n_combinations_in_this_batch, ncol = n_features) + class(B) <- "numeric" + + .Call("rmvnCpp", + n_ = n_samples * n_combinations_in_this_batch, + mu_ = rep(0, n_features), + sigma_ = diag(n_features), + ncores_ = 1, + isChol_ = TRUE, + A_ = B, + PACKAGE = "mvnfast" + ) + + # Indices of the start for the combinations + B_indices <- n_samples * (seq(0, n_combinations_in_this_batch)) + 1 + + # Generate a data table containing all Monte Carlo samples for all test observations and coalitions + dt <- data.table::rbindlist( + # Iterate over the coalitions + lapply( + seq_len(nrow(S)), + function(S_ind) { + # This function generates the conditional samples Xsbar | Xs = Xs_star + # and combine those values with the unconditional values. + cat(sprintf("%d,", S_ind)) + + # Get boolean representations if the features are in the S and the Sbar sets + S_now <- as.logical(S[S_ind, ]) + Sbar_now <- !as.logical(S[S_ind, ]) + + # Remove: + # Do not need to treat the empty and grand coalitions different as they will never be present + # if (sum(S_now) %in% c(0, n_features)) { + # return(data.table::as.data.table(cbind("id" = seq(n_explain), x_explain))) + # } + + # Extract the features we condition on + x_S_star <- x_explain_mat[, S_now, drop = FALSE] + + # Extract the mean values for the features in the two sets + mu_S <- mu[S_now] + mu_Sbar <- mu[Sbar_now] + + # Extract the relevant parts of the covariance matrix + cov_mat_SS <- cov_mat[S_now, S_now, drop = FALSE] + cov_mat_SSbar <- cov_mat[S_now, Sbar_now, drop = FALSE] + cov_mat_SbarS <- cov_mat[Sbar_now, S_now, drop = FALSE] + cov_mat_SbarSbar <- cov_mat[Sbar_now, Sbar_now, drop = FALSE] + + # Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + cov_mat_SbarS_cov_mat_SS_inv <- cov_mat_SbarS %*% solve(cov_mat_SS) + cond_cov_mat_Sbar_given_S <- cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv %*% cov_mat_SSbar + + # Ensure that the conditional covariance matrix symmetric in the + # rare case where numerical instability made it unsymmetrical. + if (!isSymmetric(cond_cov_mat_Sbar_given_S)) { + cond_cov_mat_Sbar_given_S <- Matrix::symmpart(cond_cov_mat_Sbar_given_S) + } + + # Compute the conditional mean of Xsbar given Xs = Xs_star + x_Sbar_mean <- mu_Sbar + cov_mat_SbarS_cov_mat_SS_inv %*% t(sweep(x_S_star, 2, mu_S, FUN = "-")) + + # Transform the samples to be from N(O, Sigma_Sbar|S) + # Extract the relevant samples for this combination + # Transpose her and untranspose later for faster matrix addition in `t(B + x_Sbar_mean[, idx_now])` + # as it seems to be faster than using `sweep(B, 2, x_Sbar_mean[, idx_now], FUN = "+")` on the + # original B (i.e., not transposed B). + B_now <- t(B[B_indices[S_ind]:(B_indices[S_ind + 1] - 1), Sbar_now] %*% chol(cond_cov_mat_Sbar_given_S)) + + # Create a data.table containing the MC samples for all test observations for one coalition + data.table::rbindlist( + + # Loop over the different test observations + lapply(seq(n_explain), function(idx_now) { + # Combine the generated values with the values we conditioned on + ret <- matrix(NA, ncol = n_features, nrow = n_samples) + ret[, S_now] <- rep(c(x_explain_mat[idx_now, S_now]), each = n_samples) + ret[, Sbar_now] <- t(B_now + x_Sbar_mean[, idx_now]) + + # Set names of the columns and convert to a data.table + colnames(ret) <- feature_names + as.data.table(ret) + }), + use.names = TRUE, idcol = "id", fill = TRUE + ) + } + ), + idcol = "id_combination" + ) + + # Update the id_combination. This will always be called as `index_features` is never NULL. + if (!is.null(index_features)) dt[, id_combination := index_features[id_combination]] + + # Add uniform weights + dt[, w := 1 / n_samples] + + # Remove: + # This is not needed when we assume that the empty and grand coalitions will never be present + # dt[id_combination %in% c(1, n_combinations), w := 1] + + # Return the MC samples + return(dt) +} + +# Compare the methods --------------------------------------------------------------------------------------------- + +## Setup ----------------------------------------------------------------------------------------------------------- + +{ + n_samples <- 1000 + # n_samples <- 25000 + n_train <- 1000 + n_test <- 100 + M <- 8 + rho <- 0.5 + betas <- c(0, rep(1, M)) + + # We use the Gaussian approach + approach <- "gaussian" + + # Mean of the multivariate Gaussian distribution + mu <- rep(0, times = M) + mu <- seq(M) + + # Create the covariance matrix + sigma <- matrix(rho, ncol = M, nrow = M) # Old + for (i in seq(1, M - 1)) { + for (j in seq(i + 1, M)) { + sigma[i, j] <- sigma[j, i] <- rho^abs(i - j) + } + } + diag(sigma) <- 1 + + # Set seed for reproducibility + seed_setup <- 1996 + set.seed(seed_setup) + + # Make Gaussian data + data_train <- data.table(mvtnorm::rmvnorm(n = n_train, mean = mu, sigma = sigma)) + data_test <- data.table(mvtnorm::rmvnorm(n = n_test, mean = mu, sigma = sigma)) + colnames(data_train) <- paste("X", seq(M), sep = "") + colnames(data_test) <- paste("X", seq(M), sep = "") + + # Make the response + response_train <- as.vector(cbind(1, as.matrix(data_train)) %*% betas) + response_test <- as.vector(cbind(1, as.matrix(data_test)) %*% betas) + + # Put together the data + data_train_with_response <- copy(data_train)[, y := response_train] + data_test_with_response <- copy(data_test)[, y := response_test] + + # Fit a LM model + predictive_model <- lm(y ~ ., data = data_train_with_response) + + # Get the prediction zero, i.e., the phi0 Shapley value. + prediction_zero <- mean(response_train) + + model <- predictive_model + x_explain <- data_test + x_train <- data_train + keep_samp_for_vS <- FALSE + predict_model <- NULL + get_model_specs <- NULL + timing <- TRUE + n_combinations <- NULL + group <- NULL + feature_specs <- get_feature_specs(get_model_specs, model) + n_batches <- 1 + seed <- 1 + + internal <- setup( + x_train = x_train, + x_explain = x_explain, + approach = approach, + prediction_zero = prediction_zero, + n_combinations = n_combinations, + group = group, + n_samples = n_samples, + n_batches = n_batches, + seed = seed, + feature_specs = feature_specs, + keep_samp_for_vS = keep_samp_for_vS, + predict_model = predict_model, + get_model_specs = get_model_specs, + timing = timing, + gaussian.mu = mu, + gaussian.cov_mat = sigma + ) + + # Gets predict_model (if not passed to explain) + predict_model <- get_predict_model( + predict_model = predict_model, + model = model + ) + + # Sets up the Shapley (sampling) framework and prepares the + # conditional expectation computation for the chosen approach + # Note: model and predict_model are ONLY used by the AICc-methods of approach empirical to find optimal parameters + internal <- setup_computation(internal, model, predict_model) +} + + + +## Compare time ---------------------------------------------------------------------------------------------------- + +# Recall that old version iterate over the observations and then the coalitions. +# While the new version iterate over the coalitions and then the observations. +# The latter lets us reuse the computed conditional distributions for all observations. +look_at_coalitions <- seq(1, 2^M - 2) +#look_at_coalitions <- seq(1, 2^M - 2, 10) +#look_at_coalitions <- seq(1, 2^M - 2, 25) +time_old <- system.time({ + res_old <- prepare_data_gaussian_old(internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_old <- NULL +# Set to NULL as it is many GB when we look at many combinations in one batch and the methods slow down due to +# little available memory. The same below. + +time_new_v1 <- system.time({ + res_new_v1 <- prepare_data_gaussian_new_v1( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v1 <- NULL + +time_new_v2 <- system.time({ + res_new_v2 <- prepare_data_gaussian_new_v2( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v2 <- NULL + +time_new_v3 <- system.time({ + res_new_v3 <- prepare_data_gaussian_new_v3( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v3 <- NULL + +time_new_v4 <- system.time({ + res_new_v4 <- prepare_data_gaussian_new_v4( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v4 <- NULL + +time_new_v5 <- system.time({ + res_new_v5 <- prepare_data_gaussian_new_v5( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5 <- NULL + +time_new_v5_rnorm <- system.time({ + res_new_v5_rnorm <- prepare_data_gaussian_new_v5_rnorm( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm <- NULL + +time_new_v5_rnorm_v2 <- system.time({ + res_new_v5_rnorm_v2 <- prepare_data_gaussian_new_v5_rnorm_v2( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm_v2 <- NULL + +time_new_v5_rnorm_cpp <- system.time({ + res_new_v5_rnorm_cpp <- prepare_data_gaussian_new_v5_rnorm_cpp( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm_cpp <- NULL + +time_new_v5_rnorm_cpp_with_wrap <- system.time({ + res_new_v5_rnorm_cpp_with_wrap <- prepare_data_gaussian_new_v5_rnorm_cpp_with_wrap( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm_cpp_with_wrap <- NULL + +time_new_v5_rnorm_cpp_v2 <- system.time({ + res_new_v5_rnorm_cpp_v2 <- prepare_data_gaussian_new_v5_rnorm_cpp_v2( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm_cpp_v2 <- NULL + +time_new_v5_rnorm_cpp_fix_large_mat <- system.time({ + res_new_v5_rnorm_cpp_fix_large_mat <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_large_mat( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm_cpp_fix_large_mat <- NULL + +time_new_v5_rnorm_cpp_fix_large_mat_v2 <- system.time({ + res_new_v5_rnorm_cpp_fix_large_mat_v2 <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_large_mat_v2( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm_cpp_fix_large_mat_v2 <- NULL + +time_new_v5_rnorm_cpp_fix_cube <- system.time({ + res_new_v5_rnorm_cpp_fix_cube <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_cube( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm_cpp_fix_cube <- NULL + +time_new_v5_rnorm_cpp_fix_cube_v2 <- system.time({ + res_new_v5_rnorm_cpp_fix_cube_v2 <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_cube_v2( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm_cpp_fix_cube_v2 <- NULL + +time_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- system.time({ + res_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- NULL + +time_new_v5_rnorm_cpp_fix_std_list <- system.time({ + res_new_v5_rnorm_cpp_fix_std_list <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_std_list( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v5_rnorm_cpp_fix_std_list <- NULL + +time_new_v6 <- system.time({ + res_new_v6 <- prepare_data_gaussian_new_v6( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalitions])}) +res_new_v6 <- NULL + +# Create a table of the times. Less is better +times <- rbind(time_old, + time_new_v1, + time_new_v2, + time_new_v3, + time_new_v4, + time_new_v5, + time_new_v5_rnorm, + time_new_v5_rnorm_v2, + time_new_v5_rnorm_cpp, + time_new_v5_rnorm_cpp_with_wrap, + time_new_v5_rnorm_cpp_v2, + time_new_v5_rnorm_cpp_fix_large_mat, + time_new_v5_rnorm_cpp_fix_large_mat_v2, + time_new_v5_rnorm_cpp_fix_cube, + time_new_v5_rnorm_cpp_fix_cube_v2, + time_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices, + time_new_v5_rnorm_cpp_fix_std_list, + time_new_v6) +times + +# Look at the relative time compared to the old method. Larger value is better. +# Tells us how many times faster the new version is. +times_relative <- t(sapply(seq_len(nrow(times)), function(idx) times[1, ] / times[idx, ])) +rownames(times_relative) <- paste0(rownames(times), "_rel") +times_relative + +# ALL COALITIONS (look_at_coalitions = seq(1, 2^M-2)) +# user.self sys.self elapsed user.child sys.child +# time_old 38.663 3.654 43.044 0.000 0.000 +# time_new_v1 14.693 3.539 18.709 0.000 0.000 +# time_new_v2 15.545 3.897 19.966 0.012 0.032 +# time_new_v3 13.476 3.838 17.812 0.000 0.000 +# time_new_v4 14.085 4.858 19.718 0.015 0.033 +# time_new_v5 13.508 4.104 18.148 0.000 0.000 +# time_new_v5_rnorm 13.107 4.178 17.705 0.000 0.000 +# time_new_v5_rnorm_v2 13.309 4.458 18.233 0.010 0.023 +# time_new_v5_rnorm_cpp 44.782 5.589 51.849 0.000 0.000 +# time_new_v5_rnorm_cpp_with_wrap 45.816 4.799 51.979 0.021 0.070 +# time_new_v5_rnorm_cpp_v2 44.997 6.513 52.931 0.000 0.000 +# time_new_v5_rnorm_cpp_fix_large_mat 5.594 2.142 7.831 0.000 0.000 +# time_new_v5_rnorm_cpp_fix_large_mat_v2 6.160 2.112 8.499 0.000 0.000 +# time_new_v5_rnorm_cpp_fix_cube 5.607 2.745 8.558 0.000 0.000 +# time_new_v5_rnorm_cpp_fix_cube_v2 4.621 2.121 6.862 0.000 0.000 +# time_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices 6.016 3.687 10.469 0.000 0.000 +# time_new_v5_rnorm_cpp_fix_std_list 5.407 3.272 8.841 0.000 0.000 +# time_new_v6 13.540 4.267 18.361 0.000 0.000 +# user.self sys.self elapsed user.child sys.child +# time_old_rel 1.00000 1.00000 1.00000 NaN NaN +# time_new_v1_rel 2.63139 1.03250 2.30071 NaN NaN +# time_new_v2_rel 2.48717 0.93764 2.15586 0 0 +# time_new_v3_rel 2.86903 0.95206 2.41657 NaN NaN +# time_new_v4_rel 2.74498 0.75216 2.18298 0 0 +# time_new_v5_rel 2.86223 0.89035 2.37183 NaN NaN +# time_new_v5_rnorm_rel 2.94980 0.87458 2.43118 NaN NaN +# time_new_v5_rnorm_v2_rel 2.90503 0.81965 2.36077 0 0 +# time_new_v5_rnorm_cpp_rel 0.86336 0.65378 0.83018 NaN NaN +# time_new_v5_rnorm_cpp_with_wrap_rel 0.84388 0.76141 0.82810 0 0 +# time_new_v5_rnorm_cpp_v2_rel 0.85924 0.56103 0.81321 NaN NaN +# time_new_v5_rnorm_cpp_fix_large_mat_rel 6.91151 1.70588 5.49662 NaN NaN +# time_new_v5_rnorm_cpp_fix_large_mat_v2_rel 6.27646 1.73011 5.06460 NaN NaN +# time_new_v5_rnorm_cpp_fix_cube_rel 6.89549 1.33115 5.02968 NaN NaN +# time_new_v5_rnorm_cpp_fix_cube_v2_rel 8.36680 1.72277 6.27281 NaN NaN +# time_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices_rel 6.42670 0.99105 4.11157 NaN NaN +# time_new_v5_rnorm_cpp_fix_std_list_rel 7.15055 1.11675 4.86868 NaN NaN +# time_new_v6_rel 2.85547 0.85634 2.34432 NaN NaN + + +# 26 coalitions (look_at_coalitions = seq(1, 2^M-2, 10)) +# user.self sys.self elapsed user.child sys.child +# time_old 25.913 2.797 30.399 0 0 +# time_new_v1 7.071 1.624 8.997 0 0 +# time_new_v2 6.653 1.461 8.244 0 0 +# time_new_v3 5.700 1.690 7.521 0 0 +# time_new_v4 5.877 1.826 7.852 0 0 +# time_new_v5 5.522 1.594 7.286 0 0 +# time_new_v6 5.559 1.668 7.335 0 0 +# user.self sys.self elapsed user.child sys.child +# time_old_rel 1.0000 1.0000 1.0000 NaN NaN +# time_new_v1_rel 3.6647 1.7223 3.3788 NaN NaN +# time_new_v2_rel 3.8949 1.9144 3.6874 NaN NaN +# time_new_v3_rel 4.5461 1.6550 4.0419 NaN NaN +# time_new_v4_rel 4.4092 1.5318 3.8715 NaN NaN +# time_new_v5_rel 4.6927 1.7547 4.1722 NaN NaN +# time_new_v6_rel 4.6614 1.6769 4.1444 NaN NaN + + +# 11 coalitions (look_at_coalitions = seq(1, 2^M-2, 25)) +# user.self sys.self elapsed user.child sys.child +# time_old 11.251 1.187 12.961 0.000 0.000 +# time_new_v1 3.273 0.873 4.306 0.000 0.000 +# time_new_v2 3.043 0.690 4.011 0.000 0.000 +# time_new_v3 2.677 0.794 3.587 0.000 0.000 +# time_new_v4 2.598 0.759 3.460 0.000 0.000 +# time_new_v5 2.574 0.752 3.613 0.000 0.000 +# time_new_v6 2.303 0.669 3.009 0.000 0.000 +# user.self sys.self elapsed user.child sys.child +# time_old_rel 1.0000 1.0000 1.0000 NaN NaN +# time_new_v1_rel 3.4375 1.3597 3.0100 NaN NaN +# time_new_v2_rel 3.6973 1.7203 3.2314 NaN NaN +# time_new_v3_rel 4.2028 1.4950 3.6133 NaN NaN +# time_new_v4_rel 4.3306 1.5639 3.7460 NaN NaN +# time_new_v5_rel 4.3710 1.5785 3.5873 NaN NaN +# time_new_v6_rel 4.8854 1.7743 4.3074 NaN NaN + + +## Compare mean ---------------------------------------------------------------------------------------------------- +look_at_coalition <- 25 +one_coalition_time_old <- system.time({ + one_coalition_res_old <- prepare_data_gaussian_old( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) +one_coalition_time_old2 <- system.time({ + one_coalition_res_old2 <- prepare_data_gaussian_old( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +one_coalition_time_new_v1 <- system.time({ + one_coalition_res_new_v1 <- prepare_data_gaussian_new_v1( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +one_coalition_time_new_v2 <- system.time({ + one_coalition_res_new_v2 <- prepare_data_gaussian_new_v2( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +one_coalition_time_new_v3 <- system.time({ + one_coalition_res_new_v3 <- prepare_data_gaussian_new_v3( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +one_coalition_time_new_v4 <- system.time({ + one_coalition_res_new_v4 <- prepare_data_gaussian_new_v4( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +one_coalition_time_new_v5 <- system.time({ + one_coalition_res_new_v5 <- prepare_data_gaussian_new_v5( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm <- system.time({ + one_coalition_res_new_v5_rnorm <- prepare_data_gaussian_new_v5_rnorm( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm_v2 <- system.time({ + one_coalition_res_new_v5_rnorm_v2 <- prepare_data_gaussian_new_v5_rnorm_v2( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm_cpp <- system.time({ + one_coalition_res_new_v5_rnorm_cpp <- prepare_data_gaussian_new_v5_rnorm_cpp( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm_cpp_with_wrap <- system.time({ + one_coalition_res_new_v5_rnorm_cpp_with_wrap <- prepare_data_gaussian_new_v5_rnorm_cpp_with_wrap( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm_cpp_v2 <- system.time({ + one_coalition_res_new_v5_rnorm_cpp_v2 <- prepare_data_gaussian_new_v5_rnorm_cpp_v2( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm_cpp_fix_large_mat <- system.time({ + one_coalition_res_new_v5_rnorm_cpp_fix_large_mat <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_large_mat( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm_cpp_fix_large_mat_v2 <- system.time({ + one_coalition_res_new_v5_rnorm_cpp_fix_large_mat_v2 <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_large_mat_v2( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm_cpp_fix_cube <- system.time({ + one_coalition_res_new_v5_rnorm_cpp_fix_cube <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_cube( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm_cpp_fix_cube_v2 <- system.time({ + one_coalition_res_new_v5_rnorm_cpp_fix_cube_v2 <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_cube_v2( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- system.time({ + one_coalition_res_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +set.seed(123) +one_coalition_time_new_v5_rnorm_cpp_fix_std_list <- system.time({ + one_coalition_res_new_v5_rnorm_cpp_fix_std_list <- prepare_data_gaussian_new_v5_rnorm_cpp_fix_std_list( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +one_coalition_time_new_v6 <- system.time({ + one_coalition_res_new_v6 <- prepare_data_gaussian_new_v6( + internal = internal, + index_features = internal$objects$S_batch$`1`[look_at_coalition])}) + +rbind(one_coalition_time_old, + one_coalition_time_old2, + one_coalition_time_new_v1, + one_coalition_time_new_v2, + one_coalition_time_new_v3, + one_coalition_time_new_v4, + one_coalition_time_new_v5, + one_coalition_time_new_v5_rnorm, + one_coalition_time_new_v5_rnorm_v2, + one_coalition_time_new_v5_rnorm_cpp, + one_coalition_time_new_v5_rnorm_cpp_with_wrap, + one_coalition_time_new_v5_rnorm_cpp_v2, + one_coalition_time_new_v5_rnorm_cpp_fix_large_mat, + one_coalition_time_new_v5_rnorm_cpp_fix_large_mat_v2, + one_coalition_time_new_v5_rnorm_cpp_fix_cube, + one_coalition_time_new_v5_rnorm_cpp_fix_cube_v2, + one_coalition_time_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices, + one_coalition_time_new_v5_rnorm_cpp_fix_std_list, + one_coalition_time_new_v6) + +internal$objects$S[internal$objects$S_batch$`1`[look_at_coalition], , drop = FALSE] +means_old <- one_coalition_res_old[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_old2 <- one_coalition_res_old2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v1 <- one_coalition_res_new_v1[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v2 <- one_coalition_res_new_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v3 <- one_coalition_res_new_v3[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v4 <- one_coalition_res_new_v4[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5 <- one_coalition_res_new_v5[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm <- one_coalition_res_new_v5_rnorm[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm_v2 <- one_coalition_res_new_v5_rnorm_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm_cpp <- one_coalition_res_new_v5_rnorm_cpp[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm_cpp_with_wrap <- one_coalition_res_new_v5_rnorm_cpp_with_wrap[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm_cpp_v2 <- one_coalition_res_new_v5_rnorm_cpp_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm_cpp_fix_large_mat <- one_coalition_res_new_v5_rnorm_cpp_fix_large_mat[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm_cpp_fix_large_mat_v2 <- one_coalition_res_new_v5_rnorm_cpp_fix_large_mat_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm_cpp_fix_cube <- one_coalition_res_new_v5_rnorm_cpp_fix_cube[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm_cpp_fix_cube_v2 <- one_coalition_res_new_v5_rnorm_cpp_fix_cube_v2[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm_cpp_fix_list_of_lists_of_matrices <- one_coalition_res_new_v5_rnorm_cpp_fix_list_of_lists_of_matrices[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v5_rnorm_cpp_fix_std_list <- one_coalition_res_new_v5_rnorm_cpp_fix_std_list[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] +means_v6 <- one_coalition_res_new_v6[, lapply(.SD, mean), .SDcols = paste0("X", seq(M)), by = list(id_combination, id)] + +# They are all in the same ballpark, so the differences are due to sampling. +# This is supported by the fact that mean_old and mean_old2 use the same old code, and the difference there is the +# same as for the new methods. +# A larger n_samples makes these closer to 0 (I have done that and for other means too) +max(abs(means_old - means_old2)) +max(abs(means_old - means_v1)) +max(abs(means_old - means_v2)) +max(abs(means_old - means_v3)) +max(abs(means_old - means_v4)) +max(abs(means_old - means_v5)) +max(abs(means_old - means_v5_rnorm)) +max(abs(means_old - means_v5_rnorm_v2)) +max(abs(means_old - means_v5_rnorm_cpp)) +max(abs(means_old - means_v5_rnorm_cpp_with_wrap)) +max(abs(means_old - means_v5_rnorm_cpp_v2)) +max(abs(means_old - means_v5_rnorm_cpp_fix_large_mat)) +max(abs(means_old - means_v5_rnorm_cpp_fix_large_mat_v2)) +max(abs(means_old - means_v5_rnorm_cpp_fix_cube)) +max(abs(means_old - means_v5_rnorm_cpp_fix_cube_v2)) +max(abs(means_old - means_v5_rnorm_cpp_fix_list_of_lists_of_matrices)) +max(abs(means_old - means_v5_rnorm_cpp_fix_std_list)) +max(abs(means_old - means_v6)) + + + diff --git a/man/gaussian_transform_separate.Rd b/man/gaussian_transform_separate.Rd index eef1e0c6a..89afb6494 100644 --- a/man/gaussian_transform_separate.Rd +++ b/man/gaussian_transform_separate.Rd @@ -10,7 +10,7 @@ gaussian_transform_separate(yx, n_y) \item{yx}{Numeric vector. The first \code{n_y} items is the data that is transformed, and last part is the data with the original transformation.} -\item{n_y}{Positive integer. Number of elements of \code{yx} that belongs to the gaussian data.} +\item{n_y}{Positive integer. Number of elements of \code{yx} that belongs to the Gaussian data.} } \value{ Vector of back-transformed Gaussian data diff --git a/man/inv_gaussian_transform.Rd b/man/inv_gaussian_transform.Rd deleted file mode 100644 index 76f058772..000000000 --- a/man/inv_gaussian_transform.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/approach_copula.R -\name{inv_gaussian_transform} -\alias{inv_gaussian_transform} -\title{Transforms new data to a standardized normal distribution} -\usage{ -inv_gaussian_transform(zx, n_z) -} -\arguments{ -\item{zx}{Numeric vector. The first \code{n_z} items are the Gaussian data, and the last part is -the data with the original transformation.} - -\item{n_z}{Positive integer. Number of elements of \code{zx} that belongs to new data.} -} -\value{ -Numeric vector of length \code{n_z} -} -\description{ -Transforms new data to a standardized normal distribution -} -\author{ -Martin Jullum -} -\keyword{internal} diff --git a/man/inv_gaussian_transform_cpp.Rd b/man/inv_gaussian_transform_cpp.Rd new file mode 100644 index 000000000..7cf04833c --- /dev/null +++ b/man/inv_gaussian_transform_cpp.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{inv_gaussian_transform_cpp} +\alias{inv_gaussian_transform_cpp} +\title{Transforms new data to a standardized normal distribution} +\usage{ +inv_gaussian_transform_cpp(z, x) +} +\arguments{ +\item{z}{arma::mat. The data are the Gaussian Monte Carlos samples to transform.} + +\item{x}{arma::mat. The data with the original transformation. Used to conduct the transformation of \code{z}.} +} +\value{ +arma::mat of the same dimension as \code{z} +} +\description{ +Transforms new data to a standardized normal distribution +} +\author{ +Lars Henry Berge Olsen +} +\keyword{internal} diff --git a/man/prepare_data.Rd b/man/prepare_data.Rd index 097fef9b8..23e57b18d 100644 --- a/man/prepare_data.Rd +++ b/man/prepare_data.Rd @@ -17,13 +17,13 @@ prepare_data(internal, index_features = NULL, ...) \method{prepare_data}{categorical}(internal, index_features = NULL, ...) -\method{prepare_data}{copula}(internal, index_features = NULL, ...) +\method{prepare_data}{copula}(internal, index_features, ...) \method{prepare_data}{ctree}(internal, index_features = NULL, ...) \method{prepare_data}{empirical}(internal, index_features = NULL, ...) -\method{prepare_data}{gaussian}(internal, index_features = NULL, ...) +\method{prepare_data}{gaussian}(internal, index_features, ...) \method{prepare_data}{independence}(internal, index_features = NULL, ...) @@ -44,4 +44,7 @@ the contribution function by Monte Carlo integration. \description{ Generate data used for predictions and Monte Carlo integration } +\author{ +Lars Henry Berge Olsen +} \keyword{internal} diff --git a/man/prepare_data_copula_cpp.Rd b/man/prepare_data_copula_cpp.Rd new file mode 100644 index 000000000..ca901031d --- /dev/null +++ b/man/prepare_data_copula_cpp.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{prepare_data_copula_cpp} +\alias{prepare_data_copula_cpp} +\title{Generate (Gaussian) Copula MC samples} +\usage{ +prepare_data_copula_cpp( + MC_samples_mat, + x_explain_mat, + x_explain_gaussian_mat, + x_train_mat, + S, + mu, + cov_mat +) +} +\arguments{ +\item{MC_samples_mat}{arma::mat. Matrix of dimension (\code{n_samples}, \code{n_features}) containing samples from the +univariate standard normal.} + +\item{x_explain_mat}{arma::mat. Matrix of dimension (\code{n_explain}, \code{n_features}) containing the observations +to explain on the original scale.} + +\item{x_explain_gaussian_mat}{arma::mat. Matrix of dimension (\code{n_explain}, \code{n_features}) containing the +observations to explain after being transformed using the Gaussian transform, i.e., the samples have been +transformed to a standardized normal distribution.} + +\item{x_train_mat}{arma::mat. Matrix of dimension (\code{n_train}, \code{n_features}) containing the training observations.} + +\item{S}{arma::mat. Matrix of dimension (\code{n_combinations}, \code{n_features}) containing binary representations of +the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones. +This is not a problem internally in shapr as the empty and grand coalitions treated differently.} + +\item{mu}{arma::vec. Vector of length \code{n_features} containing the mean of each feature after being transformed +using the Gaussian transform, i.e., the samples have been transformed to a standardized normal distribution.} + +\item{cov_mat}{arma::mat. Matrix of dimension (\code{n_features}, \code{n_features}) containing the pairwise covariance +between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been +transformed to a standardized normal distribution.} +} +\value{ +An arma::cube/3D array of dimension (\code{n_samples}, \code{n_explain} * \code{n_coalitions}, \code{n_features}), where +the columns (\emph{,j,}) are matrices of dimension (\code{n_samples}, \code{n_features}) containing the conditional Gaussian +copula MC samples for each explicand and coalition on the original scale. +} +\description{ +Generate (Gaussian) Copula MC samples +} +\author{ +Lars Henry Berge Olsen +} +\keyword{internal} diff --git a/man/prepare_data_gaussian_cpp.Rd b/man/prepare_data_gaussian_cpp.Rd new file mode 100644 index 000000000..b24b431e6 --- /dev/null +++ b/man/prepare_data_gaussian_cpp.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{prepare_data_gaussian_cpp} +\alias{prepare_data_gaussian_cpp} +\title{Generate Gaussian MC samples} +\usage{ +prepare_data_gaussian_cpp(MC_samples_mat, x_explain_mat, S, mu, cov_mat) +} +\arguments{ +\item{MC_samples_mat}{arma::mat. Matrix of dimension (\code{n_samples}, \code{n_features}) containing samples from the +univariate standard normal.} + +\item{x_explain_mat}{arma::mat. Matrix of dimension (\code{n_explain}, \code{n_features}) containing the observations +to explain.} + +\item{S}{arma::mat. Matrix of dimension (\code{n_combinations}, \code{n_features}) containing binary representations of +the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones. +This is not a problem internally in shapr as the empty and grand coalitions treated differently.} + +\item{mu}{arma::vec. Vector of length \code{n_features} containing the mean of each feature.} + +\item{cov_mat}{arma::mat. Matrix of dimension (\code{n_features}, \code{n_features}) containing the pairwise covariance +between all pairs of features.} +} +\value{ +An arma::cube/3D array of dimension (\code{n_samples}, \code{n_explain} * \code{n_coalitions}, \code{n_features}), where +the columns (\emph{,j,}) are matrices of dimension (\code{n_samples}, \code{n_features}) containing the conditional Gaussian +MC samples for each explicand and coalition. +} +\description{ +Generate Gaussian MC samples +} +\author{ +Lars Henry Berge Olsen +} +\keyword{internal} diff --git a/man/quantile_type7_cpp.Rd b/man/quantile_type7_cpp.Rd new file mode 100644 index 000000000..69e3100a9 --- /dev/null +++ b/man/quantile_type7_cpp.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{quantile_type7_cpp} +\alias{quantile_type7_cpp} +\title{Compute the quantiles using quantile type seven} +\usage{ +quantile_type7_cpp(x, probs) +} +\arguments{ +\item{x}{arma::vec. Numeric vector whose sample quantiles are wanted.} + +\item{probs}{arma::vec. Numeric vector of probabilities with values between zero and one.} +} +\value{ +A vector of length \code{length(probs)} with the quantiles is returned. +} +\description{ +Compute the quantiles using quantile type seven +} +\details{ +Using quantile type number seven from stats::quantile in R. +} +\author{ +Lars Henry Berge Olsen +} +\keyword{internal} diff --git a/man/sample_copula.Rd b/man/sample_copula.Rd deleted file mode 100644 index c180f25ca..000000000 --- a/man/sample_copula.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/approach_copula.R -\name{sample_copula} -\alias{sample_copula} -\title{Sample conditional variables using the Gaussian copula approach} -\usage{ -sample_copula( - index_given, - n_samples, - mu, - cov_mat, - m, - x_explain_gaussian, - x_train, - x_explain -) -} -\arguments{ -\item{index_given}{Integer vector. The indices of the features to condition upon. Note that -\code{min(index_given) >= 1} and \code{max(index_given) <= m}.} - -\item{m}{Positive integer. The total number of features.} - -\item{x_explain_gaussian}{Numeric matrix. Contains the observation whose predictions ought -to be explained (test data), -after quantile-transforming them to standard Gaussian variables.} - -\item{x_explain}{Numeric matrix. Contains the features of the observation whose -predictions ought to be explained (test data).} -} -\value{ -data.table -} -\description{ -Sample conditional variables using the Gaussian copula approach -} -\author{ -Martin Jullum -} -\keyword{internal} diff --git a/man/sample_gaussian.Rd b/man/sample_gaussian.Rd deleted file mode 100644 index f91312e85..000000000 --- a/man/sample_gaussian.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/approach_gaussian.R -\name{sample_gaussian} -\alias{sample_gaussian} -\title{Sample conditional Gaussian variables} -\usage{ -sample_gaussian(index_given, n_samples, mu, cov_mat, m, x_explain) -} -\arguments{ -\item{index_given}{Integer vector. The indices of the features to condition upon. Note that -\code{min(index_given) >= 1} and \code{max(index_given) <= m}.} - -\item{m}{Positive integer. The total number of features.} - -\item{x_explain}{Numeric matrix. Contains the features of the observation whose -predictions ought to be explained (test data).} -} -\value{ -data.table -} -\description{ -Sample conditional Gaussian variables -} -\author{ -Martin Jullum -} -\keyword{internal} diff --git a/man/setup_approach.Rd b/man/setup_approach.Rd index 86cb164bb..a87fbd106 100644 --- a/man/setup_approach.Rd +++ b/man/setup_approach.Rd @@ -153,3 +153,6 @@ This is useful if the underlying time series are scaled between 0 and 1, for exa The different choices of \code{approach} takes different (optional) parameters, which are forwarded from \code{\link[=explain]{explain()}}. } +\author{ +Martin Jullum +} diff --git a/python/install_r_packages.R b/python/install_r_packages.R index 0c8768d91..598886243 100644 --- a/python/install_r_packages.R +++ b/python/install_r_packages.R @@ -1,3 +1,4 @@ # Installs the required R-packages -install.packages("remotes",repos="https://cloud.r-project.org") -remotes::install_github("NorskRegnesentral/shapr") # Installs the development version of shapr from the master branch on CRAN +install.packages("remotes", repos = "https://cloud.r-project.org") +remotes::install_github("NorskRegnesentral/shapr") +# Installs the development version of shapr from the master branch on CRAN diff --git a/src/Copula.cpp b/src/Copula.cpp new file mode 100644 index 000000000..732ed3a4f --- /dev/null +++ b/src/Copula.cpp @@ -0,0 +1,169 @@ +#include +using namespace Rcpp; + +// [[Rcpp::depends(RcppArmadillo)]] + +//' Compute the quantiles using quantile type seven +//' +//' @details Using quantile type number seven from stats::quantile in R. +//' +//' @param x arma::vec. Numeric vector whose sample quantiles are wanted. +//' @param probs arma::vec. Numeric vector of probabilities with values between zero and one. +//' +//' @return A vector of length `length(probs)` with the quantiles is returned. +//' +//' @keywords internal +//' @author Lars Henry Berge Olsen +// [[Rcpp::export]] +arma::vec quantile_type7_cpp(const arma::vec& x, const arma::vec& probs) { + int n = x.n_elem; + int m = probs.n_elem; + + // Initialize output quantile vector + arma::vec qs(m); + + // Calculate indices + arma::vec index = 1 + (n - 1) * probs; + arma::vec lo = arma::floor(index); + arma::vec hi = arma::ceil(index); + + // Sort the data + arma::vec sorted_x = arma::sort(x); + + // Calculate quantiles using quantile type seven + for (int i = 0; i < m; ++i) { + qs(i) = sorted_x(lo(i) - 1); + if (index(i) > lo(i)) { + double h = index(i) - lo(i); + qs(i) = (1 - h) * qs(i) + h * sorted_x(hi(i) - 1); + } + } + + return qs; +} + +//' Transforms new data to a standardized normal distribution +//' +//' @param z arma::mat. The data are the Gaussian Monte Carlos samples to transform. +//' @param x arma::mat. The data with the original transformation. Used to conduct the transformation of `z`. +//' +//' @return arma::mat of the same dimension as `z` +//' +//' @keywords internal +//' @author Lars Henry Berge Olsen +// [[Rcpp::export]] +arma::mat inv_gaussian_transform_cpp(const arma::mat& z, const arma::mat& x) { + int n_features = z.n_cols; + int n_samples = z.n_rows; + arma::mat z_new(n_samples, n_features); + arma::mat u = arma::normcdf(z); + for (int feature_idx = 0; feature_idx < n_features; feature_idx++) { + z_new.col(feature_idx) = quantile_type7_cpp(x.col(feature_idx), u.col(feature_idx)); + } + return z_new; +} + +//' Generate (Gaussian) Copula MC samples +//' +//' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the +//' univariate standard normal. +//' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations +//' to explain on the original scale. +//' @param x_explain_gaussian_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the +//' observations to explain after being transformed using the Gaussian transform, i.e., the samples have been +//' transformed to a standardized normal distribution. +//' @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations. +//' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of +//' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones. +//' This is not a problem internally in shapr as the empty and grand coalitions treated differently. +//' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed +//' using the Gaussian transform, i.e., the samples have been transformed to a standardized normal distribution. +//' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance +//' between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been +//' transformed to a standardized normal distribution. +//' +//' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where +//' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian +//' copula MC samples for each explicand and coalition on the original scale. +//' +//' @export +//' @keywords internal +//' @author Lars Henry Berge Olsen +// [[Rcpp::export]] +arma::cube prepare_data_copula_cpp(const arma::mat& MC_samples_mat, + const arma::mat& x_explain_mat, + const arma::mat& x_explain_gaussian_mat, + const arma::mat& x_train_mat, + const arma::mat& S, + const arma::vec& mu, + const arma::mat& cov_mat) { + + int n_explain = x_explain_mat.n_rows; + int n_samples = MC_samples_mat.n_rows; + int n_features = MC_samples_mat.n_cols; + int n_coalitions = S.n_rows; + + // Initialize auxiliary matrix and result cube + arma::mat aux_mat(n_samples, n_features); + arma::cube result_cube(n_samples, n_explain*n_coalitions, n_features); + + // Iterate over the coalitions + for (int S_ind = 0; S_ind < n_coalitions; S_ind++) { + + // Get current coalition S and the indices of the features in coalition S and mask Sbar + arma::mat S_now = S.row(S_ind); + arma::uvec S_now_idx = arma::find(S_now > 0.5); + arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); + + // Extract the features we condition on, both on the original scale and the Gaussian transformed values. + arma::mat x_S_star = x_explain_mat.cols(S_now_idx); + arma::mat x_S_star_gaussian = x_explain_gaussian_mat.cols(S_now_idx); + + // Extract the mean values of the Gaussian transformed features in the two sets + arma::vec mu_S = mu.elem(S_now_idx); + arma::vec mu_Sbar = mu.elem(Sbar_now_idx); + + // Extract the relevant parts of the Gaussian transformed covariance matrix + arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); + arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); + arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); + arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); + + // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); + arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; + + // Ensure that the conditional covariance matrix is symmetric + if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { + cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); + } + + // Compute the conditional mean of Xsbar given Xs = Xs_star_gaussian, i.e., of the Gaussian transformed features + arma::mat x_Sbar_gaussian_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star_gaussian.each_row() - mu_S.t()).t(); + x_Sbar_gaussian_mean.each_col() += mu_Sbar; + + // Transform the samples to be from N(O, Sigma_{Sbar|S}) + arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); + + // Loop over the different explicands and combine the generated values with the values we conditioned on + for (int idx_now = 0; idx_now < n_explain; idx_now++) { + + // Transform the MC samples to be from N(mu_{Sbar|S}, Sigma_{Sbar|S}) for one coalition and one explicand + arma::mat MC_samples_mat_now_now = + MC_samples_mat_now + repmat(trans(x_Sbar_gaussian_mean.col(idx_now)), n_samples, 1); + + // Transform the MC to the original scale using the inverse Gaussian transform + arma::mat MC_samples_mat_now_now_trans = + inv_gaussian_transform_cpp(MC_samples_mat_now_now, x_train_mat.cols(Sbar_now_idx)); + + // Insert the generate Gaussian copula MC samples and the feature values we condition on into an auxiliary matrix + aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now_now_trans; + aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); + + // Insert the auxiliary matrix into the result cube + result_cube.col(S_ind*n_explain + idx_now) = aux_mat; + } + } + + return result_cube; +} diff --git a/src/Gaussian.cpp b/src/Gaussian.cpp new file mode 100644 index 000000000..c375ed510 --- /dev/null +++ b/src/Gaussian.cpp @@ -0,0 +1,88 @@ +#include +using namespace Rcpp; + +// [[Rcpp::depends(RcppArmadillo)]] + +//' Generate Gaussian MC samples +//' +//' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the +//' univariate standard normal. +//' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations +//' to explain. +//' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of +//' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones. +//' This is not a problem internally in shapr as the empty and grand coalitions treated differently. +//' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature. +//' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance +//' between all pairs of features. +//' +//' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where +//' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian +//' MC samples for each explicand and coalition. +//' +//' @export +//' @keywords internal +//' @author Lars Henry Berge Olsen +// [[Rcpp::export]] +arma::cube prepare_data_gaussian_cpp(const arma::mat& MC_samples_mat, + const arma::mat& x_explain_mat, + const arma::mat& S, + const arma::vec& mu, + const arma::mat& cov_mat) { + + int n_explain = x_explain_mat.n_rows; + int n_samples = MC_samples_mat.n_rows; + int n_features = MC_samples_mat.n_cols; + int n_coalitions = S.n_rows; + + // Initialize auxiliary matrix and result cube + arma::mat aux_mat(n_samples, n_features); + arma::cube result_cube(n_samples, n_explain*n_coalitions, n_features); + + // Iterate over the coalitions + for (int S_ind = 0; S_ind < n_coalitions; S_ind++) { + + // Get current coalition S and the indices of the features in coalition S and mask Sbar + arma::mat S_now = S.row(S_ind); + arma::uvec S_now_idx = arma::find(S_now > 0.5); + arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); + + // Extract the features we condition on + arma::mat x_S_star = x_explain_mat.cols(S_now_idx); + + // Extract the mean values of the features in the two sets + arma::vec mu_S = mu.elem(S_now_idx); + arma::vec mu_Sbar = mu.elem(Sbar_now_idx); + + // Extract the relevant parts of the covariance matrix + arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); + arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); + arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); + arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); + + // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); + arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; + + // Ensure that the conditional covariance matrix is symmetric + if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { + cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); + } + + // Compute the conditional mean of Xsbar given Xs = Xs_star + arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star.each_row() - mu_S.t()).t(); + x_Sbar_mean.each_col() += mu_Sbar; + + // Transform the samples to be from N(O, Sigma_{Sbar|S}) + arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); + + // Loop over the different explicands and combine the generated values with the values we conditioned on + for (int idx_now = 0; idx_now < n_explain; idx_now++) { + aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); + aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1); + result_cube.col(S_ind*n_explain + idx_now) = aux_mat; + } + } + + return result_cube; +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 8face37f8..c95d55541 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -80,6 +80,62 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// quantile_type7_cpp +arma::vec quantile_type7_cpp(const arma::vec& x, const arma::vec& probs); +RcppExport SEXP _shapr_quantile_type7_cpp(SEXP xSEXP, SEXP probsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::vec& >::type x(xSEXP); + Rcpp::traits::input_parameter< const arma::vec& >::type probs(probsSEXP); + rcpp_result_gen = Rcpp::wrap(quantile_type7_cpp(x, probs)); + return rcpp_result_gen; +END_RCPP +} +// inv_gaussian_transform_cpp +arma::mat inv_gaussian_transform_cpp(const arma::mat& z, const arma::mat& x); +RcppExport SEXP _shapr_inv_gaussian_transform_cpp(SEXP zSEXP, SEXP xSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::mat& >::type z(zSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(inv_gaussian_transform_cpp(z, x)); + return rcpp_result_gen; +END_RCPP +} +// prepare_data_copula_cpp +arma::cube prepare_data_copula_cpp(const arma::mat& MC_samples_mat, const arma::mat& x_explain_mat, const arma::mat& x_explain_gaussian_mat, const arma::mat& x_train_mat, const arma::mat& S, const arma::vec& mu, const arma::mat& cov_mat); +RcppExport SEXP _shapr_prepare_data_copula_cpp(SEXP MC_samples_matSEXP, SEXP x_explain_matSEXP, SEXP x_explain_gaussian_matSEXP, SEXP x_train_matSEXP, SEXP SSEXP, SEXP muSEXP, SEXP cov_matSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::mat& >::type MC_samples_mat(MC_samples_matSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type x_explain_mat(x_explain_matSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type x_explain_gaussian_mat(x_explain_gaussian_matSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type x_train_mat(x_train_matSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type S(SSEXP); + Rcpp::traits::input_parameter< const arma::vec& >::type mu(muSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type cov_mat(cov_matSEXP); + rcpp_result_gen = Rcpp::wrap(prepare_data_copula_cpp(MC_samples_mat, x_explain_mat, x_explain_gaussian_mat, x_train_mat, S, mu, cov_mat)); + return rcpp_result_gen; +END_RCPP +} +// prepare_data_gaussian_cpp +arma::cube prepare_data_gaussian_cpp(const arma::mat& MC_samples_mat, const arma::mat& x_explain_mat, const arma::mat& S, const arma::vec& mu, const arma::mat& cov_mat); +RcppExport SEXP _shapr_prepare_data_gaussian_cpp(SEXP MC_samples_matSEXP, SEXP x_explain_matSEXP, SEXP SSEXP, SEXP muSEXP, SEXP cov_matSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::mat& >::type MC_samples_mat(MC_samples_matSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type x_explain_mat(x_explain_matSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type S(SSEXP); + Rcpp::traits::input_parameter< const arma::vec& >::type mu(muSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type cov_mat(cov_matSEXP); + rcpp_result_gen = Rcpp::wrap(prepare_data_gaussian_cpp(MC_samples_mat, x_explain_mat, S, mu, cov_mat)); + return rcpp_result_gen; +END_RCPP +} // mahalanobis_distance_cpp arma::cube mahalanobis_distance_cpp(Rcpp::List featureList, arma::mat Xtrain_mat, arma::mat Xtest_mat, arma::mat mcov, bool S_scale_dist); RcppExport SEXP _shapr_mahalanobis_distance_cpp(SEXP featureListSEXP, SEXP Xtrain_matSEXP, SEXP Xtest_matSEXP, SEXP mcovSEXP, SEXP S_scale_distSEXP) { @@ -155,6 +211,10 @@ static const R_CallMethodDef CallEntries[] = { {"_shapr_correction_matrix_cpp", (DL_FUNC) &_shapr_correction_matrix_cpp, 2}, {"_shapr_aicc_full_single_cpp", (DL_FUNC) &_shapr_aicc_full_single_cpp, 5}, {"_shapr_aicc_full_cpp", (DL_FUNC) &_shapr_aicc_full_cpp, 6}, + {"_shapr_quantile_type7_cpp", (DL_FUNC) &_shapr_quantile_type7_cpp, 2}, + {"_shapr_inv_gaussian_transform_cpp", (DL_FUNC) &_shapr_inv_gaussian_transform_cpp, 2}, + {"_shapr_prepare_data_copula_cpp", (DL_FUNC) &_shapr_prepare_data_copula_cpp, 7}, + {"_shapr_prepare_data_gaussian_cpp", (DL_FUNC) &_shapr_prepare_data_gaussian_cpp, 5}, {"_shapr_mahalanobis_distance_cpp", (DL_FUNC) &_shapr_mahalanobis_distance_cpp, 5}, {"_shapr_sample_features_cpp", (DL_FUNC) &_shapr_sample_features_cpp, 2}, {"_shapr_observation_impute_cpp", (DL_FUNC) &_shapr_observation_impute_cpp, 5}, diff --git a/tests/testthat/_snaps/output.md b/tests/testthat/_snaps/output.md index 9a84b0089..bd2dd91fe 100644 --- a/tests/testthat/_snaps/output.md +++ b/tests/testthat/_snaps/output.md @@ -81,20 +81,20 @@ Code (out <- code) Output - none Solar.R Wind Temp Month Day - 1: 42.44 -8.545 7.779 14.586 0.4475 -1.6653 - 2: 42.44 4.826 -4.295 -11.655 -1.1250 -1.6309 - 3: 42.44 7.163 -25.491 0.368 -0.5455 0.9377 + none Solar.R Wind Temp Month Day + 1: 42.44 -8.117 7.438 14.0026 0.8602 -1.5813 + 2: 42.44 5.278 -5.219 -12.1079 -0.8073 -1.0235 + 3: 42.44 7.867 -25.995 -0.1377 -0.2368 0.9342 # output_lm_numeric_copula Code (out <- code) Output - none Solar.R Wind Temp Month Day - 1: 42.44 -6.371 7.355 14.470 -0.6108 -2.241 - 2: 42.44 4.115 -4.159 -9.980 -1.9378 -1.917 - 3: 42.44 5.932 -25.086 1.857 -1.3624 1.090 + none Solar.R Wind Temp Month Day + 1: 42.44 -5.960 7.046 13.863 -0.274 -2.074 + 2: 42.44 4.482 -4.892 -10.491 -1.659 -1.319 + 3: 42.44 6.587 -25.533 1.279 -1.043 1.142 # output_lm_numeric_ctree @@ -150,20 +150,20 @@ Code (out <- code) Output - none Solar.R Wind Temp Month Day - 1: 42.44 -8.809 9.149 15.503 -2.8888 -0.3522 - 2: 42.44 3.146 -4.566 -7.727 -4.3771 -0.3559 - 3: 42.44 6.655 -22.559 -1.645 0.5634 -0.5832 + none Solar.R Wind Temp Month Day + 1: 42.44 -8.746 9.03 15.366 -2.619 -0.4293 + 2: 42.44 3.126 -4.50 -7.789 -4.401 -0.3161 + 3: 42.44 7.037 -22.86 -1.837 0.607 -0.5181 # output_lm_numeric_comb2 Code (out <- code) Output - none Solar.R Wind Temp Month Day - 1: 42.44 -9.302 9.454 17.2106 -1.767 -2.9936 - 2: 42.44 5.189 -5.352 -8.5382 -2.854 -2.3245 - 3: 42.44 6.388 -22.748 0.0177 -1.441 0.2159 + none Solar.R Wind Temp Month Day + 1: 42.44 -9.294 9.327 17.31641 -1.754 -2.9935 + 2: 42.44 5.194 -5.506 -8.45049 -2.935 -2.1810 + 3: 42.44 6.452 -22.967 -0.09553 -1.310 0.3519 # output_lm_numeric_comb3 @@ -171,9 +171,9 @@ (out <- code) Output none Solar.R Wind Temp Month Day - 1: 42.44 -6.940 10.773 12.187 -3.692 0.27495 - 2: 42.44 2.628 -2.656 -8.569 -5.313 0.03032 - 3: 42.44 5.827 -22.183 3.440 -2.954 -1.69839 + 1: 42.44 -6.952 10.777 12.160 -3.641 0.25767 + 2: 42.44 2.538 -2.586 -8.503 -5.376 0.04789 + 3: 42.44 5.803 -22.122 3.362 -2.926 -1.68514 # output_lm_mixed_independence diff --git a/tests/testthat/_snaps/output/output_lm_numeric_comb1.rds b/tests/testthat/_snaps/output/output_lm_numeric_comb1.rds index d0dbda0e2..87eb74630 100644 Binary files a/tests/testthat/_snaps/output/output_lm_numeric_comb1.rds and b/tests/testthat/_snaps/output/output_lm_numeric_comb1.rds differ diff --git a/tests/testthat/_snaps/output/output_lm_numeric_comb2.rds b/tests/testthat/_snaps/output/output_lm_numeric_comb2.rds index 3cd50d84d..fc125c859 100644 Binary files a/tests/testthat/_snaps/output/output_lm_numeric_comb2.rds and b/tests/testthat/_snaps/output/output_lm_numeric_comb2.rds differ diff --git a/tests/testthat/_snaps/output/output_lm_numeric_comb3.rds b/tests/testthat/_snaps/output/output_lm_numeric_comb3.rds index 630236b01..7ab8d808d 100644 Binary files a/tests/testthat/_snaps/output/output_lm_numeric_comb3.rds and b/tests/testthat/_snaps/output/output_lm_numeric_comb3.rds differ diff --git a/tests/testthat/_snaps/output/output_lm_numeric_copula.rds b/tests/testthat/_snaps/output/output_lm_numeric_copula.rds index 6ac05fd92..b74a9cf5e 100644 Binary files a/tests/testthat/_snaps/output/output_lm_numeric_copula.rds and b/tests/testthat/_snaps/output/output_lm_numeric_copula.rds differ diff --git a/tests/testthat/_snaps/output/output_lm_numeric_gaussian.rds b/tests/testthat/_snaps/output/output_lm_numeric_gaussian.rds index 5e95e0b14..d941cac15 100644 Binary files a/tests/testthat/_snaps/output/output_lm_numeric_gaussian.rds and b/tests/testthat/_snaps/output/output_lm_numeric_gaussian.rds differ diff --git a/tests/testthat/_snaps/plot/msev-bar-50-ci.svg b/tests/testthat/_snaps/plot/msev-bar-50-ci.svg index ff95215fc..7d5630803 100644 --- a/tests/testthat/_snaps/plot/msev-bar-50-ci.svg +++ b/tests/testthat/_snaps/plot/msev-bar-50-ci.svg @@ -43,21 +43,21 @@ - + - + - - - + + + - - - + + + 0 diff --git a/tests/testthat/_snaps/plot/msev-bar-with-ci-different-width.svg b/tests/testthat/_snaps/plot/msev-bar-with-ci-different-width.svg index 17d6e9ec2..c6d232890 100644 --- a/tests/testthat/_snaps/plot/msev-bar-with-ci-different-width.svg +++ b/tests/testthat/_snaps/plot/msev-bar-with-ci-different-width.svg @@ -28,9 +28,9 @@ - + - + diff --git a/tests/testthat/_snaps/plot/msev-bar-without-ci.svg b/tests/testthat/_snaps/plot/msev-bar-without-ci.svg index 5b31384c1..1be15b58a 100644 --- a/tests/testthat/_snaps/plot/msev-bar-without-ci.svg +++ b/tests/testthat/_snaps/plot/msev-bar-without-ci.svg @@ -41,9 +41,9 @@ - + - + 0 diff --git a/tests/testthat/_snaps/plot/msev-bar.svg b/tests/testthat/_snaps/plot/msev-bar.svg index f7b6b2e15..57d503c70 100644 --- a/tests/testthat/_snaps/plot/msev-bar.svg +++ b/tests/testthat/_snaps/plot/msev-bar.svg @@ -28,21 +28,21 @@ - + - + - - - + + + - - - + + + diff --git a/tests/testthat/_snaps/plot/msev-combination-bar-specified-width.svg b/tests/testthat/_snaps/plot/msev-combination-bar-specified-width.svg index 7d95ab35d..9a7ee4506 100644 --- a/tests/testthat/_snaps/plot/msev-combination-bar-specified-width.svg +++ b/tests/testthat/_snaps/plot/msev-combination-bar-specified-width.svg @@ -57,36 +57,36 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -132,16 +132,16 @@ - - - - - - - - - - + + + + + + + + + + diff --git a/tests/testthat/_snaps/plot/msev-combination-bar.svg b/tests/testthat/_snaps/plot/msev-combination-bar.svg index 9d2de04ab..42a758234 100644 --- a/tests/testthat/_snaps/plot/msev-combination-bar.svg +++ b/tests/testthat/_snaps/plot/msev-combination-bar.svg @@ -57,36 +57,36 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -132,16 +132,16 @@ - - - - - - - - - - + + + + + + + + + + @@ -237,96 +237,96 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -462,36 +462,36 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/testthat/_snaps/plot/msev-combination-line-point.svg b/tests/testthat/_snaps/plot/msev-combination-line-point.svg index 1b229513f..65b02981c 100644 --- a/tests/testthat/_snaps/plot/msev-combination-line-point.svg +++ b/tests/testthat/_snaps/plot/msev-combination-line-point.svg @@ -57,36 +57,36 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -132,25 +132,25 @@ - - - - - - - - - - + + + + + + + + + + - + - + diff --git a/tests/testthat/_snaps/plot/msev-explicand-bar-specified-width.svg b/tests/testthat/_snaps/plot/msev-explicand-bar-specified-width.svg index 4bb61aa68..c3be8d1e8 100644 --- a/tests/testthat/_snaps/plot/msev-explicand-bar-specified-width.svg +++ b/tests/testthat/_snaps/plot/msev-explicand-bar-specified-width.svg @@ -27,27 +27,27 @@ - - - - - + + + + + - - - - - - + + + + + + 0 -100 -200 +100 +200 - - + + diff --git a/tests/testthat/_snaps/plot/msev-explicand-bar.svg b/tests/testthat/_snaps/plot/msev-explicand-bar.svg index 3c6d7b21f..02cdd7d64 100644 --- a/tests/testthat/_snaps/plot/msev-explicand-bar.svg +++ b/tests/testthat/_snaps/plot/msev-explicand-bar.svg @@ -27,27 +27,27 @@ - - - - - + + + + + - - - - - - + + + + + + 0 -100 -200 +100 +200 - - + + diff --git a/tests/testthat/_snaps/plot/msev-explicand-for-specified-observations.svg b/tests/testthat/_snaps/plot/msev-explicand-for-specified-observations.svg index 77659a782..87b0706fd 100644 --- a/tests/testthat/_snaps/plot/msev-explicand-for-specified-observations.svg +++ b/tests/testthat/_snaps/plot/msev-explicand-for-specified-observations.svg @@ -27,23 +27,23 @@ - - - + + + - - - - + + + + 0 -100 -200 +100 +200 - - + + 1 diff --git a/tests/testthat/_snaps/plot/msev-explicand-line-point.svg b/tests/testthat/_snaps/plot/msev-explicand-line-point.svg index e3afb1ef1..13aa02cfb 100644 --- a/tests/testthat/_snaps/plot/msev-explicand-line-point.svg +++ b/tests/testthat/_snaps/plot/msev-explicand-line-point.svg @@ -27,33 +27,33 @@ - + - - - + + + - - - - - - - - - - + + + + + + + + + + -100 -150 -200 -250 - - - - +100 +150 +200 +250 + + + +