Improving the efficiency of the Gaussian and (Gaussian) copula methods (

#366)
NorskRegnesentral · Jan 15, 2024 · f940886 · f940886
1 parent 579724b
commit f940886
Show file tree

Hide file tree

Showing 40 changed files with 5,275 additions and 665 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,32 +5,22 @@
 Rprof.out
 src/*.o
 src/*.so
-
 *.o
-
 *.so
-
 tmp/
-
 \.RDataTmp
-
 *.out
-
 *.Rout
-
 src-i386/
-
 src-x64/
-
 paper_experiments/res/single_res/
-
 paper_experiments/res/single_res_old/
 inst/doc
-
 docs/*
 doc
 Meta
 docs
 /doc/
 /Meta/
-.idea
+.idea
+.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -32,8 +32,6 @@ Imports:
     stats,
     data.table,
     Rcpp (>= 0.12.15),
-    condMVNorm,
-    mvnfast,
     Matrix,
     future.apply
 Suggests: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -63,6 +63,8 @@ export(observation_impute_cpp)
 export(plot_MSEv_eval_crit)
 export(predict_model)
 export(prepare_data)
+export(prepare_data_copula_cpp)
+export(prepare_data_gaussian_cpp)
 export(rss_cpp)
 export(setup)
 export(setup_approach)
@@ -96,6 +98,7 @@ importFrom(stats,model.matrix)
 importFrom(stats,predict)
 importFrom(stats,pt)
 importFrom(stats,qt)
+importFrom(stats,rnorm)
 importFrom(stats,sd)
 importFrom(stats,setNames)
 importFrom(utils,head)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -80,6 +80,88 @@ aicc_full_cpp <- function(h, X_list, mcov_list, S_scale_dist, y_list, negative)
     .Call(`_shapr_aicc_full_cpp`, h, X_list, mcov_list, S_scale_dist, y_list, negative)
 }
 
+#' Compute the quantiles using quantile type seven
+#'
+#' @details Using quantile type number seven from stats::quantile in R.
+#'
+#' @param x arma::vec. Numeric vector whose sample quantiles are wanted.
+#' @param probs arma::vec. Numeric vector of probabilities with values between zero and one.
+#'
+#' @return A vector of length `length(probs)` with the quantiles is returned.
+#'
+#' @keywords internal
+#' @author Lars Henry Berge Olsen
+quantile_type7_cpp <- function(x, probs) {
+    .Call(`_shapr_quantile_type7_cpp`, x, probs)
+}
+
+#' Transforms new data to a standardized normal distribution
+#'
+#' @param z arma::mat. The data are the Gaussian Monte Carlos samples to transform.
+#' @param x arma::mat. The data with the original transformation. Used to conduct the transformation of `z`.
+#'
+#' @return arma::mat of the same dimension as `z`
+#'
+#' @keywords internal
+#' @author Lars Henry Berge Olsen
+inv_gaussian_transform_cpp <- function(z, x) {
+    .Call(`_shapr_inv_gaussian_transform_cpp`, z, x)
+}
+
+#' Generate (Gaussian) Copula MC samples
+#'
+#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the
+#' univariate standard normal.
+#' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations
+#' to explain on the original scale.
+#' @param x_explain_gaussian_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the
+#' observations to explain after being transformed using the Gaussian transform, i.e., the samples have been
+#' transformed to a standardized normal distribution.
+#' @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations.
+#' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
+#' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
+#' This is not a problem internally in shapr as the empty and grand coalitions treated differently.
+#' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed
+#' using the Gaussian transform, i.e., the samples have been transformed to a standardized normal distribution.
+#' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance
+#' between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been
+#' transformed to a standardized normal distribution.
+#'
+#' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where
+#' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian
+#' copula MC samples for each explicand and coalition on the original scale.
+#'
+#' @export
+#' @keywords internal
+#' @author Lars Henry Berge Olsen
+prepare_data_copula_cpp <- function(MC_samples_mat, x_explain_mat, x_explain_gaussian_mat, x_train_mat, S, mu, cov_mat) {
+    .Call(`_shapr_prepare_data_copula_cpp`, MC_samples_mat, x_explain_mat, x_explain_gaussian_mat, x_train_mat, S, mu, cov_mat)
+}
+
+#' Generate Gaussian MC samples
+#'
+#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the
+#' univariate standard normal.
+#' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations
+#' to explain.
+#' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
+#' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
+#' This is not a problem internally in shapr as the empty and grand coalitions treated differently.
+#' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature.
+#' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance
+#' between all pairs of features.
+#'
+#' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where
+#' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian
+#' MC samples for each explicand and coalition.
+#'
+#' @export
+#' @keywords internal
+#' @author Lars Henry Berge Olsen
+prepare_data_gaussian_cpp <- function(MC_samples_mat, x_explain_mat, S, mu, cov_mat) {
+    .Call(`_shapr_prepare_data_gaussian_cpp`, MC_samples_mat, x_explain_mat, S, mu, cov_mat)
+}
+
 #' (Generalized) Mahalanobis distance
 #'
 #' Used to get the Euclidean distance as well by setting \code{mcov} = \code{diag(m)}.

diff --git a/R/approach_copula.R b/R/approach_copula.R
@@ -1,12 +1,11 @@
 #' @rdname setup_approach
-#'
 #' @inheritParams default_doc_explain
-#'
 #' @export
+#' @author Martin Jullum
 setup_approach.copula <- function(internal, ...) {
   parameters <- internal$parameters
-  x_train <- internal$data$x_train
-  x_explain <- internal$data$x_explain
+  x_train_mat <- as.matrix(internal$data$x_train)
+  x_explain_mat <- as.matrix(internal$data$x_explain)
 
   # Checking if factor features are present
   feature_specs <- internal$objects$feature_specs
@@ -23,159 +22,97 @@ setup_approach.copula <- function(internal, ...) {
   }
 
   # Prepare transformed data
-  parameters$copula.mu <- rep(0, ncol(x_train))
-  x_train0 <- apply(
-    X = x_train,
-    MARGIN = 2,
-    FUN = gaussian_transform
-  )
-  parameters$copula.cov_mat <- get_cov_mat(x_train0)
-
+  parameters$copula.mu <- rep(0, ncol(x_train_mat))
+  x_train_mat0 <- apply(X = x_train_mat, MARGIN = 2, FUN = gaussian_transform)
+  parameters$copula.cov_mat <- get_cov_mat(x_train_mat0)
 
   x_explain_gaussian <- apply(
-    X = rbind(x_explain, x_train),
+    X = rbind(x_explain_mat, x_train_mat),
     MARGIN = 2,
     FUN = gaussian_transform_separate,
-    n_y = nrow(x_explain)
+    n_y = nrow(x_explain_mat)
   )
+  if (is.null(dim(x_explain_gaussian))) x_explain_gaussian <- t(as.matrix(x_explain_gaussian))
 
-  if (is.null(dim(x_explain_gaussian))) {
-    x_explain_gaussian <- t(as.matrix(x_explain_gaussian))
-  }
-  # TODO: Change to this a data.table for consistency (not speed/memory)
-  internal$data$copula.x_explain_gaussian <- x_explain_gaussian
+  # Add objects to internal list
   internal$parameters <- parameters
+  internal$data$copula.x_explain_gaussian <- as.data.table(x_explain_gaussian)
 
   return(internal)
 }
 
 #' @inheritParams default_doc
 #' @rdname prepare_data
 #' @export
-prepare_data.copula <- function(internal, index_features = NULL, ...) {
-  x_train <- internal$data$x_train
-  x_explain <- internal$data$x_explain
+#' @author Lars Henry Berge Olsen
+prepare_data.copula <- function(internal, index_features, ...) {
+  # Extract used variables
+  S <- internal$objects$S[index_features, , drop = FALSE]
+  feature_names <- internal$parameters$feature_names
   n_explain <- internal$parameters$n_explain
-  copula.cov_mat <- internal$parameters$copula.cov_mat
   n_samples <- internal$parameters$n_samples
-  copula.mu <- internal$parameters$copula.mu
   n_features <- internal$parameters$n_features
+  n_combinations_now <- length(index_features)
+  x_train_mat <- as.matrix(internal$data$x_train)
+  x_explain_mat <- as.matrix(internal$data$x_explain)
+  copula.mu <- internal$parameters$copula.mu
+  copula.cov_mat <- internal$parameters$copula.cov_mat
+  copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian)
+
+  # Generate the MC samples from N(0, 1)
+  MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features)
+
+  # Use C++ to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}), for all coalitions and explicands,
+  # and then transforming them back to the original scale using the inverse Gaussian transform in C++.
+  # The object `dt` is a 3D array of dimension (n_samples, n_explain * n_coalitions, n_features).
+  dt <- prepare_data_copula_cpp(
+    MC_samples_mat = MC_samples_mat,
+    x_explain_mat = x_explain_mat,
+    x_explain_gaussian_mat = copula.x_explain_gaussian_mat,
+    x_train_mat = x_train_mat,
+    S = S,
+    mu = copula.mu,
+    cov_mat = copula.cov_mat
+  )
 
-  copula.x_explain_gaussian <- internal$data$copula.x_explain_gaussian
-  X <- internal$objects$X
-
-
-  x_explain0 <- as.matrix(x_explain)
-  dt_l <- list()
-  if (is.null(index_features)) {
-    features <- X$features
-  } else {
-    features <- X$features[index_features]
-  }
-
+  # Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features).
+  dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features)
 
-  for (i in seq_len(n_explain)) {
-    l <- lapply(
-      X = features,
-      FUN = sample_copula,
-      n_samples = n_samples,
-      mu = copula.mu,
-      cov_mat = copula.cov_mat,
-      m = n_features,
-      x_explain = x_explain0[i, , drop = FALSE],
-      x_train = as.matrix(x_train),
-      x_explain_gaussian = copula.x_explain_gaussian[i, , drop = FALSE]
-    )
+  # Convert to a data.table and add extra identification columns
+  dt <- data.table::as.data.table(dt)
+  data.table::setnames(dt, feature_names)
+  dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)]
+  dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))]
+  dt[, w := 1 / n_samples]
+  dt[, id_combination := index_features[id_combination]]
+  data.table::setcolorder(dt, c("id_combination", "id", feature_names))
 
-    dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination")
-    dt_l[[i]][, w := 1 / n_samples]
-    dt_l[[i]][, id := i]
-    if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]]
-  }
-  dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE)
   return(dt)
 }
 
-#' Sample conditional variables using the Gaussian copula approach
-#'
-#' @param index_given Integer vector. The indices of the features to condition upon. Note that
-#' `min(index_given) >= 1` and `max(index_given) <= m`.
-#' @param m Positive integer. The total number of features.
-#' @param x_explain_gaussian Numeric matrix. Contains the observation whose predictions ought
-#' to be explained (test data),
-#' after quantile-transforming them to standard Gaussian variables.
-#' @param x_explain Numeric matrix. Contains the features of the observation whose
-#' predictions ought to be explained (test data).
-#'
-#' @return data.table
-#'
-#' @keywords internal
-#'
-#' @author Martin Jullum
-sample_copula <- function(index_given, n_samples, mu, cov_mat, m, x_explain_gaussian, x_train, x_explain) {
-  # Handles the unconditional and full conditional separtely when predicting
-  if (length(index_given) %in% c(0, m)) {
-    ret <- matrix(x_explain, ncol = m, nrow = 1)
-  } else {
-    dependent_ind <- (seq_len(length(mu)))[-index_given]
-
-    tmp <- condMVNorm::condMVN(
-      mean = mu,
-      sigma = cov_mat,
-      dependent.ind = dependent_ind,
-      given.ind = index_given,
-      X.given = x_explain_gaussian[index_given]
-    )
-
-    ret0_z <- mvnfast::rmvn(n = n_samples, mu = tmp$condMean, sigma = tmp$condVar)
-
-    ret0_x <- apply(
-      X = rbind(ret0_z, x_train[, dependent_ind, drop = FALSE]),
-      MARGIN = 2,
-      FUN = inv_gaussian_transform,
-      n_z = n_samples
-    )
-
-    ret <- matrix(NA, ncol = m, nrow = n_samples)
-    ret[, index_given] <- rep(x_explain[index_given], each = n_samples)
-    ret[, dependent_ind] <- ret0_x
-  }
-  colnames(ret) <- colnames(x_explain)
-  return(as.data.table(ret))
-}
-
-
-#' Transforms new data to a standardized normal distribution
+#' Transforms a sample to standardized normal distribution
 #'
-#' @param zx Numeric vector. The first `n_z` items are the Gaussian data, and the last part is
-#' the data with the original transformation.
-#' @param n_z Positive integer. Number of elements of `zx` that belongs to new data.
+#' @param x Numeric vector.The data which should be transformed to a standard normal distribution.
 #'
-#' @return Numeric vector of length `n_z`
+#' @return Numeric vector of length `length(x)`
 #'
 #' @keywords internal
-#'
 #' @author Martin Jullum
-inv_gaussian_transform <- function(zx, n_z) {
-  if (n_z >= length(zx)) stop("n_z should be less than length(zx)")
-  ind <- 1:n_z
-  z <- zx[ind]
-  x <- zx[-ind]
-  u <- stats::pnorm(z)
-  x_new <- stats::quantile(x, probs = u)
-  return(as.double(x_new))
+gaussian_transform <- function(x) {
+  u <- rank(x) / (length(x) + 1)
+  z <- stats::qnorm(u)
+  return(z)
 }
 
 #' Transforms new data to standardized normal (dimension 1) based on other data transformations
 #'
 #' @param yx Numeric vector. The first `n_y` items is the data that is transformed, and last
 #' part is the data with the original transformation.
-#' @param n_y Positive integer. Number of elements of `yx` that belongs to the gaussian data.
+#' @param n_y Positive integer. Number of elements of `yx` that belongs to the Gaussian data.
 #'
 #' @return Vector of back-transformed Gaussian data
 #'
 #' @keywords internal
-#'
 #' @author Martin Jullum
 gaussian_transform_separate <- function(yx, n_y) {
   if (n_y >= length(yx)) stop("n_y should be less than length(yx)")
@@ -187,18 +124,3 @@ gaussian_transform_separate <- function(yx, n_y) {
   z_y <- stats::qnorm(u_y)
   return(z_y)
 }
-
-#' Transforms a sample to standardized normal distribution
-#'
-#' @param x Numeric vector.The data which should be transformed to a standard normal distribution.
-#'
-#' @return Numeric vector of length `length(x)`
-#'
-#' @keywords internal
-#'
-#' @author Martin Jullum
-gaussian_transform <- function(x) {
-  u <- rank(x) / (length(x) + 1)
-  z <- stats::qnorm(u)
-  return(z)
-}