Skip to content

Commit

Permalink
Improving the efficiency of the Gaussian and (Gaussian) copula methods (
Browse files Browse the repository at this point in the history
  • Loading branch information
LHBO authored Jan 15, 2024
1 parent 579724b commit f940886
Show file tree
Hide file tree
Showing 40 changed files with 5,275 additions and 665 deletions.
14 changes: 2 additions & 12 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,22 @@
Rprof.out
src/*.o
src/*.so

*.o

*.so

tmp/

\.RDataTmp

*.out

*.Rout

src-i386/

src-x64/

paper_experiments/res/single_res/

paper_experiments/res/single_res_old/
inst/doc

docs/*
doc
Meta
docs
/doc/
/Meta/
.idea
.idea
.DS_Store
2 changes: 0 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,6 @@ Imports:
stats,
data.table,
Rcpp (>= 0.12.15),
condMVNorm,
mvnfast,
Matrix,
future.apply
Suggests:
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ export(observation_impute_cpp)
export(plot_MSEv_eval_crit)
export(predict_model)
export(prepare_data)
export(prepare_data_copula_cpp)
export(prepare_data_gaussian_cpp)
export(rss_cpp)
export(setup)
export(setup_approach)
Expand Down Expand Up @@ -96,6 +98,7 @@ importFrom(stats,model.matrix)
importFrom(stats,predict)
importFrom(stats,pt)
importFrom(stats,qt)
importFrom(stats,rnorm)
importFrom(stats,sd)
importFrom(stats,setNames)
importFrom(utils,head)
Expand Down
82 changes: 82 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,88 @@ aicc_full_cpp <- function(h, X_list, mcov_list, S_scale_dist, y_list, negative)
.Call(`_shapr_aicc_full_cpp`, h, X_list, mcov_list, S_scale_dist, y_list, negative)
}

#' Compute the quantiles using quantile type seven
#'
#' @details Using quantile type number seven from stats::quantile in R.
#'
#' @param x arma::vec. Numeric vector whose sample quantiles are wanted.
#' @param probs arma::vec. Numeric vector of probabilities with values between zero and one.
#'
#' @return A vector of length `length(probs)` with the quantiles is returned.
#'
#' @keywords internal
#' @author Lars Henry Berge Olsen
quantile_type7_cpp <- function(x, probs) {
.Call(`_shapr_quantile_type7_cpp`, x, probs)
}

#' Transforms new data to a standardized normal distribution
#'
#' @param z arma::mat. The data are the Gaussian Monte Carlos samples to transform.
#' @param x arma::mat. The data with the original transformation. Used to conduct the transformation of `z`.
#'
#' @return arma::mat of the same dimension as `z`
#'
#' @keywords internal
#' @author Lars Henry Berge Olsen
inv_gaussian_transform_cpp <- function(z, x) {
.Call(`_shapr_inv_gaussian_transform_cpp`, z, x)
}

#' Generate (Gaussian) Copula MC samples
#'
#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the
#' univariate standard normal.
#' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations
#' to explain on the original scale.
#' @param x_explain_gaussian_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the
#' observations to explain after being transformed using the Gaussian transform, i.e., the samples have been
#' transformed to a standardized normal distribution.
#' @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations.
#' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
#' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
#' This is not a problem internally in shapr as the empty and grand coalitions treated differently.
#' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed
#' using the Gaussian transform, i.e., the samples have been transformed to a standardized normal distribution.
#' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance
#' between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been
#' transformed to a standardized normal distribution.
#'
#' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where
#' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian
#' copula MC samples for each explicand and coalition on the original scale.
#'
#' @export
#' @keywords internal
#' @author Lars Henry Berge Olsen
prepare_data_copula_cpp <- function(MC_samples_mat, x_explain_mat, x_explain_gaussian_mat, x_train_mat, S, mu, cov_mat) {
.Call(`_shapr_prepare_data_copula_cpp`, MC_samples_mat, x_explain_mat, x_explain_gaussian_mat, x_train_mat, S, mu, cov_mat)
}

#' Generate Gaussian MC samples
#'
#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_samples`, `n_features`) containing samples from the
#' univariate standard normal.
#' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations
#' to explain.
#' @param S arma::mat. Matrix of dimension (`n_combinations`, `n_features`) containing binary representations of
#' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
#' This is not a problem internally in shapr as the empty and grand coalitions treated differently.
#' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature.
#' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance
#' between all pairs of features.
#'
#' @return An arma::cube/3D array of dimension (`n_samples`, `n_explain` * `n_coalitions`, `n_features`), where
#' the columns (_,j,_) are matrices of dimension (`n_samples`, `n_features`) containing the conditional Gaussian
#' MC samples for each explicand and coalition.
#'
#' @export
#' @keywords internal
#' @author Lars Henry Berge Olsen
prepare_data_gaussian_cpp <- function(MC_samples_mat, x_explain_mat, S, mu, cov_mat) {
.Call(`_shapr_prepare_data_gaussian_cpp`, MC_samples_mat, x_explain_mat, S, mu, cov_mat)
}

#' (Generalized) Mahalanobis distance
#'
#' Used to get the Euclidean distance as well by setting \code{mcov} = \code{diag(m)}.
Expand Down
190 changes: 56 additions & 134 deletions R/approach_copula.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
#' @rdname setup_approach
#'
#' @inheritParams default_doc_explain
#'
#' @export
#' @author Martin Jullum
setup_approach.copula <- function(internal, ...) {
parameters <- internal$parameters
x_train <- internal$data$x_train
x_explain <- internal$data$x_explain
x_train_mat <- as.matrix(internal$data$x_train)
x_explain_mat <- as.matrix(internal$data$x_explain)

# Checking if factor features are present
feature_specs <- internal$objects$feature_specs
Expand All @@ -23,159 +22,97 @@ setup_approach.copula <- function(internal, ...) {
}

# Prepare transformed data
parameters$copula.mu <- rep(0, ncol(x_train))
x_train0 <- apply(
X = x_train,
MARGIN = 2,
FUN = gaussian_transform
)
parameters$copula.cov_mat <- get_cov_mat(x_train0)

parameters$copula.mu <- rep(0, ncol(x_train_mat))
x_train_mat0 <- apply(X = x_train_mat, MARGIN = 2, FUN = gaussian_transform)
parameters$copula.cov_mat <- get_cov_mat(x_train_mat0)

x_explain_gaussian <- apply(
X = rbind(x_explain, x_train),
X = rbind(x_explain_mat, x_train_mat),
MARGIN = 2,
FUN = gaussian_transform_separate,
n_y = nrow(x_explain)
n_y = nrow(x_explain_mat)
)
if (is.null(dim(x_explain_gaussian))) x_explain_gaussian <- t(as.matrix(x_explain_gaussian))

if (is.null(dim(x_explain_gaussian))) {
x_explain_gaussian <- t(as.matrix(x_explain_gaussian))
}
# TODO: Change to this a data.table for consistency (not speed/memory)
internal$data$copula.x_explain_gaussian <- x_explain_gaussian
# Add objects to internal list
internal$parameters <- parameters
internal$data$copula.x_explain_gaussian <- as.data.table(x_explain_gaussian)

return(internal)
}

#' @inheritParams default_doc
#' @rdname prepare_data
#' @export
prepare_data.copula <- function(internal, index_features = NULL, ...) {
x_train <- internal$data$x_train
x_explain <- internal$data$x_explain
#' @author Lars Henry Berge Olsen
prepare_data.copula <- function(internal, index_features, ...) {
# Extract used variables
S <- internal$objects$S[index_features, , drop = FALSE]
feature_names <- internal$parameters$feature_names
n_explain <- internal$parameters$n_explain
copula.cov_mat <- internal$parameters$copula.cov_mat
n_samples <- internal$parameters$n_samples
copula.mu <- internal$parameters$copula.mu
n_features <- internal$parameters$n_features
n_combinations_now <- length(index_features)
x_train_mat <- as.matrix(internal$data$x_train)
x_explain_mat <- as.matrix(internal$data$x_explain)
copula.mu <- internal$parameters$copula.mu
copula.cov_mat <- internal$parameters$copula.cov_mat
copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian)

# Generate the MC samples from N(0, 1)
MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features)

# Use C++ to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}), for all coalitions and explicands,
# and then transforming them back to the original scale using the inverse Gaussian transform in C++.
# The object `dt` is a 3D array of dimension (n_samples, n_explain * n_coalitions, n_features).
dt <- prepare_data_copula_cpp(
MC_samples_mat = MC_samples_mat,
x_explain_mat = x_explain_mat,
x_explain_gaussian_mat = copula.x_explain_gaussian_mat,
x_train_mat = x_train_mat,
S = S,
mu = copula.mu,
cov_mat = copula.cov_mat
)

copula.x_explain_gaussian <- internal$data$copula.x_explain_gaussian
X <- internal$objects$X


x_explain0 <- as.matrix(x_explain)
dt_l <- list()
if (is.null(index_features)) {
features <- X$features
} else {
features <- X$features[index_features]
}

# Reshape `dt` to a 2D array of dimension (n_samples * n_explain * n_coalitions, n_features).
dim(dt) <- c(n_combinations_now * n_explain * n_samples, n_features)

for (i in seq_len(n_explain)) {
l <- lapply(
X = features,
FUN = sample_copula,
n_samples = n_samples,
mu = copula.mu,
cov_mat = copula.cov_mat,
m = n_features,
x_explain = x_explain0[i, , drop = FALSE],
x_train = as.matrix(x_train),
x_explain_gaussian = copula.x_explain_gaussian[i, , drop = FALSE]
)
# Convert to a data.table and add extra identification columns
dt <- data.table::as.data.table(dt)
data.table::setnames(dt, feature_names)
dt[, id_combination := rep(seq_len(nrow(S)), each = n_samples * n_explain)]
dt[, id := rep(seq(n_explain), each = n_samples, times = nrow(S))]
dt[, w := 1 / n_samples]
dt[, id_combination := index_features[id_combination]]
data.table::setcolorder(dt, c("id_combination", "id", feature_names))

dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination")
dt_l[[i]][, w := 1 / n_samples]
dt_l[[i]][, id := i]
if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]]
}
dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE)
return(dt)
}

#' Sample conditional variables using the Gaussian copula approach
#'
#' @param index_given Integer vector. The indices of the features to condition upon. Note that
#' `min(index_given) >= 1` and `max(index_given) <= m`.
#' @param m Positive integer. The total number of features.
#' @param x_explain_gaussian Numeric matrix. Contains the observation whose predictions ought
#' to be explained (test data),
#' after quantile-transforming them to standard Gaussian variables.
#' @param x_explain Numeric matrix. Contains the features of the observation whose
#' predictions ought to be explained (test data).
#'
#' @return data.table
#'
#' @keywords internal
#'
#' @author Martin Jullum
sample_copula <- function(index_given, n_samples, mu, cov_mat, m, x_explain_gaussian, x_train, x_explain) {
# Handles the unconditional and full conditional separtely when predicting
if (length(index_given) %in% c(0, m)) {
ret <- matrix(x_explain, ncol = m, nrow = 1)
} else {
dependent_ind <- (seq_len(length(mu)))[-index_given]

tmp <- condMVNorm::condMVN(
mean = mu,
sigma = cov_mat,
dependent.ind = dependent_ind,
given.ind = index_given,
X.given = x_explain_gaussian[index_given]
)

ret0_z <- mvnfast::rmvn(n = n_samples, mu = tmp$condMean, sigma = tmp$condVar)

ret0_x <- apply(
X = rbind(ret0_z, x_train[, dependent_ind, drop = FALSE]),
MARGIN = 2,
FUN = inv_gaussian_transform,
n_z = n_samples
)

ret <- matrix(NA, ncol = m, nrow = n_samples)
ret[, index_given] <- rep(x_explain[index_given], each = n_samples)
ret[, dependent_ind] <- ret0_x
}
colnames(ret) <- colnames(x_explain)
return(as.data.table(ret))
}


#' Transforms new data to a standardized normal distribution
#' Transforms a sample to standardized normal distribution
#'
#' @param zx Numeric vector. The first `n_z` items are the Gaussian data, and the last part is
#' the data with the original transformation.
#' @param n_z Positive integer. Number of elements of `zx` that belongs to new data.
#' @param x Numeric vector.The data which should be transformed to a standard normal distribution.
#'
#' @return Numeric vector of length `n_z`
#' @return Numeric vector of length `length(x)`
#'
#' @keywords internal
#'
#' @author Martin Jullum
inv_gaussian_transform <- function(zx, n_z) {
if (n_z >= length(zx)) stop("n_z should be less than length(zx)")
ind <- 1:n_z
z <- zx[ind]
x <- zx[-ind]
u <- stats::pnorm(z)
x_new <- stats::quantile(x, probs = u)
return(as.double(x_new))
gaussian_transform <- function(x) {
u <- rank(x) / (length(x) + 1)
z <- stats::qnorm(u)
return(z)
}

#' Transforms new data to standardized normal (dimension 1) based on other data transformations
#'
#' @param yx Numeric vector. The first `n_y` items is the data that is transformed, and last
#' part is the data with the original transformation.
#' @param n_y Positive integer. Number of elements of `yx` that belongs to the gaussian data.
#' @param n_y Positive integer. Number of elements of `yx` that belongs to the Gaussian data.
#'
#' @return Vector of back-transformed Gaussian data
#'
#' @keywords internal
#'
#' @author Martin Jullum
gaussian_transform_separate <- function(yx, n_y) {
if (n_y >= length(yx)) stop("n_y should be less than length(yx)")
Expand All @@ -187,18 +124,3 @@ gaussian_transform_separate <- function(yx, n_y) {
z_y <- stats::qnorm(u_y)
return(z_y)
}

#' Transforms a sample to standardized normal distribution
#'
#' @param x Numeric vector.The data which should be transformed to a standard normal distribution.
#'
#' @return Numeric vector of length `length(x)`
#'
#' @keywords internal
#'
#' @author Martin Jullum
gaussian_transform <- function(x) {
u <- rank(x) / (length(x) + 1)
z <- stats::qnorm(u)
return(z)
}
Loading

0 comments on commit f940886

Please sign in to comment.