Shapr version 1.0.0 (#402)
martinju authored Oct 23, 2024
1 parent ddd32c7 commit 4c7e223
Showing 483 changed files with 31,838 additions and 12,841 deletions.
26 changes: 16 additions & 10 deletions .Rprofile
@@ -1,3 +1,7 @@
+if (requireNamespace("testthat", quietly = TRUE)) {
+  testthat::set_max_fails(Inf)
+}
+
 #' Helper function for package development
 #'
 #' This is a manual extension of [testthat::snapshot_review()] which works for the \code{.rds} files used in
@@ -7,17 +11,19 @@
 #' @param ... Additional arguments passed to [waldo::compare()]
 #' Gives the relative path to the test files to review
 #'
-snapshot_review_man <- function(path, tolerance = NULL, ...) {
-  changed <- testthat:::snapshot_meta(path)
-  these_rds <- (tools::file_ext(changed$name) == "rds")
-  if (any(these_rds)) {
-    for (i in which(these_rds)) {
-      old <- readRDS(changed[i, "cur"])
-      new <- readRDS(changed[i, "new"])
+snapshot_review_man <- function(path, tolerance = 10^(-5), max_diffs = 200, ...) {
+  if (requireNamespace("testthat", quietly = TRUE) && requireNamespace("waldo", quietly = TRUE)) {
+    changed <- testthat:::snapshot_meta(path)
+    these_rds <- (tools::file_ext(changed$name) == "rds")
+    if (any(these_rds)) {
+      for (i in which(these_rds)) {
+        old <- readRDS(changed[i, "cur"])
+        new <- readRDS(changed[i, "new"])
 
-      cat(paste0("Difference for check ", changed[i, "name"], " in test ", changed[i, "test"], "\n"))
-      print(waldo::compare(old, new, max_diffs = 50, tolerance = tolerance, ...))
-      browser()
+        cat(paste0("Difference for check ", changed[i, "name"], " in test ", changed[i, "test"], "\n"))
+        print(waldo::compare(old, new, max_diffs = max_diffs, tolerance = tolerance, ...))
+        browser()
+      }
     }
   }
 }
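A minimal usage sketch of this helper (the snapshot directory name "output" is just a placeholder): after a test run has flagged changed .rds snapshots under tests/testthat/_snaps/, the function is called interactively from the package root and drops into browser() for each differing file.

# Hypothetical example: review the .rds snapshots in tests/testthat/_snaps/output/
# with a looser tolerance; the waldo::compare() diff is printed before browser() is entered
snapshot_review_man(path = "output", tolerance = 1e-4, max_diffs = 50)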
4 changes: 2 additions & 2 deletions .github/workflows/R-CMD-check.yaml
@@ -19,9 +19,9 @@
 
 on:
   push:
-    branches: [main, master, cranversion, devel]
+    branches: [main, master, cranversion, devel, 'shapr-1.0.0']
   pull_request:
-    branches: [main, master, cranversion, devel]
+    branches: [main, master, cranversion, devel, 'shapr-1.0.0']
 
 name: R-CMD-check
 
2 changes: 1 addition & 1 deletion .github/workflows/lint-changed-files.yaml
@@ -8,7 +8,7 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   pull_request:
-    branches: [main, master]
+    branches: [main, master, cranversion, devel, 'shapr-1.0.0']
 
 name: lint-changed-files
 
1 change: 1 addition & 0 deletions .lintr
@@ -8,6 +8,7 @@ linters: linters_with_defaults(
   )
 exclusions: list(
     "inst/scripts",
+    "inst/code_paper",
     "vignettes",
     "R/RcppExports.R",
     "R/zzz.R"
14 changes: 7 additions & 7 deletions DESCRIPTION
@@ -1,20 +1,19 @@
 Package: shapr
-Version: 0.2.3.9200
+Version: 1.0.0
 Title: Prediction Explanation with Dependence-Aware Shapley Values
 Description: Complex machine learning models are often hard to interpret. However, in
   many situations it is crucial to understand and explain why a model made a specific
   prediction. Shapley values is the only method for such prediction explanation framework
   with a solid theoretical foundation. Previously known methods for estimating the Shapley
-  values do, however, assume feature independence. This package implements the method
-  described in Aas, Jullum and Løland (2019) <arXiv:1903.10464>, which accounts for any feature
+  values do, however, assume feature independence. This package implements methods which accounts for any feature
   dependence, and thereby produces more accurate estimates of the true Shapley values.
   An accompanying Python wrapper (shaprpy) is available on GitHub.
 Authors@R: c(
-  person("Nikolai", "Sellereite", email = "[email protected]", role = "aut", comment = c(ORCID = "0000-0002-4671-0337")),
   person("Martin", "Jullum", email = "[email protected]", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-3908-5155")),
   person("Lars Henry Berge", "Olsen", email = "[email protected]", role = "aut", comment = c(ORCID = "0009-0006-9360-6993")),
   person("Annabelle", "Redelmeier", email = "[email protected]", role = "aut"),
-  person("Jon", "Lachmann", email = "[email protected]", role = "aut"),
+  person("Jon", "Lachmann", email = "[email protected]", role = "aut", comment = c(ORCID = "0000-0001-8396-5673")),
+  person("Nikolai", "Sellereite", email = "[email protected]", role = "aut", comment = c(ORCID = "0000-0002-4671-0337")),
   person("Anders", "Løland", email = "[email protected]", role = "ctb"),
   person("Jens Christian", "Wahl", email = "[email protected]", role = "ctb"),
   person("Camilla", "Lingjærde", role = "ctb"),
@@ -27,7 +26,7 @@ Encoding: UTF-8
 LazyData: true
 ByteCompile: true
 Language: en-US
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Depends: R (>= 3.5.0)
 Imports:
   stats,
@@ -66,7 +65,8 @@ Suggests:
   yardstick,
   hardhat,
   rsample,
-  rlang
+  rlang,
+  cli
 LinkingTo:
   RcppArmadillo,
   Rcpp
28 changes: 26 additions & 2 deletions NAMESPACE
@@ -52,19 +52,31 @@ S3method(setup_approach,regression_separate)
S3method(setup_approach,regression_surrogate)
S3method(setup_approach,timeseries)
S3method(setup_approach,vaeac)
export(additional_regression_setup)
export(aicc_full_single_cpp)
export(append_vS_list)
export(check_convergence)
export(cli_compute_vS)
export(cli_iter)
export(cli_startup)
export(coalition_matrix_cpp)
export(compute_estimates)
export(compute_shapley_new)
export(compute_time)
export(compute_vS)
export(correction_matrix_cpp)
export(create_coalition_table)
export(explain)
export(explain_forecast)
export(feature_combinations)
export(feature_matrix_cpp)
export(finalize_explanation)
export(finalize_explanation_forecast)
export(get_cov_mat)
export(get_data_specs)
export(get_extra_est_args_default)
export(get_iterative_args_default)
export(get_model_specs)
export(get_mu_vec)
export(get_output_args_default)
export(get_supported_approaches)
export(hat_matrix_cpp)
export(mahalanobis_distance_cpp)
@@ -73,19 +85,28 @@ export(plot_MSEv_eval_crit)
export(plot_SV_several_approaches)
export(predict_model)
export(prepare_data)
export(prepare_data_causal)
export(prepare_data_copula_cpp)
export(prepare_data_copula_cpp_caus)
export(prepare_data_gaussian_cpp)
export(prepare_data_gaussian_cpp_caus)
export(prepare_next_iteration)
export(print_iter)
export(regression.train_model)
export(rss_cpp)
export(save_results)
export(setup)
export(setup_approach)
export(setup_computation)
export(shapley_setup)
export(testing_cleanup)
export(vaeac_get_evaluation_criteria)
export(vaeac_get_extra_para_default)
export(vaeac_plot_eval_crit)
export(vaeac_plot_imputed_ggpairs)
export(vaeac_train_model)
export(vaeac_train_model_continue)
export(weight_matrix)
export(weight_matrix_cpp)
importFrom(Rcpp,sourceCpp)
importFrom(data.table,":=")
@@ -110,6 +131,7 @@ importFrom(stats,as.formula)
importFrom(stats,contrasts)
importFrom(stats,embed)
importFrom(stats,formula)
importFrom(stats,median)
importFrom(stats,model.frame)
importFrom(stats,model.matrix)
importFrom(stats,predict)
@@ -118,8 +140,10 @@ importFrom(stats,qt)
importFrom(stats,rnorm)
importFrom(stats,sd)
importFrom(stats,setNames)
importFrom(utils,capture.output)
importFrom(utils,head)
importFrom(utils,methods)
importFrom(utils,modifyList)
importFrom(utils,relist)
importFrom(utils,tail)
useDynLib(shapr, .registration = TRUE)
17 changes: 12 additions & 5 deletions NEWS.md
@@ -1,18 +1,24 @@
-# shapr (development version)
-
-* Release a Python wrapper (`shaprpyr`, [#325](https://github.com/NorskRegnesentral/shapr/pull/325)) for explaining predictions from Python models (from Python), utilizing almost all functionality of `shapr`. The wrapper moves back and forth between Python and R, doing the prediction in Python and almost everything else in R. This simplifies maintenance of `shaprpy` significantly. The wrapper is available [here](https://github.com/NorskRegnesentral/shapr/tree/master/python).
-* Complete restructuring motivated by introducing the Python wrapper. The restructuring splits the explanation tasks into smaller pieces, which was necessary to allow the Python wrapper to move back and forth between R and Python.
-* As part of the restructuring, we also did a number of design changes, resulting in a series of breaking changes described below.
+# shapr 1.0.0
 
 ### Breaking changes
 
 * Moved from explaining predictions using *two* functions (`shapr()` for the initial setup + `explain()` for the explanation of specific observations) to a *single* function call (also named `explain()`). The data used for training and the data to be explained have been given explicit names (`x_train` and `x_explain`). The order of the input arguments has also been slightly changed (`model` is now the first argument).
 * Prediction and checking functions for custom models are now passed directly as arguments to `explain()` instead of being defined as functions of a specific class in the global environment.
 * The previously exported function `make_dummies`, used to explain `xgboost` models with categorical data, is removed to simplify the code base. This is instead handled with a custom prediction model.
 * The function `explain.ctree_comb_mincrit`, which allowed combining models with `approach = ctree` using different `mincrit` parameters, has been removed to simplify the code base. It may return in a completely general manner in a later version of `shapr`.
+* New argument names: `prediction_zero` -> `phi0`, `n_combinations` -> `max_n_coalitions`, `n_samples` -> `n_MC_samples` (see the sketch below).
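For illustration, a minimal sketch of the single-call interface with the renamed arguments, using the airquality data and a small xgboost model purely as stand-ins (and assuming the classic xgboost() interface); all values below are placeholders:

library(xgboost)
library(shapr)

# Illustrative data/model: predict Ozone from four weather covariates
data("airquality")
df <- airquality[complete.cases(airquality), c("Ozone", "Solar.R", "Wind", "Temp", "Month")]
x_train <- as.matrix(df[-(1:6), -1])
y_train <- df[-(1:6), "Ozone"]
x_explain <- as.matrix(df[1:6, -1])
model <- xgboost::xgboost(data = x_train, label = y_train, nrounds = 20, verbose = FALSE)

explanation <- explain(
  model = model,          # `model` is now the first argument
  x_explain = x_explain,  # observations to explain
  x_train = x_train,      # training data used to model the feature dependence
  approach = "gaussian",
  phi0 = mean(y_train),   # previously `prediction_zero`
  max_n_coalitions = 16,  # previously `n_combinations`
  n_MC_samples = 1000     # previously `n_samples`
)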

 ### New features
 
+* Iterative Shapley value estimation with convergence detection
+* New approaches: `vaeac`, `regression_separate`, `regression_surrogate`, `timeseries`, `categorical`
+* `verbose` argument for `explain()` to control the amount of output
+* Parallelized computation of $v(S)$ with `future`, including progress updates (see the sketch after this list)
+* Paired sampling of coalitions
+* `prev_shapr_object` argument to `explain()` to continue the explanation from a previous object
+* Asymmetric and causal Shapley values
+* Improved KernelSHAP estimation with adjusted weights for reduced variance
+* Release a Python wrapper (`shaprpyr`, [#325](https://github.com/NorskRegnesentral/shapr/pull/325)) for explaining predictions from Python models (from Python), utilizing almost all functionality of `shapr`. The wrapper moves back and forth between Python and R, doing the prediction in Python and almost everything else in R. This simplifies maintenance of `shaprpy` significantly. The wrapper is available [here](https://github.com/NorskRegnesentral/shapr/tree/master/python).
 * Introduce batch computation of conditional expectations ([#244](https://github.com/NorskRegnesentral/shapr/issues/244)).
 This essentially computes $v(S)$ for a portion of the $S$-subsets at a time, to reduce the amount of data needed to be held in memory.
 The user can control the number of batches, but we set a reasonable value by default ([#327](https://github.com/NorskRegnesentral/shapr/pull/327)).
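A sketch of how several of these features could be combined, reusing the placeholder model and data from the sketch above (backend, worker count, and argument values are illustrative): parallel computation of v(S) via future, progress reporting via progressr, and iterative estimation with convergence detection.

library(future)
library(progressr)

plan(multisession, workers = 4)  # parallel backend picked up when computing v(S)
handlers(global = TRUE)          # enable progress updates

explanation_iter <- explain(
  model = model,
  x_explain = x_explain,
  x_train = x_train,
  approach = "ctree",
  phi0 = mean(y_train),
  iterative = TRUE,                 # estimate iteratively until convergence is detected
  verbose = c("basic", "progress")  # control the amount of printed output
)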
@@ -49,6 +55,7 @@ Previously, this was not possible with the prediction functions defined internal
 ### Documentation improvements
 
 * The [vignette](https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html) has been updated to reflect the new framework for explaining predictions, and all the new package features/functionality.
+* New vignettes also for the regression paradigm, vaeac and the asymmetric/causal Shapley values
 
 # shapr 0.2.3 (GitHub only)
 