misc app and setup edits
m-clark committed Aug 18, 2024
1 parent b226d15 commit 123eab8
Showing 14 changed files with 1,420 additions and 1,408 deletions.
39 changes: 0 additions & 39 deletions .Rprofile
@@ -229,46 +229,7 @@ ggsave = function(filename, width = 8, height = 6, ...) {
)
}

skimmer = function() {
skimr::skim_with(
# reordering/naming/slimming numeric output
numeric = skimr::sfl(
mean = ~ mean(., na.rm = TRUE),
sd = ~ sd(., na.rm = TRUE),
min = ~ min(., na.rm = TRUE),
med = ~ median(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE),
iqr = NULL,
hist = NULL,
p0 = NULL, # renamed
p25 = NULL,
p50 = NULL, # renamed
p75 = NULL,
p100 = NULL # renamed
),

character = skimr::sfl(
empty = \(x) skimr::n_empty(x) + skimr::n_whitespace(x), # replace default which is only n_empty
whitespace = NULL,
min = NULL, # these refer to nchar which I doubt anyone would know
max = NULL,
),
append = TRUE
)
}

summarize_data = function(data, types = 'all') {
init = skimmer()(data)
summ = skimr::partition(init)

if (!all(types == 'all')) {
summ = summ[tolower(names(summ)) %in% tolower(types)]
}

summ = purrr::map(summ, tibble::tibble)

return(summ)
}

options(digits = 4) # number of digits of precision for floating point output

4 changes: 3 additions & 1 deletion _quarto.yml
@@ -40,10 +40,12 @@ book:
appendices:
- part: "Additional Topics"
chapters:
- dataset_descriptions.qmd # this and ref to separate section
- matrix_operations.qmd
- pyr.qmd
- more_models.qmd
- part: "References & Resources"
chapters:
- dataset_descriptions.qmd # this and ref to separate section
- references.qmd # this and ref to separate section maybe 'resources'
# - appendix_placeholder.qmd
search: true
search: true
4 changes: 2 additions & 2 deletions create_nocite.ipynb
@@ -2,14 +2,14 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"@agresti_foundations_2015, @albon_machine_2024, @albon_machine_2024-1, @amazon_what_2024, @andrej_karpathy_lets_2024, @arel-bundock_marginal_2024, @bai_understanding_2021, @barrett_causal_2024, @belkin_reconciling_2019, @biecek_explanatory_2020, @bischl_applied_2024, @bishop_pattern_2006, @boykis_what_2023, @brownlee_gentle_2016, @brownlee_gentle_2019, @brownlee_gradient_2021, @brownlee_how_2020, @burges_learning_2005, @burges_ranknet_2016, @bycroft_llm_2023, @carpenter_prior_2023, @causalml_causalml_2023, @cawley_over-fitting_2010, @chawla_smote_2002, @chernozhukov_applied_2024, @clark_bayesian_2022, @clark_deep_2022, @clark_generalized_2022, @clark_graphical_2018, @clark_mixed_2023, @clark_model_2021, @clark_practical_2020, @clark_thinking_2018, @clark_this_2021, @cohen_statistical_2009, @cross_validated_answer_2011, @cross_validated_answer_2020, @cross_validated_answer_2021, @cunningham_causal_2023, @dahabreh_causal_2024, @databricks_what_2019, @davison_bootstrap_1997, @dobson_introduction_2018, @dunn_distribution-free_2020, @dunn_generalized_2018, @efron_introduction_1994, @facure_alves_causal_2022, @fahrmeir_regression_2021, @faraway_extending_2016, @faraway_linear_2014, @ferrari_beta_2004, @fleuret_little_2023, @fortuner_machine_2023, @fox_applied_2015, @gelman_advanced_2024, @gelman_bayesian_2013, @gelman_data_2006, @gelman_garden_nodate, @gelman_regression_2020, @gelman_what_2013, @goodfellow_deep_2016, @google_classification_2024, @google_imbalanced_2023, @google_introduction_2023, @google_machine_2023, @google_reducing_2024, @google_what_2024, @greene_econometric_2017, @grolemund_welcome_2023, @hardin_generalized_2018, @harrell_regression_2015, @hastie_elements_2017, @heiss_marginalia_2022, @hernan_c-word_2018, @hernan_causal_2012, @howard_practical_2024, @hugging_face_byte-pair_2024, @hvitfeldt_feature_2024, @hyndman_forecasting_2021, @ivanova_comprehension_2020, @james_introduction_2021, @jiang_visual_2020, @jordan_introduction_2018, @kirillov_segment_2023, @koenker_galton_2000, @koenker_quantile_2005, @kruschke_doing_2010, @kuhn_applied_2023, @kuhn_tidy_2023, @kunzel_metalearners_2019, @lang_mlr3_2019, @lee_deep_2017, @leech_questionable_2024, @lones_how_2024, @mahr_random_2021, @masis_interpretable_2023, @mccullagh_generalized_2019, @mcculloch_logical_1943, @mcelreath_statistical_2020, @mckinney_python_2023, @microsoft_generative_2024, @molnar_interpretable_2023, @monroe_imbalanced_2024, @morgan_counterfactuals_2014, @murphy_machine_2012, @murphy_probabilistic_2023, @navarro_learning_2018, @neal_priors_1996, @nelder_generalized_1972, @nickyp_llm_2023, @niculescu-mizil_predicting_2005, @pearl_causal_2009, @pearl_causal_2022, @peng_r_2022, @penn_state_54_2018, @pok_how_2020, @prince_understanding_2023, @quantmetry_mapie_2024, @raschka_about_2014, @raschka_build_2023, @raschka_machine_2022, @raschka_machine_2023, @rasmussen_gaussian_2005, @ripley_pattern_1996, @roback_beyond_2021, @roberts_neural_2000, @rocca_handling_2019, @rovine_peirce_2004, @schmidhuber_annotated_2022, @scikit-learn_116_2023, @scikit-learn_nested_2023, @sen_decoder-only_2024, @shalizi_f-tests_2015, @shevchenko_types_2023, @shorten_text_2021, @simpson_using_2021, @stackexchange_are_2015, @statquest_with_josh_starmer_bootstrapping_2021, @statquest_with_josh_starmer_gradient_2019, @statquest_with_josh_starmer_stochastic_2019, @turrell_python_2024, @ucla_advanced_research_computing_faq_2023, @ucla_advanced_research_computing_faq_2023-1, @ushey_arrays_2024, @vanderplas_python_2016, @vaswani_attention_2017, 
@vig_deconstructing_2019, @walker_analyzing_2023, @weed_learning_2021, @welchowski_techniques_2022, @wikipedia_cross-entropy_2024, @wikipedia_exponential_2024, @wikipedia_gradient_2024, @wikipedia_relationships_2023, @wikipedia_replication_2024, @wood_generalized_2017, @wooldridge_introductory_2012, @yeh_ai_2024, @zhang_dive_2023\n"
"@agresti_foundations_2015, @albon_machine_2024, @albon_machine_2024-1, @amazon_what_2024, @andrej_karpathy_lets_2024, @arel-bundock_marginal_2024, @bai_understanding_2021, @barrett_causal_2024, @belkin_reconciling_2019, @biecek_explanatory_2020, @bischl_applied_2024, @bishop_pattern_2006, @boehmke_hands-machine_2020, @boykis_what_2023, @brownlee_gentle_2016, @brownlee_gentle_2019, @brownlee_gradient_2021, @brownlee_how_2020, @burges_learning_2005, @burges_ranknet_2016, @bycroft_llm_2023, @carpenter_prior_2023, @causalml_causalml_2023, @cawley_over-fitting_2010, @chawla_smote_2002, @chernozhukov_applied_2024, @clark_bayesian_2022, @clark_deep_2022, @clark_generalized_2022, @clark_graphical_2018, @clark_mixed_2023, @clark_model_2021, @clark_practical_2020, @clark_thinking_2018, @clark_this_2021, @cohen_statistical_2009, @cross_validated_answer_2011, @cross_validated_answer_2020, @cross_validated_answer_2021, @cross_validated_why_2016, @cunningham_causal_2023, @dahabreh_causal_2024, @databricks_what_2019, @davison_bootstrap_1997, @dobson_introduction_2018, @dunn_distribution-free_2020, @dunn_generalized_2018, @efron_introduction_1994, @facure_alves_causal_2022, @fahrmeir_regression_2021, @faraway_extending_2016, @faraway_linear_2014, @ferrari_beta_2004, @fleuret_little_2023, @fortuner_machine_2023, @fox_applied_2015, @gelman_advanced_2024, @gelman_bayesian_2013, @gelman_data_2006, @gelman_garden_nodate, @gelman_regression_2020, @gelman_what_2013, @goodfellow_deep_2016, @google_classification_2024, @google_imbalanced_2023, @google_introduction_2023, @google_machine_2023, @google_reducing_2024, @google_what_2024, @greene_econometric_2017, @grolemund_welcome_2023, @hardin_generalized_2018, @harrell_regression_2015, @hastie_elements_2017, @heiss_marginalia_2022, @hernan_c-word_2018, @hernan_causal_2012, @howard_practical_2024, @hugging_face_byte-pair_2024, @hvitfeldt_feature_2024, @hyndman_forecasting_2021, @ivanova_comprehension_2020, @james_introduction_2021, @jiang_visual_2020, @jordan_introduction_2018, @kirillov_segment_2023, @koenker_galton_2000, @koenker_quantile_2005, @kruschke_doing_2010, @kuhn_applied_2023, @kuhn_tidy_2023, @kunzel_metalearners_2019, @lang_mlr3_2019, @lee_deep_2017, @leech_questionable_2024, @lones_how_2024, @mahr_random_2021, @masis_interpretable_2023, @mccullagh_generalized_2019, @mcculloch_logical_1943, @mcelreath_statistical_2020, @mckinney_python_2023, @microsoft_generative_2024, @molnar_interpretable_2023, @monroe_imbalanced_2024, @morgan_counterfactuals_2014, @murphy_machine_2012, @murphy_probabilistic_2023, @navarro_learning_2018, @neal_priors_1996, @nelder_generalized_1972, @niculescu-mizil_predicting_2005, @pearl_causal_2009, @pearl_causal_2022, @peng_r_2022, @penn_state_54_2018, @pochinkov_llm_2023, @pok_how_2020, @prince_understanding_2023, @quantmetry_mapie_2024, @raschka_about_2014, @raschka_build_2023, @raschka_machine_2022, @raschka_machine_2023, @rasmussen_gaussian_2005, @ripley_pattern_1996, @roback_beyond_2021, @roberts_neural_2000, @rocca_handling_2019, @rovine_peirce_2004, @schmidhuber_annotated_2022, @scikit-learn_116_2023, @scikit-learn_nested_2023, @sen_decoder-only_2024, @shalizi_f-tests_2015, @shevchenko_types_2023, @shorten_text_2021, @simpson_using_2021, @stackexchange_are_2015, @statquest_with_josh_starmer_bootstrapping_2021, @statquest_with_josh_starmer_gradient_2019, @statquest_with_josh_starmer_stochastic_2019, @turrell_python_2024, @ucla_advanced_research_computing_faq_2023, @ucla_advanced_research_computing_faq_2023-1, 
@ushey_arrays_2024, @vanderplas_python_2016, @vaswani_attention_2017, @vig_deconstructing_2019, @walker_analyzing_2023, @weed_learning_2021, @welchowski_techniques_2022, @wikipedia_cross-entropy_2024, @wikipedia_exponential_2024, @wikipedia_gradient_2024, @wikipedia_relationships_2023, @wikipedia_replication_2024, @wood_generalized_2017, @wooldridge_introductory_2012, @yeh_ai_2024, @zhang_dive_2023\n"
]
}
],
41 changes: 41 additions & 0 deletions functions/utils.r
@@ -13,4 +13,45 @@ word_sign <- function(value, words) {

round_any = function(x, accuracy, f = round) {
f(x / accuracy) * accuracy
}

skimmer = function() {
skimr::skim_with(
# reordering/naming/slimming numeric output
numeric = skimr::sfl(
mean = ~ mean(., na.rm = TRUE),
sd = ~ sd(., na.rm = TRUE),
min = ~ min(., na.rm = TRUE),
med = ~ median(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE),
iqr = NULL,
hist = NULL,
p0 = NULL, # renamed
p25 = NULL,
p50 = NULL, # renamed
p75 = NULL,
p100 = NULL # renamed
),

character = skimr::sfl(
empty = \(x) skimr::n_empty(x) + skimr::n_whitespace(x), # replace default which is only n_empty
whitespace = NULL,
min = NULL, # these refer to nchar which I doubt anyone would know
max = NULL
),
append = TRUE
)
}

summarize_data = function(data, types = 'all') {
init = skimmer()(data)
summ = skimr::partition(init)

if (!all(types == 'all')) {
summ = summ[tolower(names(summ)) %in% tolower(types)]
}

summ = purrr::map(summ, tibble::tibble)

return(summ)
}
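
A quick, hypothetical usage sketch of the helpers added above (my illustration, not part of the commit; assumes `skimr`, `purrr`, and `tibble` are installed, with `mtcars` standing in for real data):

```r
# summarize_data() returns a named list of tibbles, one per column type
summ = summarize_data(mtcars)
summ$numeric    # mean, sd, min, med, max for each numeric column

# restrict the output to particular types (matched case-insensitively)
summarize_data(mtcars, types = 'numeric')
```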
18 changes: 13 additions & 5 deletions generalized_linear_models.qmd
@@ -541,14 +541,18 @@ We feel it is much more intuitive to interpret things on the probability scale,
#| out-width: 100%
p_prob = ggeffects::ggpredict(model_logistic, terms = c('word_count')) |>
plot(color = okabe_ito['darkblue'], use_theme = FALSE) +
annotate(geom = 'text', x = 20, y = plogis(1), label = 'Not so much!') +
scale_fill_manual(values = okabe_ito) +
lims(x = c(0, 32)) +
scale_y_continuous(limits = c(0.02, .9), labels = scales::percent) +
labs(x = 'Word Count', y = '% Good Rating', title = '')

p_dat = p_prob$data
p_dat = p_prob$data |>
as_tibble()

logit_preds = predict(
model_logistic,
newdata = as_tibble(p_prob$data) |> mutate(word_count = x, gender = 'female'),
newdata = p_dat |> mutate(word_count = x, gender = 'female'),
se.fit = TRUE
)

@@ -563,6 +567,8 @@ p_logit = p_dat |>
ggplot(aes(x = x, y = logit)) +
geom_ribbon(aes(ymin = ll, ymax = ul), fill = okabe_ito['darkblue'], alpha = .2) +
geom_line(color = okabe_ito['darkblue']) +
annotate(geom = 'text', x = 20, y = 1, label = 'Linear!') +
lims(y = c(qlogis(0.02), qlogis(.9)), x = c(0, 32)) +
labs(x = 'Word Count', y = 'Log Odds', title = '')


@@ -579,7 +585,8 @@ p_logit = p_dat |>
plot_layout(guides = 'collect') +
plot_annotation(
title = 'Logistic Regression Predictions',
caption = 'Note: The shaded area represents the 95% confidence interval for the log odds.'
caption = "Note: The shaded area represents the 95% confidence interval for the log odds.\n
Effect is with Gender = 'Female'."
) &
theme(
axis.title.y = element_text(size = 12),
@@ -609,14 +616,15 @@ In @fig-logistic-regression-word-count, we can see a clear negative relationship
#| fig-cap: Model Predictions for Gender
p = ggeffects::ggpredict(model_logistic, terms = c('gender')) |>
plot(
color = okabe_ito['darkblue'],
color = c(okabe_ito[['orange']], okabe_ito[['darkblue']]),
dot_size = 10,
dot_alpha = 1, # ignored, so added back with geom_point
line_size = 2,
use_theme = FALSE
) +
geom_point(color = okabe_ito['darkblue'], alpha = 1, size = 10) +
scale_fill_manual(values = unname(okabe_ito)) +
scale_y_continuous(limits = c(.5, .65), labels = scales::percent) +
labs(x = '', y = '% Good Rating', title = 'Logistic Regression Predictions')

p
@@ -653,7 +661,7 @@ Logistic regression does not have an $R^2$ value in the way that a linear regression model does
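
The context line above notes that logistic regression lacks a true $R^2$; here is a hedged sketch of one common substitute, McFadden's pseudo-$R^2$ (my illustration, not the chapter's code; `model_logistic` is assumed from earlier in the chapter):

```r
# McFadden's pseudo-R^2 compares the fitted model's log-likelihood
# to that of an intercept-only model fit to the same data
ll_full = logLik(model_logistic)
ll_null = logLik(update(model_logistic, . ~ 1))
1 - as.numeric(ll_full) / as.numeric(ll_null)   # closer to 1 = better fit
```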

The Poisson distribution is very similar to the binomial distribution, because the binomial is also a count distribution, and in fact generalizes the Poisson[^poisbin]. The Poisson has a single parameter, denoted $\lambda$, which makes it the simplest model setting we've seen so far[^novariance]. Conceptually, this rate parameter estimates the expected number of events during a time interval. This can be accidents in a year, pieces produced in a day, or hits during the course of a baseball season.

[^novariance]: Neither the binomial nor the Poisson have a variance parameter to estimate, as the variance is determined by the mean. This is in contrast to the normal distribution, where the variance is a separate parameter. For the Poisson, the variance is equal to the mean, and for the binomial, the variance is equal to $n*p*(1-p)$. The Poisson assumption of equal variance rarely holds up in practice, so people often use the negative binomial distribution instead.
[^novariance]: Neither the binomial nor the Poisson have a variance parameter to estimate, as the variance is determined by the mean. This is in contrast to the normal distribution, where the variance is an estimated parameter. For the Poisson, the variance is equal to the mean, and for the binomial, the variance is equal to $n*p*(1-p)$. The Poisson assumption of equal variance rarely holds up in practice, so people often use the negative binomial distribution instead.

[^poisbin]: If your binomial setting has a very large number of trials relative to the number of successes, which amounts to very small proportions $p$, you would find that the binomial distribution would converge to the Poisson distribution.
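
A small sketch (my illustration, not from the chapter) makes both footnotes concrete: with many trials $n$ and small $p = \lambda/n$, the binomial pmf essentially coincides with the Poisson pmf, and the binomial variance $n*p*(1-p)$ approaches the Poisson variance, which is just the mean $\lambda$.

```r
lambda = 2
n = 1000            # many trials
p = lambda / n      # small success probability

# the largest pointwise gap between the two pmfs is tiny (~3e-4 here)
x = 0:10
max(abs(dbinom(x, size = n, prob = p) - dpois(x, lambda)))

# the binomial variance is already close to the Poisson variance (= the mean)
n * p * (1 - p)     # 1.996
lambda              # 2
```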
