1st analysis

ref #2
OuhscBbmc · Feb 25, 2020 · 7bede22 · 7bede22
1 parent c83ab7d
commit 7bede22
Show file tree

Hide file tree

Showing 7 changed files with 2,322 additions and 24 deletions.
diff --git a/analysis/month-performance-1/figure-png/scatterplots-1.png b/analysis/month-performance-1/figure-png/scatterplots-1.png
diff --git a/analysis/month-performance-1/month-performance-1.R b/analysis/month-performance-1/month-performance-1.R
@@ -0,0 +1,140 @@
+rm(list = ls(all.names = TRUE)) # Clear the memory of variables from previous run. This is not called by knitr, because it's above the first chunk.
+
+# ---- load-sources ------------------------------------------------------------
+#Load any source files that contain/define functions, but that don't load any other types of variables
+#   into memory.  Avoid side effects and don't pollute the global environment.
+# source("SomethingSomething.R")
+
+# ---- load-packages -----------------------------------------------------------
+library(ggplot2) #For graphing
+import::from("magrittr", "%>%")
+requireNamespace("dplyr")
+# requireNamespace("RColorBrewer")
+# requireNamespace("scales") #For formating values in graphs
+# requireNamespace("mgcv) #For the Generalized Additive Model that smooths the longitudinal graphs.
+# requireNamespace("TabularManifest") # remotes::install_github("Melinae/TabularManifest")
+
+# ---- declare-globals ---------------------------------------------------------
+options(show.signif.stars=F) #Turn off the annotations on p-values
+config                      <- config::get()
+
+palette_dark <- list( # http://colrd.com/image-dna/50489/
+  # "#141e1f",            # black
+  "boundary" = "#3e525c", # dark gray
+  "post"  = "#27403a",    # dark green
+  "pre"   = "#804d1e"     # dark brown
+  # "#77b6c4",            # light blue
+  # "post"  = "#185f63",    # lighter green
+  # "pre"   = "#c5873e"     # lighter brown
+)
+palette_light <- list( # http://colrd.com/image-dna/50489/
+  # "#141e1f",            # black
+  "boundary" = "#3e525c", # dark gray
+  # "post"  = "#27403a",    # dark green
+  # "pre"   = "#804d1e",    # dark brown
+  # "#77b6c4",            # light blue
+  "post"  = "#185f63",    # lighter green
+  "pre"   = "#c5873e"     # lighter brown
+)
+# OuhscMunge::readr_spec_aligned(config$path_month_raw)
+
+col_types <- readr::cols_only(
+  `month`                     = readr::col_date(format = ""),
+  `phase`                     = readr::col_character(),
+  `stroke_numerator`          = readr::col_integer(),
+  `stroke_denominator`        = readr::col_integer(),
+  `antithromb_numerator`      = readr::col_integer(),
+  `antithromb_denominator`    = readr::col_integer(),
+  `anticoag_numerator`        = readr::col_integer(),
+  `anticoag_denominator`      = readr::col_integer(),
+  `statin_numerator`          = readr::col_integer(),
+  `statin_denominator`        = readr::col_integer(),
+  `smoking_numerator`         = readr::col_integer(),
+  `smoking_denominator`       = readr::col_integer(),
+  `cumulative_numerator`      = readr::col_integer(),
+  `cumulative_denominator`    = readr::col_integer()
+)
+
+# ---- load-data ---------------------------------------------------------------
+ds <- readr::read_csv(config$path_month_raw, col_types = col_types)
+
+# ---- tweak-data --------------------------------------------------------------
+ds <-
+  ds %>%
+  dplyr::mutate(
+    post  = dplyr::recode(phase, "pre" = 0L, "post" = 1L),
+    phase = factor(phase, levels = c("pre", "post")),
+  ) %>%
+  dplyr::mutate(
+    stroke_proportion     = stroke_numerator     / stroke_denominator     ,
+    antithromb_proportion = antithromb_numerator / antithromb_denominator ,
+    anticoag_proportion   = anticoag_numerator   / anticoag_denominator   ,
+    statin_proportion     = statin_numerator     / statin_denominator     ,
+    smoking_proportion    = smoking_numerator    / smoking_denominator    ,
+    cumulative_proportion = cumulative_numerator / cumulative_denominator ,
+  ) %>%
+  dplyr::mutate(
+    stroke_label     = sprintf("%4.0f%% (%2i of %2i)", stroke_proportion     * 100, stroke_numerator     , stroke_denominator     ),
+    antithromb_label = sprintf("%4.0f%% (%2i of %2i)", antithromb_proportion * 100, antithromb_numerator , antithromb_denominator ),
+    anticoag_label   = sprintf("%4.0f%% (%2i of %2i)", anticoag_proportion   * 100, anticoag_numerator   , anticoag_denominator   ),
+    statin_label     = sprintf("%4.0f%% (%2i of %2i)", statin_proportion     * 100, statin_numerator     , statin_denominator     ),
+    smoking_label    = sprintf("%4.0f%% (%2i of %2i)", smoking_proportion    * 100, smoking_numerator    , smoking_denominator    ),
+    cumulative_label = sprintf("%4.0f%% (%2i of %2i)", cumulative_proportion * 100, cumulative_numerator , cumulative_denominator ),
+  )
+
+# ---- marginals ---------------------------------------------------------------
+# # Inspect continuous variables
+# histogram_continuous(d_observed=ds, variable_name="quarter_mile_sec", bin_width=.5, rounded_digits=1)
+# # slightly better function: TabularManifest::histogram_continuous(d_observed=ds, variable_name="quarter_mile_sec", bin_width=.5, rounded_digits=1)
+# histogram_continuous(d_observed=ds, variable_name="displacement_inches_cubed", bin_width=50, rounded_digits=1)
+#
+# # Inspect discrete/categorical variables
+# histogram_discrete(d_observed=ds, variable_name="carburetor_count_f")
+# histogram_discrete(d_observed=ds, variable_name="forward_gear_count_f")
+
+
+# ---- scatterplots ------------------------------------------------------------
+x_breaks  <- ds$month
+
+g1 <-
+  ggplot(ds, aes(x=month, y=statin_proportion, size=statin_denominator, label=statin_label, color = phase)) +
+  geom_text(aes(y = -Inf, size=5), angle = 90, hjust = 0) +
+  geom_vline(xintercept = config$intervention_start, size = 4, color = "gray70", alpha = .6) +
+  annotate("text", label = "intervention starts", x = config$intervention_start, y = .5, hjust = .5, angle = 90, alpha = .4) +
+  geom_line(size=.5, color = "gray70") +
+  geom_point(aes(fill = phase), shape=21, alpha = .5) +
+  scale_x_date(breaks = x_breaks, date_labels = "%b\n%Y") +
+  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
+  scale_color_manual(values = palette_dark, guide = "none") +
+  scale_fill_manual(values = palette_light, guide = "none") +
+  scale_size(guide = "none") +
+  coord_cartesian(ylim = c(0, 1)) +
+  theme_light() +
+  theme(axis.ticks = element_blank()) +
+  labs(x = NULL, y = "statin")
+g1
+
+# g1 %+% aes(color=cylinder_count)
+
+
+# ---- models ------------------------------------------------------------------
+m1 <- lm(statin_proportion ~ 1 + post, data=ds)
+summary(m1)
+
+m2 <- glm(
+  statin_numerator / statin_denominator ~ 1 + post,
+  family  = quasipoisson,
+  data    = ds
+)
+summary(m2)
+
+
+# ---- model-results-table  -----------------------------------------------
+summary(m2)$coef %>%
+  knitr::kable(
+    digits      = 2,
+    format      = "markdown"
+  )
+
+# Uncomment the next line for a dynamic, JavaScript [DataTables](https://datatables.net/) table.
+# DT::datatable(round(summary(m2)$coef, digits = 2), options = list(pageLength = 2))
diff --git a/analysis/month-performance-1/month-performance-1.Rmd b/analysis/month-performance-1/month-performance-1.Rmd
@@ -0,0 +1,145 @@
+---
+title: Skeleton Report 1
+date: "Date: `r Sys.Date()`"
+output:
+  # radix::radix_article: # radix is a newer alternative that has some advantages over `html_document`.
+  html_document:
+    keep_md: yes
+    toc: 4
+    toc_float: true
+    number_sections: true
+    css: ../common/styles.css         # analysis/common/styles.css
+---
+
+This report covers the analyses used in the ZZZ project (Marcus Mark, PI).
+
+<!--  Set the working directory to the repository's base directory; this assumes the report is nested inside of two directories.-->
+```{r, echo=F, message=F}
+# cat("Working directory: ", getwd())
+library(knitr)
+opts_knit$set(root.dir='../../')  #Don't combine this call with any other chunk -especially one that uses file paths.
+```
+
+<!-- Set the report-wide options, and point to the external code file. -->
+```{r set-options, echo=F}
+# cat("Working directory: ", getwd())
+report_render_start_time <- Sys.time()
+opts_chunk$set(
+  results      = 'show',
+  comment      = NA,
+  tidy         = FALSE,
+  # dpi        = 400,
+  # out.width  = "650px", #This affects only the markdown, not the underlying png file.  The height will be scaled appropriately.
+  fig.width    = 4,
+  fig.height   = 4,
+  fig.path     = 'figure-png/'
+)
+
+echo_chunks    <- FALSE # Toggle for debugging.
+message_chunks <- FALSE # Toggle for debugging.
+# options(width=100) # So the output is 25% wider than the default.
+read_chunk("./analysis/month-performance-1/month-performance-1.R") # This allows knitr to call chunks tagged in the underlying *.R file.
+```
+
+<!-- Load 'sourced' R files.  Suppress the output when loading sources. -->
+```{r load-sources, echo=echo_chunks, message=message_chunks}
+```
+
+<!-- Load packages, or at least verify they're available on the local machine.  Suppress the output when loading packages. -->
+```{r load-packages, echo=echo_chunks, message=message_chunks}
+```
+
+<!-- Load any global functions and variables declared in the R file.  Suppress the output. -->
+```{r declare-globals, echo=echo_chunks, results='show', message=message_chunks}
+```
+
+<!-- Declare any global functions specific to a Rmd output.  Suppress the output. -->
+```{r rmd-specific, echo=echo_chunks, message=message_chunks}
+# Put presentation-specific code in here.  It doesn't call a chunk in the codebehind file.
+#   It should be rare (and used cautiously), but sometimes it makes sense to include code in Rmd
+#   that doesn't live in the codebehind R file.
+```
+
+<!-- Load the datasets.   -->
+```{r load-data, echo=echo_chunks, results='show', message=message_chunks}
+```
+
+<!-- Tweak the datasets.   -->
+```{r tweak-data, echo=echo_chunks, results='show', message=message_chunks}
+```
+
+Summary {.tabset .tabset-fade .tabset-pills}
+===========================================================================
+
+Notes
+---------------------------------------------------------------------------
+
+1. The current report covers `r nrow(ds)` month, with `r dplyr::n_distinct(ds$phase)` unique values for `month`.
+
+
+Unanswered Questions
+---------------------------------------------------------------------------
+
+
+
+Answered Questions
+---------------------------------------------------------------------------
+
+
+Graphs
+===========================================================================
+
+
+Marginals
+---------------------------------------------------------------------------
+
+```{r marginals, echo=echo_chunks, message=message_chunks}
+```
+
+
+Scatterplots
+---------------------------------------------------------------------------
+
+```{r scatterplots, echo=echo_chunks, message=message_chunks, fig.width=7}
+```
+
+
+Models
+===========================================================================
+
+Model Exploration
+---------------------------------------------------------------------------
+```{r models, echo=echo_chunks, message=message_chunks}
+```
+
+
+Final Model
+---------------------------------------------------------------------------
+
+```{r model-results-table, echo=echo_chunks, message=message_chunks, warning=TRUE}
+```
+
+In the model that includes two predictors, the slope coefficent of `Miles per gallon` is `r summary(m2)$coefficients[2,1]`.
+
+
+Session Information {#session-info}
+===========================================================================
+
+For the sake of documentation and reproducibility, the current report was rendered in the following environment.  Click the line below to expand.
+
+<details>
+  <summary>Environment <span class="glyphicon glyphicon-plus-sign"></span></summary>
+```{r session-info, echo=FALSE}
+if( requireNamespace("devtools", quietly = TRUE) ) {
+  devtools::session_info()
+} else {
+  sessionInfo()
+}
+```
+</details>
+
+```{r session-duration, echo=FALSE}
+report_render_duration_in_seconds <- as.integer(difftime(Sys.time(), report_render_start_time, units="secs"))
+```
+
+Report rendered by `r Sys.info()["user"]` at `r strftime(Sys.time(), "%Y-%m-%d, %H:%M %z")` in `r report_render_duration_in_seconds` seconds.
diff --git a/analysis/month-performance-1/month-performance-1.html b/analysis/month-performance-1/month-performance-1.html