From c83ab7d06f711b07db173f427ca515153d1f73ca Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Mon, 24 Feb 2020 23:18:55 -0600 Subject: [PATCH] initialize repo closes #1 --- .Rbuildignore | 53 +++++ .gitattributes | 11 ++ .gitignore | 108 +++++++++-- DESCRIPTION | 62 ++++++ analysis/README.md | 4 + analysis/common/display-1.R | 341 +++++++++++++++++++++++++++++++++ analysis/common/styles.css | 7 + config.yml | 28 +++ data-public/README.md | 6 + data-public/derived/README.md | 13 ++ data-public/metadata/README.md | 9 + data-public/raw/README.md | 9 + data-unshared/README.md | 8 + data-unshared/contents.md | 11 ++ documentation/README.md | 3 + flow.R | 113 +++++++++++ manipulation/README.md | 4 + suku-cqi-1.Rproj | 20 ++ 18 files changed, 791 insertions(+), 19 deletions(-) create mode 100644 .Rbuildignore create mode 100644 .gitattributes create mode 100644 DESCRIPTION create mode 100644 analysis/README.md create mode 100644 analysis/common/display-1.R create mode 100644 analysis/common/styles.css create mode 100644 config.yml create mode 100644 data-public/README.md create mode 100644 data-public/derived/README.md create mode 100644 data-public/metadata/README.md create mode 100644 data-public/raw/README.md create mode 100644 data-unshared/README.md create mode 100644 data-unshared/contents.md create mode 100644 documentation/README.md create mode 100644 flow.R create mode 100644 manipulation/README.md create mode 100644 suku-cqi-1.Rproj diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..6959644 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,53 @@ +# List files that should be included in the repo, +# but ignored by R when building the packages. +# Specify with a PCRE regular expression. +# See http://r-pkgs.had.co.nz/package.html#package + +# Directories used in many R-centric repos. +^\.github$ +^docs$ +^revdep$ +^.*\.Rproj$ +^\.Rproj\.user$ + + +# Files used in many R-centric repos. 
+^_pkgdown\.yml$ +^\.gitignore$ +^\.travis\.yml$ +^appveyor\.yml$ +^CONDUCT.md$ +^CODE_OF_CONDUCT.md$ +^CODE-OF-CONDUCT.md$ +^code-of-conduct.md$ +^config.yml$ +^cran-comments\.md$ +^flow.R$ +^README.html$ +^license\.md$ +^LICENSE\.md$ +^README.html$ +^shim_package\.sh$ +^wercker\.yml$ + + +# Directories occasionally used in R-centric repos written by the BBMC. +^analysis$ +^data-private$ +^data-public$ +^data-unshared$ +^demonstration$ +^documentation$ +^documentation-for-developers$ +^figure$ +^libs$ +^manipulation$ +^sandbox$ +^scripts$ +^stitched-output$ +^playgrounds$ +^utility$ + + +# Files occasionally used in R-centric repos written by the BBMC. +^documentation-peek.pdf$ diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..9a7e312 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,11 @@ +#So HTML, MD, & CSS files aren't considered as code when determining language of repo. +# https://github.com/github/linguist#using-gitattributes +*.html linguist-documentation +*.md linguist-documentation +*.css linguist-documentation + +#For AppVeyor +* text=auto +data/* binary +src/* text=lf +R/* text=lf diff --git a/.gitignore b/.gitignore index fae8299..b04e0ca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,15 +1,10 @@ -# History files +# ---- R and RStudio files ----------------------------------------------- +.Rproj.user/ .Rhistory -.Rapp.history - -# Session Data files .RData - -# User-specific files +.Rdata .Ruserdata - -# Example code in package build process -*-Ex.R +.Rapp.history # Output files from R CMD build /*.tar.gz @@ -17,23 +12,98 @@ # Output files from R CMD check /*.Rcheck/ -# RStudio files -.Rproj.user/ - -# produced vignettes +# R package temporary files +inst/doc vignettes/*.html vignettes/*.pdf +vignettes/*.R + +# The devtools zip is downloaded when the package is updating itself. If it's not deleted, there's no reason to commit it to the repository. 
+devtools.zip # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 .httr-oauth -# knitr and R markdown default cache directories -*_cache/ -/cache/ - # Temporary files created by R markdown *.utf8.md *.knit.md -# R Environment Variables -.Renviron +# Example code in package build process +*-Ex.R + +# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html +rsconnect/ + +# ---- Protected Information ----------------------------------------------- +# see cubuspl42's answer in http://stackoverflow.com/questions/2415873/exceptions-in-gitignore +/unshared-material/* +/data-unshared/* +/data/unshared/* + +# Dataset Exceptions +!utility/package-dependency-list.csv + +# Keep the README files in the unshared directories. +# They provide documentation, and also force the directories to be created on other machines. +# Depending on how you do it, you might need to add w/ the command line +# eg, `git add -f ./data-unshared/raw/README.md` +!README.md +!data-unshared/contents.md +!data-unshared/raw/README.md +!data-unshared/derived/README.md +!data/unshared/contents.md +!data/unshared/raw/README.md +!data/unshared/derived/README.md + +# ---- Caches ----------------------------------------------- +# Exclude caches in case they contain PHI +cache/ + +# knitr and RMarkdown default cache directories +*_cache/ + +# ---- Windows OS -------------------------------------------- +# Cached small image files +Thumbs.db + +# Folder config file +Desktop.ini + +# ---- OS X -------------------------------------------------- +.DS_Store + +# ---- Visual Studio Code (text editor) ---------------------- +.vscode/ + +# ---- Locks ---------------------- +# Do not commit lock files. +# Programs like Microsoft Word create temporary files that don't contain info useful to the repo. + +~*.tmp +~*.temp + +# Temporary files of Microsoft Office. +~*.doc +~*.docx +~*.xlsx +~*.pptx +*.ldb +*.laccdb + +# Temporary locks used by LibreOffice. 
+*.odb.lck +.~*.csv# +.~*.odf# +.~*.odp# +.~*.ods# +.~*.odt# +.~*.docx# +.~*.pptx# +.~*.xlsx# + +# ---- Accidentally Added Code ------------------------------- +# The GitHub desktop client makes it easy to right-click a file and select 'ignore'. +# This will append the file name to the bottom of `.gitignore`. +# So the lines below should either be +# 1. removed (because they were added unintentionally), or +# 2. reorganized and moved above this chunk (so it won't be confused with files added unintentionally). diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..43dfd47 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,62 @@ +Package: RAnalysisSkeleton +Title: Displaying Health Data +Description: Files and settings commonly used in analysis projects with R. +Version: 0.0.1.9001 +Date: 2019-02-04 +Authors@R: c(person("Will", "Beasley", role = c("aut", "cre"), email = + "wibeasley@hotmail.com", comment = c(ORCID = "0000-0002-5613-5006")), + person("Andrey", "Koval", role = "aut")) +URL: https://github.com/wibeasley/RAnalysisSkeleton +BugReports: https://github.com/wibeasley/RAnalysisSkeleton/issues +Depends: + R(>= 3.0.0) +Imports: + broom, + checkmate (>= 1.8.4), + codified (>= 0.2.0), + config, + DBI (>= 0.7.0), + dplyr (>= 0.5.0), + DT, + flexdashboard, + ggplot2, + httr (>= 1.3.0), + kableExtra, + knitr (>= 1.18.0), + lme4, + magrittr, + mgcv, + methods, + odbc (>= 1.1.1), + OuhscMunge (>= 0.1.9.9009), + RColorBrewer, + readr (>= 1.2.1), + rmarkdown, + sessioninfo, + TabularManifest, + tibble (>= 1.4.0), + yaml +Suggests: + devtools (>= 1.13.0), + ggalluvial, + ggrepel, + plotly, + purrr, + RcppRoll, + remotes, + rlang, + RSQLite, + scales, + testit, + testthat (>= 0.9), + tidyr (>= 0.7.0), + zoo +Remotes: + github::OuhscBbmc/OuhscMunge, + github::Melinae/TabularManifest +License: GPL-3 + file LICENSE +LazyData: TRUE +VignetteBuilder: knitr +Encoding: UTF-8 +RoxygenNote: 6.1.0 +Roxygen: list(markdown = TRUE) diff --git a/analysis/README.md 
b/analysis/README.md new file mode 100644 index 0000000..b495f6a --- /dev/null +++ b/analysis/README.md @@ -0,0 +1,4 @@ +`analysis/` Directory +========= + +Files in this directory statistically analyze the data. If multiple lines of analysis are needed, they should be contained in separate subfolders. diff --git a/analysis/common/display-1.R b/analysis/common/display-1.R new file mode 100644 index 0000000..0794c03 --- /dev/null +++ b/analysis/common/display-1.R @@ -0,0 +1,341 @@ +# This file has graphing & table functions used in reports + +palette_model <- c("C1"="#446699", "SC"="#ea573d", "PAT"="#615b70", "P4"="#70af81") #"HFA"="#fb9a62", +palette_miechv <- c("TRUE"="#1765a2", "FALSE"="#4aab5e") # http://colrd.com/image-dna/24016/ + +repo_theme <- function( base_size = 8 ) { + ggplot2::theme_light(base_size=base_size) + + ggplot2::theme(title = ggplot2::element_text(color="gray20")) + + ggplot2::theme(axis.text = ggplot2::element_text(color="gray40")) + + ggplot2::theme(axis.title = ggplot2::element_text(color="gray40")) + + ggplot2::theme(panel.border = ggplot2::element_rect(color="gray80")) + + ggplot2::theme(axis.ticks = ggplot2::element_blank()) + + ggplot2::theme(legend.position = "none") +} +col_types_annotation <- function() { + readr::cols_only( + # date = readr::col_date(format = ""), + date = readr::col_integer(), + title = readr::col_character(), + description = readr::col_character(), + color = readr::col_character() + ) +} + +spaghetti_1 <- function( + d, response_variable, color_variable="model_name", group_variable="program_code", + time_variable="month", facet_variable="model_name", + width_variable="emphasis", alpha_variable="emphasis", loess_variable=facet_variable, + + path_in_annotation = "data-public/raw/programs/cqi-annotation-example.csv", + + base_size=12, + point_size=0L, y_min=0, y_max=NA, + main_title=NULL, x_title=NULL, y_title=NULL, sub_title=NULL, + y_label_format=scales::comma, palette=palette_model(), + width=c("focus"=1, 
"background"=.25) + # width=c("focus"=.25, "background"=.25) +) { + group_symbol <- ifelse(is.null(group_variable), NULL, rlang::sym(group_variable)) + time_symbol <- rlang::sym(time_variable) + response_symbol <- rlang::sym(response_variable) + color_symbol <- rlang::sym(color_variable) + width_symbol <- rlang::sym(width_variable) + + g <- ggplot(d, aes(x=!!time_symbol, y=!!response_symbol, color=!!color_symbol, size=!!width_symbol, yMin=y_min)) + + + if( !is.null(group_variable) & nrow(d)>0L ) { + d_label <- d %>% + dplyr::group_by(!!group_symbol) %>% + dplyr::arrange(!!time_symbol) %>% + dplyr::mutate( + is_first = (dplyr::row_number() == 1L), + is_last = (dplyr::row_number() == dplyr::n()), + ) %>% + dplyr::filter(is_first | is_last) %>% + dplyr::select( + !!group_symbol, + !!time_symbol, + !!response_symbol, + !!color_symbol, + is_first, + is_last + ) %>% + dplyr::ungroup() + + d_label_left <- d_label[d_label$is_first, ] + d_label_right <- d_label[d_label$is_last , ] + + g <- g + + geom_text(mapping=aes(label=!!group_symbol), data=d_label_left , size=3, hjust=1.2, na.rm=T) + #Left endpoint + geom_text(mapping=aes(label=!!group_symbol), data=d_label_right, size=3, hjust=-.2, na.rm=T) #Right endpoint + + # geom_text(mapping=aes_string(label=group_variable), data=d_label , size=3, hjust=1.2, na.rm=T) + rm(d_label, d_label_left, d_label_right) + } + + # g <- g + geom_hline(yintercept=c(median(d[[response_variable]], na.rm=T), mean(d[[response_variable]], na.rm=T)), color="gray70", linetype="F3") + # g <- g + geom_smooth(aes_string(group=facet_variable), method="loess", color="gray30", na.rm=TRUE) + # g <- g + annotate("text", x=max(d[[time_variable]], na.rm=T), y=Inf, label=sub_title, hjust=1, vjust=1) + + if( !is.null(loess_variable) ) { + g <- g + geom_smooth(aes(group=!!rlang::sym(loess_variable)), method="loess", color="gray80", size=4, alpha=.1, na.rm=T, se=F) + } + if( !is.na(y_max) ) { + g <- g + coord_cartesian(ylim=c(y_min, y_max)) + } + + g <- g + 
geom_line(aes_string(group=group_variable, alpha=alpha_variable), stat="identity", na.rm=TRUE) + + geom_point(aes_string(group=group_variable, alpha=alpha_variable), size=point_size, stat="identity", shape=1, na.rm=TRUE) + + scale_y_continuous(labels=y_label_format) + + scale_alpha_manual(values=c("focus"=1, "background"=.5)) + # scale_alpha_manual(values=c("focus"=.5, "background"=.5)) + if( !is.null(path_in_annotation) ) { + d_annotation <- readr::read_csv(path_in_annotation, col_types=col_types_annotation(), comment="#") + + g <- g + geom_vline(data=d_annotation, aes(xintercept=as.numeric(date)), size=.25, color="gray45") + + geom_text(data=d_annotation, aes(x=date, y=-Inf, label=title), angle=90, vjust=0, hjust=0, size=3, color="gray45") + } + + if( !is.null(width) ) + g <- g + scale_size_manual(values=width) + + if( !is.null(palette) ) + g <- g + scale_color_manual(values=palette) #+ scale_fill_manual(values=palette) + + + if( !is.null(facet_variable) ) + g <- g + facet_wrap(facet_variable, scales="free_y") + + g <- g + + guides(color="none") + + guides(alpha="none") + + guides(size="none") + + # package_theme(base_size) + + theme_minimal(base_size) + + labs(title=main_title, x=x_title, y=y_title, subtitle=sub_title) + + return( g ) +} + +create_palette <- function( spaghetti_id, rainbow_start=30, rainbow_end=300, rainbow_c=100, rainbow_l=50 ) { + # strand_count <- dplyr::n_distinct(spaghetti_id) + strand_name <- sort(unique(spaghetti_id)) + strand_count <- length(strand_name) + + if( strand_count == 2L ) { + # palette_strand <- c("#fd8450", "#b177fc") # http://colrd.com/image-dna/36377/ + palette_strand <- c("#057871", "#9c8a4a") # http://colrd.com/image-dna/24034/ + + } else if( strand_count <= 9L ) { + palette_strand <- RColorBrewer::brewer.pal(strand_count,"Set1") + } else { + # stop("Only 12 providers are currently supported by this palette-generating function.") + # palette_strand <- rainbow(strand_count) + palette_strand <- 
colorspace::rainbow_hcl(strand_count, start=rainbow_start, end=rainbow_end, c=rainbow_c, l=rainbow_l) + } + + names(palette_strand) <- strand_name + + palette_strand +} + +histogram_2 <- function( + d_observed, + variable_name, + bin_width = NULL, + main_title = base::gsub("_", " ", variable_name, perl=TRUE), + sub_title = NULL, + # caption = paste0("each bin is ", scales::comma(bin_width), " units wide"), + tab_title = paste0("\n\n### ", base::gsub("_", " ", variable_name, perl=TRUE), "\n\n"), + x_title = variable_name, + y_title = "Count", + x_axis_format = scales::comma, + x_limits = NULL, + hover_text_template = "There were {count} occasions with\n values between {boundary_left_pretty} and {boundary_right_pretty}.", + # new_tab = FALSE, + rounded_digits = 0L, + font_base_size = 12 +) { + + if( !inherits(d_observed, "data.frame") ) + stop("`d_observed` should inherit from the data.frame class.") + + + # Uses d3 formats: https://github.com/d3/d3-format/blob/master/README.md#locale_format + # percent format example: https://stackoverflow.com/questions/42043633/format-y-axis-as-percent-in-plot-ly + # comma format example: https://stackoverflow.com/questions/43436009/change-comma-and-thousand-separator-in-tick-labels + + x_axis_format_string <- deparse(x_axis_format) + if( identical(x_axis_format_string, deparse(scales::comma_format())) ) { + tickformat <- paste0(",.", rounded_digits, "f") + } else if( identical(x_axis_format_string, deparse(scales::percent_format())) ) { + tickformat <- ",.0%" + } else { + tickformat <- paste0(".", rounded_digits, "f") + } + + x <- d_observed[[variable_name]] + # missing_count <- sum(is.na(x)) + x <- x[!is.na(x)] + non_empty <- (nrow(d_observed) >= 1L) + + if( non_empty ) { + } else { + main_title <- paste0("Empty: ", main_title) + caption <- "The variable contains only missing values.\nThere is nothing to graph." 
+ } + + if( !is.null(x_limits) & !is.null(bin_width) ) { + histogram_breaks <- pretty(x_limits, n = diff(range(x_limits)) / bin_width) + } else if( 1L<=length(x) & !is.null(bin_width) ) { + histogram_breaks <- pretty(x, n = diff(range(x)) / bin_width) + } else if( 1L<=length(x) & !is.null(x_limits) ) { + histogram_breaks <- pretty(c(x, x_limits), n=7) + } else if( length(x)==0L & !is.null(x_limits) ) { + histogram_breaks <- x_limits + } else if( length(x)==0L ) { + histogram_breaks <- c(0, 1) + } else { + histogram_breaks <- pretty(x, n=7) + } + # browser() + + histrv <- hist( + x = x, + breaks = histogram_breaks, + right = FALSE, # The left boundary is closed/inclusive. + plot = FALSE + ) + + ds_stoplight_bin <- tibble::tibble( + boundary_left = histrv$breaks[-length(histrv$breaks)], + boundary_right = histrv$breaks[-1], + count = histrv$counts + ) %>% + dplyr::mutate( + midpoint = (boundary_right + boundary_left) / 2, + width = (boundary_right - boundary_left), + + boundary_left_pretty = x_axis_format(boundary_left ), + boundary_right_pretty = x_axis_format(boundary_right ), + midpoint_pretty = x_axis_format(midpoint ), + width_pretty = x_axis_format(width ), + + #category = cut(boundary_left, breaks=c(-Inf, .3, 1.4, 3, Inf), labels = c("Good", "Ok", "Bad", "Jesum")), + category = cut(boundary_left, breaks=c(-Inf, Inf), labels = c("")), + hover_text = glue::glue_data(., hover_text_template) + ) + + title_graph_font <- list( + family = "'Oswald', sans-serif", + size = 16, + color = "#333" + ) + title_axis_font <- list( + family = "'Oswald', sans-serif", + size = 22, + color = "#666" + ) + label_axis_font <- list( + family = "'Oswald', sans-serif", + size = 18, + color = "#888" + ) + + + if( !is.null(tab_title) ) { + cat(tab_title) + } + + plot_ly(ds_stoplight_bin, alpha = 0.6) %>% + add_bars( + x=~midpoint, y=~count, width=~width, color=~category, text=~count, hovertext=~hover_text, + hoverinfo = 'text', + # marker = list(line = list(color = '#AAAAAA', width = 
1.5)), colors = c("#FF99cc", "#0dbab1","#ffd400", "#ff1a1a" ) + marker = list(line = list(color = '#AAAAAA', width = 1.5)), colors = c("#EBEBEB") + ) %>% + layout( + title = paste0("\n", main_title), + font = title_graph_font, + xaxis = list( + title = x_title, + zeroline = FALSE, + titlefont = title_axis_font, + tickfont = label_axis_font, + autotick = TRUE, + tickformat = tickformat, + dtick = .5 + ), + yaxis = list( + title = y_title, + # hoverformat = '.2f', + titlefont = title_axis_font, + tickfont = label_axis_font + ) + ) +} + + +# activity_scatter <- function( d_plot, x_name=NULL, color_name=NULL, variable_name="Count", #sizeName=NULL, +# main_title=NULL, x_title=NULL, y_title=NULL, log10_scale=FALSE, y_label_format=scales::comma, base_size=8, palette=NULL +# ) { +# +# g <- ggplot(d_plot, aes_string(x=x_name, y=variable_name, label=x_name, color=color_name, y_min=0) ) + #, size=sizeName +# geom_text(size=4, fontface=2, na.rm=TRUE) +# if( log10_scale ) { +# g <- g + scale_y_continuous(labels=scales::comma_format(), trans="log10", breaks=c(1, 5, 10, 50, 100, 500, 1000, 5000 )) + +# annotation_logticks(sides="l") +# # g <- g + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x, n=3), +# # labels = trans_format("log10", function(x) 10^x), #General placement +# # # labels = trans_format("log10", math_format(10^.x)), #General placement +# # # labels =c(0, 1, 10, 100, 1000), #Works well when the bin sizes ranges from 0 to 1000. 
+# # minor_breaks=log10(5) + -1:3) +# } +# if( !log10_scale ) { +# g <- g + scale_y_continuous(labels=y_label_format) #percent_format() +# } +# if( !is.null(palette) ) +# g <- g + scale_color_manual(values=palette) + +# guides(color="none", size="none") + +# labs(title=main_title, x=x_title, y=y_title) + +# repo_theme(base_size) +# return( g ) +# } +# +# activity_each_month <- function( +# d_plot, color_variable=NULL, month_variable="Month", response_variable="Count", width_variable=NULL, loess_variable=NULL, +# y_min=0, y_max=NA, +# main_title=NULL, x_title=NULL, y_title=NULL, y_label_format=scales::comma, base_size=8, palette=NULL, width=NULL +# ) { +# +# ds_label_left <- d_plot[d_plot[[month_variable]] == min(d_plot[[month_variable]], na.rm=T), ] +# ds_label_right <- d_plot[d_plot[[month_variable]] == max(d_plot[[month_variable]], na.rm=T), ] +# +# g <- ggplot(d_plot, aes_string(x=month_variable, y=response_variable, color=color_variable, size=width_variable, y_min=y_min)) + +# geom_hline(yintercept=c(median(d_plot[[response_variable]], na.rm=T), mean(d_plot[[response_variable]], na.rm=T)), color="gray70", linetype="F3") + +# geom_text(mapping=aes_string(label=color_variable), data=ds_label_left , size=3, hjust=1.2, na.rm=T) # Left endpoint + +# geom_text(mapping=aes_string(label=color_variable), data=ds_label_right, size=3, hjust=-.2, na.rm=T) # Right endpoint +# +# if( !is.null(loess_variable) ) { +# g <- g + geom_smooth(aes_string(group=loess_variable), method="loess", color="gray10", fill="gray75", na.rm=T) +# } +# if( !is.na(y_max) ) { +# g <- g + coord_cartesian(ylim=c(y_min, y_max)) +# } +# +# g <- g + geom_line(stat="identity", alpha=.5, na.rm=TRUE) + +# scale_y_continuous(labels=y_label_format) +# +# if( !is.null(width) ) +# g <- g + scale_size_manual(values=width) +# +# if( !is.null(palette) ) +# g <- g + scale_color_manual(values=palette) + +# guides(color="none") + +# labs(title=main_title, x=x_title, y=y_title) + +# repo_theme(base_size) +# +# 
return( g ) +# } diff --git a/analysis/common/styles.css b/analysis/common/styles.css new file mode 100644 index 0000000..02af397 --- /dev/null +++ b/analysis/common/styles.css @@ -0,0 +1,7 @@ +#session-info { + color: #aaa; # gray +} + +.highlight { + background-color: #ffd; # pale yellow +} diff --git a/config.yml b/config.yml new file mode 100644 index 0000000..3659e4c --- /dev/null +++ b/config.yml @@ -0,0 +1,28 @@ +default: + # To be processed by Ellis lanes + path_subject_1_raw: "data-public/raw/subject-1.csv" + path_mlm_1_raw: "data-public/raw/mlm-1.csv" + + path_car_raw: "data-public/raw/mtcar.csv" + + path_ss_county: "data-public/metadata/ss-county.csv" + + # Central Database (produced by Ellis lanes). + path_database: "data-public/derived/db.sqlite3" + + # Derived Rectangles (produced by Ellis lanes). + + # Analysis-ready datasets (produced by scribes & consumed by analyses). + + path_county_derived: "data-public/derived/county.rds" + path_county_year_derived: "data-public/derived/county-year.rds" + path_mlm_1_derived: "data-public/derived/mlm-1.rds" + + path_te_county: "data-public/derived/te/te-county.rds" + path_te_county_month: "data-public/derived/te/te-county-month.rds" + + path_car_derived: "data-public/derived/car.rds" # Actually produced by an Ellis + + path_annotation: "data-public/metadata/cqi-annotation.csv" + + path_log_flow: !expr strftime(Sys.time(), "data-unshared/log/%Y/%Y-%m/flow-%Y-%m-%d--%H-%M-%S.log") diff --git a/data-public/README.md b/data-public/README.md new file mode 100644 index 0000000..c895f65 --- /dev/null +++ b/data-public/README.md @@ -0,0 +1,6 @@ +`data-public/` Directory +========= + +This directory should contain only datasets that DO NOT hold [PHI](https://www.hhs.gov/answers/hipaa/what-is-phi/index.html) (Protected Health Information), or any other sensitive information. 
Files with PHI should **not** be stored in a GitHub repository, even a [private GitHub repository](https://help.github.com/articles/publicizing-or-hiding-your-private-contributions-on-your-profile/). We recommend using an enterprise database (such as SQL Server, PostgreSQL, MySQL, or Oracle) to store the data, and read & write the information to/from the software right before and after it's used. These databases typically secure the information at rest, and then require user authentication/authorization (to reduce the chance of sensitive information being accessed by those not approved by your [IRB](https://en.wikipedia.org/wiki/Institutional_review_board)). + +If a database isn't feasible, consider storing the files in `data-unshared/`, whose contents are not committed to the repository; a line in the `.gitignore` file keeps the files uncommitted/unstaged. However, there could be some information that is sensitive enough that it shouldn't even be stored locally without encryption (such as PHI). diff --git a/data-public/derived/README.md b/data-public/derived/README.md new file mode 100644 index 0000000..59a95b1 --- /dev/null +++ b/data-public/derived/README.md @@ -0,0 +1,13 @@ +`./data-public/derived/` Directory +========= + +This directory should contain only data files that can be derived from the raw data files (ie, those in [`./data-public/raw/`](./data-public/raw/)) **using code contained in this repository**. Unlike the raw data files, proprietary & binary formats are acceptable, since the repository's code should be able to reproduce them. + +When using `R`, the *.rds files are well-suited here, since they are smaller than CSV (thus quicker to load) and persist the metadata (such as factor labels). + +These are the processed versions of the raw, unmodified files that serve as an input to the project. In theory the schema of these data files shouldn't change when new data arrive.
But of course this is frequently violated, so at minimum, our code should assert that the required columns are present, and contain reasonable values. More thorough checking can be warranted. + +For the sake of long-term reproducibility, these files are ideally in a nonproprietary format that is human readable. Plain text files (eg, CSVs & XML) are preferred. Binary & proprietary formats (eg, Excel & SAS) may not be readable if certain software is missing from the user's computer; or they might be able to be read by only old versions of software (eg, Excel 97). + +## No PHI +Files with PHI should **not** be stored in a GitHub repository, even a private GitHub repository. We recommend using an enterprise database (such as MySQL or SQL Server) to store the data, and read & write the information to/from the software right before and after it's used. If a database isn't feasible, consider storing the files in [`./data-unshared/`](./data-unshared/), whose contents are not committed to the repository; a line in [`./.gitignore/`](./.gitignore/) keeps the files uncommitted/unstaged. However, there could be some information that is sensitive enough that it shouldn't even be stored locally without encryption (such as PHI). diff --git a/data-public/metadata/README.md b/data-public/metadata/README.md new file mode 100644 index 0000000..4219b82 --- /dev/null +++ b/data-public/metadata/README.md @@ -0,0 +1,9 @@ +`./data-public/metadata/` Directory +========= + +This directory should contain only data files that describe the structure of the project or other datasets. For example, specifying that `1` and `2` represent 'male' and 'female'. + +Ideally datasets are stored as CSVs, so they are easily portable, accessible, and modifiable from any software. + +## No PHI +Files with PHI should **not** be stored in a GitHub repository, even a private GitHub repository.
We recommend using an enterprise database (such as MySQL or SQL Server) to store the data, and read & write the information to/from the software right before and after it's used. If a database isn't feasible, consider storing the files in [`./data-unshared/`](./data-unshared/), whose contents are not committed to the repository; a line in [`./.gitignore/`](./.gitignore/) keeps the files uncommitted/unstaged. However, there could be some information that is sensitive enough that it shouldn't even be stored locally without encryption (such as PHI). diff --git a/data-public/raw/README.md b/data-public/raw/README.md new file mode 100644 index 0000000..8c08ac1 --- /dev/null +++ b/data-public/raw/README.md @@ -0,0 +1,9 @@ +`./data-public/raw/` Directory +========= + +This directory should contain the raw, unmodified files that serve as an input to the project. In theory the schema of these data files shouldn't change when new data arrive. But of course this is frequently violated, so at minimum, our code should assert that the required columns are present, and contain reasonable values. More thorough checking can be warranted. + +For the sake of long-term reproducibility, these files are ideally in a nonproprietary format that is human readable. Plain text files (eg, CSVs & XML) are preferred. Binary & proprietary formats (eg, Excel & SAS) may not be readable if certain software is missing from the user's computer; or they might be able to be read by only old versions of software (eg, Excel 97). + +## No PHI +Files with PHI should **not** be stored in a GitHub repository, even a private GitHub repository. We recommend using an enterprise database (such as MySQL or SQL Server) to store the data, and read & write the information to/from the software right before and after it's used.
If a database isn't feasible, consider storing the files in `./data-unshared/`, whose contents are not committed to the repository; a line in `./.gitignore/` keeps the files uncommitted/unstaged. However, there could be some information that is sensitive enough that it shouldn't even be stored locally without encryption (such as PHI). diff --git a/data-unshared/README.md b/data-unshared/README.md new file mode 100644 index 0000000..5e212e2 --- /dev/null +++ b/data-unshared/README.md @@ -0,0 +1,8 @@ +`data-unshared/` Directory +========= + +Files in this directory are stored locally, but not staged/committed and sent to the central GitHub repository. A line in the `.gitignore` file keeps the files uncommitted/unstaged. + +Even though these files are kept off the central repository, this directory still should not contain anything sensitive enough that it requires encryption when stored on your local drive (such as PHI). See the `data-public/` [`README.md`](data-public/) for more information. + +Since files in this directory are not staged/committed, it's tough to communicate with collaborators what the files should look like on their computers. Try to keep a list updated at `data-unshared/contents.md`. diff --git a/data-unshared/contents.md b/data-unshared/contents.md new file mode 100644 index 0000000..fa0e45c --- /dev/null +++ b/data-unshared/contents.md @@ -0,0 +1,11 @@ +Contents of `./data-unshared/` Directory +========= +Since files in this directory are not staged/committed, it's tough to communicate with collaborators what the files should look like on their computers. Try to keep this list updated.
+ +### Files in `./data-unshared/raw/` +* psychopathy-2014-03-20.csv (formerly called `Psychopathy 20March2014.csv`) +* heart_attack-2014-02-03.csv (formerly called `HeartAttack 03-02-14 Final.csv`) + +### Files in `./data-unshared/derived/` +* psychopathy.rds +* heart_attack.rds diff --git a/documentation/README.md b/documentation/README.md new file mode 100644 index 0000000..338d633 --- /dev/null +++ b/documentation/README.md @@ -0,0 +1,3 @@ +# Documentation Directory + +In this directory, include documentation about the datasets, such as codebooks. Make sure the documentation doesn't include passwords, PHI, or anything else that should remain private. Even if the repository is private, we do NOT treat GitHub as HIPAA/FERPA compliant (this is just our instinct, and not the official advice from a lawyer). diff --git a/flow.R b/flow.R new file mode 100644 index 0000000..5c7a0fc --- /dev/null +++ b/flow.R @@ -0,0 +1,113 @@ +# knitr::stitch_rmd(script="flow.R", output="stitched-output/flow.md") +rm(list = ls(all.names = TRUE)) # Clear the memory of variables from previous run. This is not called by knitr, because it's above the first chunk. + +# ---- load-sources ------------------------------------------------------------ + +# ---- load-packages ----------------------------------------------------------- +import::from("magrittr", "%>%") + +requireNamespace("purrr") +requireNamespace("rlang") +# requireNamespace("checkmate") +requireNamespace("OuhscMunge") # remotes::install_github("OuhscBbmc/OuhscMunge") + +# ---- declare-globals --------------------------------------------------------- +# Allow multiple files below to have the same chunk name. +# If the `root.dir` option is properly managed in the Rmd files, no files will be overwritten. 
+options(knitr.duplicate.label = "allow") + +config <- config::get() + +# open log +if( interactive() ) { + sink_log <- FALSE +} else { + message("Creating flow log file at ", config$path_log_flow) + + if( !dir.exists(dirname(config$path_log_flow)) ) { + # Create a month-specific directory, so they're easier to find & compress later. + dir.create(dirname(config$path_log_flow), recursive=T) + } + + file_log <- file( + description = config$path_log_flow, + open = "wt" + ) + sink( + file = file_log, + type = "message" + ) + sink_log <- TRUE +} +ds_rail <- tibble::tribble( + ~fx , ~path, + + # Simulate observed data + + # ETL (extract-transform-load) the data from the outside world. + + + # Reports for human consumers. + "run_rmd" , "analysis/car-report-1/car-report-1.Rmd" + + # Dashboards for human consumers. + # "run_rmd" , "analysis/dashboard-1/dashboard-1.Rmd" +) + +run_r <- function( minion ) { + message("\nStarting `", basename(minion), "` at ", Sys.time(), ".") + base::source(minion, local=new.env()) + message("Completed `", basename(minion), "`.") + return( TRUE ) +} +run_sql <- function( minion ) { + message("\nStarting `", basename(minion), "` at ", Sys.time(), ".") + OuhscMunge::execute_sql_file(minion, config$dsn_staging) + message("Completed `", basename(minion), "`.") + return( TRUE ) +} +run_rmd <- function( minion ) { + message("\nStarting `", basename(minion), "` at ", Sys.time(), ".") + path_out <- rmarkdown::render(minion, envir=new.env()) + Sys.sleep(3) # Sleep for three secs, to let pandoc finish + message(path_out) + return( TRUE ) +} + +(file_found <- purrr::map_lgl(ds_rail$path, file.exists)) +if( !all(file_found) ) { + warning("--Missing files-- \n", paste0(ds_rail$path[!file_found], collapse="\n")) + stop("All source files to be run should exist.") +} + +# ---- load-data --------------------------------------------------------------- + +# ---- tweak-data -------------------------------------------------------------- + +# ---- run 
--------------------------------------------------------------------- +message("Starting flow of `", basename(base::getwd()), "` at ", Sys.time(), ".") + +warn_level_initial <- as.integer(options("warn")) # Remember the current warning level so it can be restored after the run. +# options(warn=0) # warnings are stored until the top-level function returns +# options(warn=2) # treat warnings as errors + +elapsed_duration <- system.time({ # Time the whole run. + purrr::map2_lgl( # For each row of `ds_rail`, call the function named in `fx` on that row's `path`. + ds_rail$fx, + ds_rail$path, + function(fn, args) rlang::exec(fn, !!!args) + ) +}) + +message("Completed flow of `", basename(base::getwd()), "` at ", Sys.time(), "") +elapsed_duration +options(warn=warn_level_initial) # Restore whatever warning level you started with. + +# ---- close-log --------------------------------------------------------------- +if( sink_log ) { + sink(file = NULL, type = "message") # ends the last diversion (of the specified type). + close(file_log) # A message sink never closes its connection, so release the log file here; `file_log` exists only when `sink_log` is TRUE. + message("Closing flow log file at ", gsub("/", "\\\\", config$path_log_flow)) +} + +# bash: Rscript flow.R diff --git a/manipulation/README.md b/manipulation/README.md new file mode 100644 index 0000000..8e0dfc7 --- /dev/null +++ b/manipulation/README.md @@ -0,0 +1,4 @@ +`manipulation/` Directory +========= + +Files in this directory manipulate/groom/munge the project data. The resulting "derived" datasets produce less friction when analyzing. By centralizing most (and ideally all) of the manipulation code in one place, it's easier to determine how the data was changed before analyzing. It also reduces duplication of manipulation code, so analyses in different files are more consistent and understandable. 
diff --git a/suku-cqi-1.Rproj b/suku-cqi-1.Rproj new file mode 100644 index 0000000..9833a70 --- /dev/null +++ b/suku-cqi-1.Rproj @@ -0,0 +1,20 @@ +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: No + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: knitr +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source