-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.R
164 lines (151 loc) · 4.64 KB
/
data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Data Wrangling
library(dplyr)
library(reshape2)
# Statistical Analysis: Regression
library(survey)
library(forecast)
# Statistical Analysis: Trees
library(tree)
library(randomForest)
# Visualization and Reporting
library(pander)
library(gridExtra)
library(ggplot2)
library(stargazer)
#
# The SQL extraction below is no longer run: its result was saved to 'dep.csv'.
#
# # ==== Load NHIS Data from Postgres Database ====
# source('postgres_pw.R') # Get Postgres password and user name
# source('sql_queries.R')
#
# nhis_db <- src_postgres("nhis", user=postgres_user, password=password)
#
# # ==== Load Person File for Each Year ====
# # Create a bunch of temporary tables
# for (yr in DATA_RANGE) {
# compute(
# tbl(nhis_db,
# sql(sprintf(
# query.person_gen,
# yr)
# )),
# name=sprintf("person_%s", yr))
# }
#
# # ==== Load Sample Adult File for Each Year ====
# # Keep this for loop separate from above to allow for easier debugging
# for (yr in DATA_RANGE) {
# compute(
# tbl(nhis_db,
# sql(sprintf(
# query.sample_adult_gen(yr),
# yr)
# )),
# name=sprintf("sample_adult_%s", yr))
# }
#
# # ==== Load Family File for Each Year ====
# # Keep this for loop separate from above to allow for easier debugging
# for (yr in DATA_RANGE) {
# compute(
# tbl(nhis_db,
# sql(sprintf(
# query.family_gen,
# yr)
# )),
# name=sprintf("family_%s", yr))
# }
#
# # ==== Create Aggregate Depression File for Each Year ====
# # Keep this for loop separate from above to allow for easier debugging
# for (yr in DATA_RANGE) {
# compute(
# tbl(nhis_db,
# sql(sprintf(
# query.dep_gen,
# yr, yr, yr)
# )),
# name=sprintf("depression_%s", yr))
# }
#
# # ==== Not Used: Create Aggregate Disability File for 2012-2013 ====
# # Keep this for loop separate from above to allow for easier debugging
# # for (yr in DISAB_RANGE) {
# # compute(
# # tbl(nhis_db,
# # sql(sprintf(
# # query.adult_disab_gen,
# # yr, yr, yr, length(DISAB_RANGE), yr)
# # )),
# # name=sprintf("adult_disab_%s", yr))
# # }
#
# # ==== Not Used: Create temporary table for aggregate disability data ====
# # compute(tbl(nhis_db, sql("
# # SELECT * FROM
# # adult_disab_2012 UNION ALL
# # SELECT * FROM
# # adult_disab_2013")),
# # name="adult_disab_all")
# #
# # disab <- collect(tbl(nhis_db, sql("SELECT * FROM adult_disab_all")))
#
# # ==== Load 2007-2013 Depression Data ====
# # Create temporary table for aggregate depression data
# compute(tbl(nhis_db, sql("
# SELECT * FROM
# depression_2007 UNION ALL
# SELECT * FROM
# depression_2008 UNION ALL
# SELECT * FROM
# depression_2009 UNION ALL
# SELECT * FROM
# depression_2011 UNION ALL
# SELECT * FROM
# depression_2012 UNION ALL
# SELECT * FROM
# depression_2013")),
# name="depression_all")
#
# # Get aggregate data
# dep <- collect(tbl(nhis_db, sql("SELECT * FROM depression_all")), n=Inf)
#
# # Add column which determines assignment to test set
# set.seed(420)
# dep[, "test"] <- ifelse(runif(n=nrow(dep), min=0, max=1) <= 0.2, yes=1, no=0)
#
# # ==== Save Aggregated Depression Data to CSV ====
# write.csv(dep, "dep.csv")
# ==== Load Data from Saved CSV Instead of SQL ====
# Read the aggregated depression extract from 'dep.csv' (path relative to the
# working directory) instead of re-running the Postgres queries.
# NOTE(review): the commented-out export above calls write.csv() without
# row.names = FALSE, so the file presumably carries a leading row-index
# column (read back as `X`) -- confirm before modelling on all columns.
dep <- read.csv(file = "dep.csv", header = TRUE)
# ==== Testing and training set labels ====
# Each object below is a one-column data frame holding a single outcome,
# split on the `test` flag: rows with test == 0 are the training set and
# rows with test == 1 are the test set.

# Outcome: worthless_once
dep.train_labels.worthless <- select(filter(dep, test == 0), worthless_once)
dep.test_labels.worthless <- select(filter(dep, test == 1), worthless_once)

# Outcome: hopeless_once
dep.train_labels.hopeless <- select(filter(dep, test == 0), hopeless_once)
dep.test_labels.hopeless <- select(filter(dep, test == 1), hopeless_once)

# Outcome: abnormal_sleep
dep.train_labels.sleep <- select(filter(dep, test == 0), abnormal_sleep)
dep.test_labels.sleep <- select(filter(dep, test == 1), abnormal_sleep)
# ==== Logistic Regression ====
# Survey design for the training rows (test == 0): no clustering (ids = ~0)
# and `weight_sa` as the sampling weight.
# The weight is given as a one-sided formula so svydesign() looks it up
# inside `data`; this keeps the weights tied to the same filtered rows
# instead of building them from a second, separate filter of `dep`.
dep_design <- svydesign(
  ids = ~0,
  data = dep %>% filter(test == 0),
  weights = ~weight_sa
)
# Shorthand for fitting a survey-weighted GLM with this script's defaults.
#
# Args:
#   frml:   Model formula passed to svyglm().
#   design: Survey design to fit on. Defaults to the global `dep_design`,
#           which is looked up when the function is called.
#   family: GLM family. Defaults to a binomial family with a logit link.
#
# Returns:
#   The fitted model object returned by svyglm().
#
# NOTE(review): the svyglm() documentation suggests quasibinomial() for
# weighted binary outcomes to avoid "non-integer #successes" warnings; the
# binomial default is kept here so existing results are unchanged.
dep_glm <- function(frml,
                    design = dep_design,
                    family = binomial(link = "logit")) {
  svyglm(frml,
         design = design,
         family = family,
         method = "glm.fit")
}