-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path02_apply_model_to_voterfile.R
70 lines (61 loc) · 4.17 KB
/
02_apply_model_to_voterfile.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
library(readr)
library(stringr)
library(dplyr)
library(tidyr)
library(rstanarm)
library(stats)
setwd("~/Projects/trump_voters_stimulus")
source('stats_functions.R')
source('base_data.R')
county_election_results = county_election_results %>%
select(county_fips, biden_vote_frac, trump_vote_frac) %>%
rename(biden_vote_frac_county=biden_vote_frac, trump_vote_frac_county=trump_vote_frac)
state_election_results = state_election_results %>%
select(state_fips, biden_vote_frac, trump_vote_frac) %>%
rename(biden_vote_frac_state=biden_vote_frac, trump_vote_frac_state=trump_vote_frac)
df_voters = read_csv('data/voterfile_appalachia_individual_data.csv') %>%
mutate(postal_fips=paste(state, voters_fips, sep="")) %>% # we generate this to make the next step easier, otherwise we would need to lookup fips for ~60m
#filter(postal_fips %in% arc_counties$postal_fips) %>% # originally ~60m voters (thanks NY and Ohio), but only keep the ones in Appalachia
filter(general_2020 == 'Y') %>% # only keep the ones who voted
rename(postal_code=state) %>%
mutate(state_fips=usmap::fips(postal_code)) %>%
mutate(county_fips=paste(state_fips, voters_fips, sep="")) %>%
mutate(gender=as.numeric(voters_gender == 'F') + 1) %>% # 1 is male, 2 is female -> from CES
mutate(gender=if_else(is.na(gender), 9, gender)) %>% # 9 is not asked from CES, could also use 8, which is skipped
mutate(age_bucket=cut(voters_age, breaks=c(18, 29, 44, 65, Inf))) %>%
mutate(race=case_when(EthnicGroups_EthnicGroup1Desc == 'European' ~ 1, # convert to same (or similar) from CES
EthnicGroups_EthnicGroup1Desc == 'Likely African-American' ~ 2,
EthnicGroups_EthnicGroup1Desc == 'Hispanic or Portuguese' ~ 3,
EthnicGroups_EthnicGroup1Desc == 'East and South Asian' ~ 4,
EthnicGroups_EthnicGroup1Desc == 'Other' ~ 7)) %>%
mutate(race=if_else(is.na(race), 8, race)) %>%
select(lalvoterid, state_fips, county_fips, gender, age_bucket, race, general_2020) %>%
tidylog::left_join(additional_state_data, by='state_fips') %>%
tidylog::left_join(county_election_results, by='county_fips') %>%
tidylog::left_join(state_election_results, by='state_fips') %>%
drop_na(age_bucket) # need to drop where this is true, since stan prediction doesn't work otherwise
load('models/county_model_1.rda')
# stratify data to make applying the model feasible (or a lot faster)
voters_poststrat = df_voters %>%
count(race, age_bucket, gender, pvi, z.average_income, biden_vote_frac_state, biden_vote_frac_county, state_fips, region, county_fips)
# sample from posterior, samples 4000 times.
posterior_prob = posterior_linpred(model, transform=TRUE, newdata=voters_poststrat)
# take mean of posterior samples
voters_poststrat['p_voted_trump'] = colMeans(posterior_prob)
# since the predictions came out of a model, there is no reason to think that the probabilities should sum up
# to what we want them to be (ie. the state or county totals)
# To force this to be true, we adjust the strata probabilities to make sure that they sum up to the
# state/county fraction of Trump/Biden voters.
# If the smallest geographic unit used in the model is state, then the state election results should be passed
# If the smallest geographic unit used in the model is county, then the county election results should be passed
p_voted_trump_corrected = correct_probs(voters_poststrat %>% mutate(fips=county_fips),
'n',
'p_voted_trump',
county_election_results %>% mutate(fips=county_fips),
'trump_vote_frac_county')
voters_poststrat['p_voted_trump_corrected'] = p_voted_trump_corrected
# join the probabilities to the voters based on their strata
df_voters_min = df_voters %>%
left_join(voters_poststrat, by=c('race', 'age_bucket', 'gender', 'pvi', 'z.average_income', 'biden_vote_frac_county', 'biden_vote_frac_state', 'state_fips', 'region', 'county_fips')) %>%
select(lalvoterid, p_voted_trump, p_voted_trump_corrected)
write.csv(df_voters_min, 'outputs/county_model_1_appalachia_voters_p_trump_corrected.csv', row.names=FALSE)