-
Notifications
You must be signed in to change notification settings - Fork 0
/
CMAS_Clean.R
207 lines (167 loc) · 7.43 KB
/
CMAS_Clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
## CMAS Clean
require(googlesheets)
require(tidyverse)
require(lubridate)
require(stringr)
ss_new <- gs_key("1Xw4JefUCS4HHQ0KpvKhr-DjklqzhH3_CeA-zhoAuQfI", visibility = "private") #CMAS_Alerts_Processed
day_file_name <- paste0("data/",today(),"-msgfile.csv")
load_msgs <- function() {
msg <- gs_read(ss = ss_new
, col_names = c("rec_time", "cmac", "full_text")
, coltypes = "Tcc", skip = 1, trim_ws = TRUE) %>%
mutate(rec_time = mdy_hms(gsub(" at ", " ", rec_time)
, tz = "America/New_York"
, truncated = 3)
) %>%
separate(full_text,
c("blank", "gateway_id" ,"msg_id"
,"special_handling", "message_type"
, "category", "response_type", "severity"
, "urgency", "certainty", "expire_time"
, "text_language", "alert_message","dummy")
, sep = "CMAC_[:word:]*: "
, fill = "right" ## drops the warning for rows with too many records
, remove = TRUE
)
## creates a table for fields with "update" records
updates <- filter(msg, nchar(special_handling) < 10) %>%
select(rec_time, cmac, gateway_id, msg_id
, ref_id = special_handling
, special_handling = message_type
, message_type = category
, category = response_type
, response_type = severity
, severity = urgency
, urgency = certainty
, certainty = expire_time
, text_language = alert_message
, alert_message = dummy
)
msg <- filter(msg, nchar(special_handling) >= 10) %>%
select(-blank, -dummy)
## puts all the records back into a single table and
## uses two different separators to split out the alert
## text from the plain English "area" field
## and finally removes the tcs boilerplate
msg <- bind_rows(msg, updates) %>%
mutate(expire_time = ymd_hms(expire_time)) %>%
separate(alert_message, c("message_text","t2")
, sep = "Targeted Areas: "
, remove = TRUE) %>%
separate(t2, c("areas"), sep = "[:punct:]{4}"
, extra = "drop", remove = TRUE) %>%
mutate(threat_type = gsub("\\. .*","", cmac)
, msg_id = str_trim(msg_id)
, areas = str_trim(areas)) %>%
dplyr::filter(!(gateway_id == "http://tcs.tsis.com\n") )
msg <- msg[-grep(" test", msg$threat_type),]
#write.csv(msg, file = day_file_name)
return(msg)
}
## Download local copy of FIPS lookup data and read into memory
load_fips <- function() {
fips_lookup <- albersusa::counties_sf() %>%
transmute(fips, name, lsad
, state, abb = iso_3166_2
, value = paste(abb, name))
}
## isolates the state and county, given column with
## comma separated list in format "city (ST)"
area_find <- function(area_list) {
area_list <- str_replace_all(area_list
, pattern = "(([A-z]*) \\(([A-Z]{2})\\)), \\1"
, replacement = "\\2 city \\(\\3\\), \\2 \\(\\3\\)"
)
m <- str_match_all(string = area_list
, pattern = "[A-z. ]{3,} ")
n <- str_match_all(string = area_list
, pattern = "\\(?([A-Z]{2})\\)?")
area_clean <- paste(n[[1]][,2]
, str_trim(m[[1]][,1], side = "both")) %>%
return() # %>%
# ## Clean TCS county names to match list from albersusa county map
# ##
# str_replace_all(pattern = "E\\.",replacement = "East") %>%
# str_replace_all(pattern = "W\\.",replacement = "West") %>%
# str_replace_all(pattern = "(IN La|IL La) (Porte|Salle)",replacement = "\\1\\2") %>%
# str_replace_all(pattern = "FL Dade", "FL Miami-Dade") %>%
# str_replace_all(pattern = "PR lsabela", "PR Isabela") %>%
# str_replace_all(pattern = "TX wall", "TX Wall") %>%
# str_replace_all(pattern = "TX hell", "TX Hall") %>%
# str_replace_all(pattern = "MT Lewis Clark", "MT Lewis and Clark")
# return(area_clean)
}
## Substitutes all counties in a state
## For areas that include only state names
full_state <- function(areas_states) {
if (!exists("fips_lookup")) fips_lookup <- load_fips()
left_join(areas_states, fips_lookup, by = c("areas" = "abb")) %>%
select(msg_id,fips) %>%
return()
}
flatten_fips <- function(msg) {
areas <- select(msg, msg_id, areas)
#seperate out alerts with full state areas, convert directly to fips
areas_states <- filter(areas, str_length(areas) == 2) %>%
full_state()
#remove those alerts from the other areas
areas <- filter(areas, str_length(areas) > 2)
if (!exists("fips_lookup")) fips_lookup <- load_fips()
# create a matrix of areas for each message id that has individual counties
areas <- tapply(areas$areas, area_find
, INDEX = areas$msg_id
, simplify = TRUE) %>%
as.data.frame.array() %>%
unlist(recursive = TRUE) %>%
as_tibble(validate = TRUE) %>%
rownames_to_column() %>%
transmute(msg_id = str_extract(rowname, "[[:alnum:]]{8}")
, value)
fips_msg <- left_join(areas, fips_lookup) %>%
select(msg_id, fips) %>%
rbind(areas_states)
}
# Classify message type -
# Tornado, Flash Flood,
# AMBER, Tsunami, or Other
classify_message <- function(msg) {
mutate(msg, type =
case_when(
grepl("Tornado", msg$message_text, ignore.case = TRUE) ~ "Tornado",
grepl("Flash Flood", msg$message_text, ignore.case = TRUE) ~ "FlashFlood",
grepl("Amber", msg$message_text, ignore.case = TRUE) ~ "AMBER",
grepl("Tsunami", msg$message_text, ignore.case = TRUE) ~ "Tsunami",
TRUE ~ "Other")
) %>%
transmute(msg_id
, rec_time
, expire_time
, response = response_type
, urgency
, wea = message_text
, type = as.factor(type)
)
}
#####################
## Run Functions ###
#####################
# If msg isn't in memory, check to see if we have already
# downloaded the data and cleaned it today. If so, get
# it from the csv created on today's date.
# If we haven't created the msg df today, then load
# from the google sheet
if (!exists("msg")) {
#if (file.exists(day_file_name)) msg <- read.csv(day_file_name)
#else
msg <- load_msgs()
}
if (!exists("fips_msg")){
fips_msg <- flatten_fips(msg)
}
msg2 <- classify_message(msg) %>% unique()
alert_tally <- left_join(fips_msg, msg2) %>%
select(msg_id, fips, type) %>%
count(fips, type) %>%
rename(WEATYPE = type, WEANUM = n) %>%
spread(WEATYPE, WEANUM, fill = "0", convert = TRUE)
alert_tally[is.na(alert_tally)] <- 0