"""Process all historical IR gauge data.
This file loads historical data from the Indian River stream gauge. It loads
values for critical rate of rise, and critical total rise. It then runs
through all the stream gauge data. It flags any data point at which the
stream gauge has met or exceeded these critical values, and captures a
48-hour period of readings around that point. When these readings are
plotted, any known slides that occurred during this period are plotted as
well.
The script then processes known slide events. If a slide event is not
already associated with a critical period, 48 hours of readings around that
slide event are grabbed, and these are plotted as well.
We want to be able to use the output to answer the following questions. In
this context, 'notification time' refers to the time between the first
critical point identified in a critical period, and the time at which a
slide occurred. This is the amount of time people would have to respond
if a notification were issued the moment the first critical point was
identified.
A true positive is a critical point associated with a slide,
a false positive is a critical point with no associated slide. A false
negative is a slide with no associated critical point.
- How many slides were preceded by a critical point?
- What do the notification times look like?
- How many true and false positives and false negatives were there?
- How many notifications would have been issued over a 5-year period?
- Was there anything special about slides that were missed? (false negative)
- Was there anything special about critical points with no slide? (false
positive)
"""
import argparse
import pickle
from os import listdir, path
from pathlib import Path

import plot_heights as ph
from slide_event import SlideEvent
import utils.analysis_utils as a_utils
from utils.stats import stats

# Define cli arguments.
parser = argparse.ArgumentParser()
parser.add_argument('--no-interactive-plots',
    help="Do not generate interactive plots.",
    action='store_true')
parser.add_argument('--no-static-plots',
    help="Do not generate static plots.",
    action='store_true')
parser.add_argument('--use-cached-data',
    help="Use pickled data; don't parse raw data files.",
    action='store_true')

args = parser.parse_args()
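
# Example invocations (illustrative; they use only the flags defined above):
#   python process_hx_data.py
#   python process_hx_data.py --no-interactive-plots
#   python process_hx_data.py --use-cached-data --no-static-plots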

def process_hx_data(root_output_directory='', data_files=None):
    """Process all historical data in ir_data_clean/.

    - Get known slide events.
    - Get readings from file.
    - Pull interesting reading sets from readings. Analysis is done here.
    - Pickle reading sets.
    - Plot reading sets.
    - Summarize results.

    Accepts a data_files arg, so tests can send test data.

    Does not return anything, but generates:
    - pkl files of reading sets.
    - html files containing interactive plots.
    - png files containing static plots.
    - console output summarizing what was found.
    """
    # Get known slides.
    slides_file = 'known_slides/known_slides.json'
    known_slides = SlideEvent.load_slides(slides_file)

    # DEV: Should probably walk the ir_data_clean directory, instead of making
    #   this list manually.
    if not data_files:
        data_files = [
            'ir_data_clean/irva_utc_072014-022016_hx_format.txt',
            'ir_data_clean/irva_akdt_022016-033124_arch_format.txt'
        ]

    reading_sets = []
    if not args.use_cached_data:
        print("Parsing raw data files...")
        for data_file in data_files:
            readings = a_utils.get_readings_from_data_file(data_file)
            reading_sets += a_utils.get_reading_sets(
                readings, known_slides, stats)

    if not args.use_cached_data:
        print("Pickling reading sets...")
        for reading_set in reading_sets:
            # Pickle reading sets for faster analysis and plotting later,
            #   and for use by other programs.
            a_utils.pickle_reading_set(reading_set, root_output_directory)

    if args.use_cached_data:
        print("Reading data from pickled files...")
        pkl_file_path = 'other_output/'
        pkl_files = [f for f in listdir(pkl_file_path)
                     if path.isfile(path.join(pkl_file_path, f))
                     and Path(f).suffix == '.pkl']
        for pkl_file in pkl_files:
            filename = f"{pkl_file_path}{pkl_file}"
            with open(filename, 'rb') as f:
                reading_set = pickle.load(f)
                reading_sets.append(reading_set)
    if not args.no_interactive_plots:
        print("Generating interactive plots...")
        for reading_set in reading_sets:
            critical_points = a_utils.get_critical_points(reading_set)
            ph.plot_data(
                reading_set,
                known_slides=known_slides,
                critical_points=critical_points,
                root_output_directory=root_output_directory)

    if not args.no_static_plots:
        print("Generating static plots...")
        for reading_set in reading_sets:
            critical_points = a_utils.get_critical_points(reading_set)
            ph.plot_data_static(
                reading_set,
                known_slides=known_slides,
                critical_points=critical_points,
                root_output_directory=root_output_directory)

    if not args.use_cached_data:
        a_utils.summarize_results(reading_sets, known_slides, stats)

if __name__ == '__main__':
    # Make sure dir for generated plots exists.
    plots_dir = Path("current_ir_plots")
    if not plots_dir.exists():
        plots_dir.mkdir()

    process_hx_data()
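
# As noted in the docstring, process_hx_data() accepts a data_files arg so
# tests can pass in their own data. A direct call might look like this
# (hypothetical paths, shown only as a sketch):
#
#   process_hx_data(
#       root_output_directory='test_output/',
#       data_files=['tests/data/sample_readings.txt'])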