lstm_autoencoder_main.py
import argparse
import random
import time
from os import listdir
from os.path import isfile, join

import numpy as np
import optuna
import pandas as pd
import pymysql
import torch
import torch.utils.data as data_utils
from sklearn import preprocessing

import src.constants as const
import src.lstm_autoencoder as lstm_autoencoder
from config import config as conf
from src.evaluation import Evaluation
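
# Register pymysql as the MySQLdb driver so MySQL-backed Optuna storage URLs can be used.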
pymysql.install_as_MySQLdb()
def combine_plotly_figs_to_html(plotly_figs, html_fname, include_plotlyjs="cdn"):
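    """Write several Plotly figures into a single self-contained HTML report.

    The first figure embeds plotly.js (from the CDN by default); the remaining
    figures are appended as HTML fragments that reuse that same plotly.js copy.
    """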
    with open(html_fname, "w") as f:
        f.write(plotly_figs[0].to_html(include_plotlyjs=include_plotlyjs))
        for fig in plotly_figs[1:]:
            f.write(fig.to_html(full_html=False, include_plotlyjs=False))

def plot_optuna_default_graphs(optuna_study):
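    """Return a list of Optuna's standard visualisations for the given study.

    Currently only the optimization-history plot is generated; further plots
    (e.g. parameter importances) could be appended to the returned list.
    """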
    history_plot = optuna.visualization.plot_optimization_history(optuna_study)
    plot_list = [history_plot]
    return plot_list

def main(config, trial_number="best"):
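    """Train an LSTM autoencoder on normal data and evaluate it on anomaly data.

    `config` supplies WINDOW_SIZE, NUM_LAYERS, BATCH_SIZE and LEARNING_RATE;
    `trial_number` gives each saved model checkpoint a unique name.
    Returns the AUC of the thresholded predictions on the anomaly set.
    """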
    # Pre-requisites
    min_max_scaler = preprocessing.MinMaxScaler()
    dataset_path = const.PROPRIETARY_DATASET_LOCATION

    # Read the normal data
    normal_path = join(dataset_path, 'normal_data/')
    # As the first step, combine the CSVs inside the normal_data folder
    normal_data_files = [f for f in listdir(normal_path) if isfile(join(normal_path, f))]
    normal_df_list = [pd.read_csv(join(normal_path, normal_data_file)) for normal_data_file in normal_data_files]
    # Next, drop the datetime column
    normal_df_list_without_datetime = [normal_df.drop(columns=['datetime']) for normal_df in normal_df_list]
    # Finally, merge those dataframes
    normal_df = pd.concat(normal_df_list_without_datetime)
    normal_df = normal_df.astype(float)
    # Normalise/standardise the normal dataframe
    normal_df_values = normal_df.values
    normal_df_values_scaled = min_max_scaler.fit_transform(normal_df_values)
    normal_df_scaled = pd.DataFrame(normal_df_values_scaled)  # shape = (9652, 9)
    # Read the anomaly data
    anomaly_path = join(dataset_path, 'anomaly_data/')
    # As the first step, combine the CSVs inside the anomaly_data folder
    anomaly_data_files = [f for f in listdir(anomaly_path) if isfile(join(anomaly_path, f))]
    anomaly_df_list = [pd.read_csv(join(anomaly_path, anomaly_data_file)) for anomaly_data_file in anomaly_data_files]
    # Next, drop the datetime column
    anomaly_df_list_without_datetime = [anomaly_df.drop(columns=['datetime']) for anomaly_df in anomaly_df_list]
    # Finally, merge those dataframes
    anomaly_df = pd.concat(anomaly_df_list_without_datetime)
    anomaly_df = anomaly_df.astype(float)
    # Separate out the is_anomaly labels before normalisation/standardisation
    anomaly_df_labels = anomaly_df['is_anomaly']
    anomaly_df = anomaly_df.drop(['is_anomaly'], axis=1)
    # Normalise/standardise the anomaly dataframe
    anomaly_df_values = anomaly_df.values
    anomaly_df_values_scaled = min_max_scaler.transform(anomaly_df_values)
    anomaly_df_scaled = pd.DataFrame(anomaly_df_values_scaled)  # shape = (3635, 9)
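    # Note: transform() (not fit_transform()) is used above, so the anomaly data
    # is scaled with the statistics learned from the normal data only; this keeps
    # the test set from leaking into the scaling step.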
    # Prepare sliding-window datasets for training and testing the autoencoder.
    # Resulting shape: (n_rows - WINDOW_SIZE, WINDOW_SIZE, n_features).
    windows_normal = normal_df_scaled.values[np.arange(config["WINDOW_SIZE"])[None, :] + np.arange(normal_df_scaled.shape[0] - config["WINDOW_SIZE"])[:, None]]
    windows_anomaly = anomaly_df_scaled.values[np.arange(config["WINDOW_SIZE"])[None, :] + np.arange(anomaly_df_scaled.shape[0] - config["WINDOW_SIZE"])[:, None]]
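    # The fancy indexing above builds overlapping windows via broadcasting:
    # np.arange(W)[None, :] + np.arange(N - W)[:, None] is an (N - W, W) index
    # matrix whose row i is [i, i+1, ..., i+W-1]; e.g. for N = 5, W = 3 it is
    # [[0, 1, 2], [1, 2, 3]]. A more explicit (but slower) equivalent sketch:
    #   windows = np.stack([values[i:i + W] for i in range(N - W)])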
    # Create batches of training and testing data
    train_loader = torch.utils.data.DataLoader(
        data_utils.TensorDataset(torch.from_numpy(windows_normal).float()),
        batch_size=config["BATCH_SIZE"], shuffle=False, num_workers=0)
    test_loader = torch.utils.data.DataLoader(
        data_utils.TensorDataset(torch.from_numpy(windows_anomaly).float()),
        batch_size=config["BATCH_SIZE"], shuffle=False, num_workers=0)
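    # Each batch is a one-element tuple holding a tensor of shape
    # (batch_size, WINDOW_SIZE, n_features); the final batch may be smaller.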
    # Initialise the LSTM autoencoder model
    lstm_autoencoder_model = lstm_autoencoder.LstmAutoencoder(
        seq_len=config["WINDOW_SIZE"],
        n_features=windows_normal.shape[2],
        num_layers=config["NUM_LAYERS"])
    # Start training
    lstm_autoencoder.training(conf.N_EPOCHS, lstm_autoencoder_model, train_loader, config["LEARNING_RATE"])
    # Save the model, then load it back
    model_path = const.MODEL_LOCATION
    model_name = join(model_path, "lstm_ae_model_{}.pth".format(trial_number))
    torch.save({
        'encoder': lstm_autoencoder_model.encoder.state_dict(),
        'decoder': lstm_autoencoder_model.decoder.state_dict()
    }, model_name)
    checkpoint = torch.load(model_name)
    lstm_autoencoder_model.encoder.load_state_dict(checkpoint['encoder'])
    lstm_autoencoder_model.decoder.load_state_dict(checkpoint['decoder'])
    # Use the trained model to obtain predictions (reconstruction errors) for the test set
    results = lstm_autoencoder.testing(lstm_autoencoder_model, test_loader)
    y_pred_for_test_set = np.concatenate([
        torch.stack(results[:-1]).flatten().detach().cpu().numpy(),
        results[-1].flatten().detach().cpu().numpy()
    ])
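    # The last batch is flattened separately, presumably because it may hold
    # fewer windows than the rest and torch.stack() requires equal shapes.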
    # Obtain the detection threshold as a percentile of the mean squared errors
    # (81.61 appears tuned to the expected share of anomalous windows)
    threshold = np.percentile(y_pred_for_test_set, [81.61])[0]  # 81.61st percentile
    # Map the errors to anomaly labels by applying the threshold
    predicted_labels = [1 if val > threshold else 0 for val in y_pred_for_test_set]
    # Evaluate the predicted labels against the actual labels; the first
    # WINDOW_SIZE labels are dropped so that the label count matches the
    # number of windows.
    test_eval = Evaluation(anomaly_df_labels[config["WINDOW_SIZE"]:], predicted_labels)
    test_eval.print()
    return test_eval.auc

def objective(trial):
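    """Optuna objective: sample hyperparameters, run one training/evaluation
    cycle, and return the resulting AUC (maximised by the study)."""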
    params = dict()
    params["WINDOW_SIZE"] = trial.suggest_int("WINDOW_SIZE", 6, 100)
    params["NUM_LAYERS"] = trial.suggest_int("NUM_LAYERS", 2, 4)
    params["BATCH_SIZE"] = trial.suggest_int("BATCH_SIZE", 20, 1000)
    params["LEARNING_RATE"] = trial.suggest_float("LEARNING_RATE", 1e-5, 1e-1, log=True)
    print(f"Initiating Run {trial.number} with params: {trial.params}")
    auc = main(params, trial.number)
    return auc

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--optuna-db", type=str, help="Path to the Optuna database file",
                        default="sqlite:///optuna.db")
    parser.add_argument("-n", "--optuna-study-name", type=str, help="Name of the Optuna study",
                        default="duneesha_lstm_autoencoder_run_2")
    args = parser.parse_args()

    # Wait a random amount of time to avoid overlapping run IDs when running parallel workers
    wait_time = random.randint(0, 10) * 3
    print(f"Waiting for {wait_time} seconds before starting")
    time.sleep(wait_time)

    study = optuna.create_study(direction="maximize",
                                study_name=args.optuna_study_name,
                                storage=args.optuna_db,
                                load_if_exists=True,
                                )
    study.optimize(objective, n_trials=1)  # When running locally, set n_trials to the number of trials required

    # Print the best trial's parameters
    best_trial = study.best_trial
    print(best_trial.params)

    plots = plot_optuna_default_graphs(study)
    combine_plotly_figs_to_html(plotly_figs=plots, html_fname="optimization_trial_plots/lstm_autoencoder_hpo.html")
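
# Example invocation (the study name below is illustrative):
#   python lstm_autoencoder_main.py -o sqlite:///optuna.db -n my_lstm_ae_study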