-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsv_processor(MAJED).py
259 lines (206 loc) · 11.3 KB
/
csv_processor(MAJED).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
import os
import pandas as pd
import matplotlib
import PARAMETERS
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import scipy.sparse as sp
import scipy.spatial
import numpy as np
import sklearn.decomposition
from datetime import datetime
import utils as utils
# GLOBALS
# Column headers of the input CSV. CsvProcessor passes them to
# pandas.read_csv(usecols=...) and uses them to address DataFrame columns.

# Endpoint columns. One (source, destination) pair is chosen as the "idx" of a
# CsvProcessor; the script at the bottom of this file builds one processor on
# the address pair and one on the port pair.
H_IP_SRC = 'IPV4_SRC_ADDR'
H_IP_DST = 'IPV4_DST_ADDR'
H_PORT_SRC = 'L4_SRC_PORT'
H_PORT_DST = 'L4_DST_PORT'
# Timestamp column used for time binning (treated as integer seconds by
# CsvProcessor.bin_by_resolution -- presumably POSIX time; confirm against the
# data source).
H_LAST_SWITCHED = 'LAST_SWITCHED'
# Per-row numeric attributes (default feature set of CsvProcessor).
H_IN_PKTS = 'IN_PKTS'
H_OUT_PKTS = 'OUT_PKTS'
H_IN_BYTES = 'IN_BYTES'
H_OUT_BYTES = 'OUT_BYTES'
H_IAT_AVG_SD = 'SRC_TO_DST_IAT_AVG'
H_IAT_STDDEV_SD = 'SRC_TO_DST_IAT_STDDEV'
H_IAT_AVG_DS = 'DST_TO_SRC_IAT_AVG'
H_IAT_STDDEV_DS = 'DST_TO_SRC_IAT_STDDEV'
# Name of the column that CsvProcessor.bin_by_resolution adds to hold the
# time-bin index of each row (it is not a column of the input CSV).
H_BIN = 'BINNED'
class CsvProcessor(object):
    """Turn a CSV of flow records into time-binned sparse feature matrices.

    The CSV is loaded into ``self.df``. Rows are grouped by
    (time bin, source, destination) and the numeric attributes are summed per
    group. Each time bin becomes one row of a sparse matrix with
    ``num_nodes ** 2 * len(attributes)`` columns, one column per
    (source, destination, attribute) triple.

    Attributes set by ``__init__``:
        fname: path of the CSV the processor was created from.
        idx: ``[source_column, destination_column]``.
        attributes: numeric columns used as features.
        df: the loaded data.
        num_nodes: max of the distinct-value counts of the two idx columns.
        start_time, end_time: min / max of the ``LAST_SWITCHED`` column.
    """

    def __init__(self, fname, attributes=None, idx=None):
        """Load ``fname`` and record its time range and node count.

        Args:
            fname: path to the CSV file.
            attributes: numeric column names to use as features. Defaults to
                the packet, byte and inter-arrival-time columns.
            idx: ``[source_column, destination_column]``. Defaults to the
                IPv4 address columns.
        """
        # None sentinels instead of list defaults: a list default is created
        # once and shared by every instance.
        if attributes is None:
            attributes = [H_IN_PKTS, H_OUT_PKTS, H_IN_BYTES, H_OUT_BYTES,
                          H_IAT_AVG_SD, H_IAT_AVG_DS,
                          H_IAT_STDDEV_DS, H_IAT_STDDEV_SD]
        if idx is None:
            idx = [H_IP_SRC, H_IP_DST]
        self.fname = fname
        self.idx = list(idx)
        self.attributes = list(attributes)
        # READ THE FILE INTO A PANDAS DATAFRAME
        self.df = pd.read_csv(
            fname, usecols=[H_LAST_SWITCHED] + self.attributes + self.idx)
        self.num_nodes = self._count_nodes()
        self.start_time = self.df[H_LAST_SWITCHED].min()
        self.end_time = self.df[H_LAST_SWITCHED].max()
        # The log(x + 1) transform is intentionally not applied here; call
        # log_transform_attributes() explicitly if it is wanted.

    def _count_nodes(self):
        """Return the larger distinct-value count of the two idx columns."""
        return max(self.df[self.idx[0]].nunique(),
                   self.df[self.idx[1]].nunique())

    def log_transform_attributes(self):
        """Replace the packet/byte count columns of ``self.df`` by log(x + 1).

        Only the four count columns are transformed; any other attribute is
        left untouched.
        """
        count_columns = (H_IN_PKTS, H_OUT_PKTS, H_IN_BYTES, H_OUT_BYTES)
        for attr in self.attributes:
            if attr in count_columns:
                self.df[attr] = self.log_transform_feature(self.df[attr])

    def log_transform_feature(self, data, small_constant=1):
        """Return ``log(data + small_constant)`` as a numpy array."""
        return np.log(np.asarray(data) + small_constant)

    def bin_by_resolution(self, res):
        """Aggregate ``self.df`` into time bins of width ``res``.

        Args:
            res: bin width, in the units of the ``LAST_SWITCHED`` column.

        Returns:
            A DataFrame with one row per (bin, source, destination) group.
            Its first three columns are the integer bin index and the integer
            category codes of the two idx columns; the remaining columns are
            the per-group sums of the other columns.
        """
        binned_df = self.df.copy()
        # Bin edges start one step before start_time because pandas.cut uses
        # right-closed intervals: start_time itself falls into bin 0.
        bins = list(range(self.start_time - res, self.end_time + res, res))
        labels = list(range(len(bins) - 1))
        binned_df[H_BIN] = pd.cut(self.df[H_LAST_SWITCHED], bins=bins,
                                  labels=labels)
        # The raw timestamps are no longer needed once rows are binned.
        binned_df = binned_df.drop(columns=[H_LAST_SWITCHED])
        # Encode the endpoints as integer category codes. Source and
        # destination are encoded independently of each other.
        for col in self.idx:
            binned_df[col] = binned_df[col].astype('category')
        # The bin column is categorical too; converting every categorical
        # column to its codes makes the bin index a plain integer.
        for col in binned_df.select_dtypes(['category']).columns:
            binned_df[col] = binned_df[col].cat.codes
        grouped = binned_df.groupby([H_BIN, self.idx[0], self.idx[1]]).sum()
        return grouped.reset_index()

    def add_new_data(self, fname, old_sparse, res_small, res_big):
        """Append the CSV ``fname`` to the stored data and extend the matrix.

        Args:
            fname: path of the CSV with the additional rows.
            old_sparse: sparse matrix previously returned for the old data.
            res_small: bin width, forwarded to ``get_sparse_array``.
            res_big: forwarded to ``get_sparse_array``.

        Returns:
            The sparse matrix covering the complete history, including the
            new data.
        """
        nodes_before = self.num_nodes
        new_df = pd.read_csv(
            fname, usecols=[H_LAST_SWITCHED] + self.attributes + self.idx)
        # ignore_index avoids duplicated index labels in the merged frame.
        self.df = pd.concat([self.df, new_df], ignore_index=True)

        # Update start and end times, number of nodes
        self.end_time = max(self.end_time, new_df[H_LAST_SWITCHED].max())
        self.start_time = min(self.start_time, new_df[H_LAST_SWITCHED].min())
        self.num_nodes = self._count_nodes()

        # If new nodes have been added to the graph, the old matrix needs
        # extra (empty) columns so that it can be stacked with the new rows.
        # NOTE(review): the existing columns are not re-mapped, although the
        # column index depends on num_nodes and on the category codes, both
        # of which can change when data is added -- confirm this is intended.
        if self.num_nodes != nodes_before:
            extra_columns = ((self.num_nodes ** 2 - nodes_before ** 2)
                             * len(self.attributes))
            expansion = sp.csr_matrix((old_sparse.shape[0], extra_columns))
            old_sparse = sp.hstack((old_sparse, expansion))

        return self.get_sparse_array(res_small, res_big, old_array=old_sparse)

    def plot_by_resolution(self, res, name=''):
        """Save one scatter plot of (source, destination) codes per time bin.

        Files are written to ``name + str(bin_index) + '.png'``. The bin with
        the highest index is not plotted.
        """
        from datetime import timezone  # local: not imported at module level

        binned = self.bin_by_resolution(res)
        duration = int(binned[H_BIN].max())
        for t in range(duration):
            snapshot = binned[binned[H_BIN] == t]
            snapshot = snapshot.values[:, 1:]  # Ignore the time column
            posix_time = int(self.start_time + t * res)
            plt.plot(snapshot[:, 0], snapshot[:, 1], 's')  # src and dst only
            plt.xlim([0, self.num_nodes])
            plt.ylim([0, self.num_nodes])
            # Timezone-aware replacement for the deprecated utcfromtimestamp.
            stamp = datetime.fromtimestamp(posix_time, tz=timezone.utc)
            plt.title(stamp.strftime('%Y-%m-%d %H:%M:%S'))
            plt.gca().set_aspect('equal', adjustable='box')
            plt.savefig(name + str(t) + '.png')
            plt.close()

    def get_sparse_array(self, res_small, res_big, old_array=None):
        """Build the (time bin x feature) sparse matrix.

        Args:
            res_small: bin width passed to ``bin_by_resolution``.
            res_big: only used by the (currently disabled) on-disk cache name.
            old_array: optional matrix of already-processed bins. When given,
                only bins from ``old_array.shape[0]`` onwards are computed
                and the result is stacked below ``old_array``.

        Returns:
            A scipy sparse matrix with ``num_nodes ** 2 * len(attributes)``
            columns and one row per time bin. The bin with the highest index
            is excluded, so it is produced only once a later bin exists.
        """
        start_from = 0 if old_array is None else old_array.shape[0]

        binned = self.bin_by_resolution(res_small)
        duration = int(binned[H_BIN].max())
        n_columns = self.num_nodes ** 2 * len(self.attributes)
        data = sp.lil_matrix((duration - start_from, n_columns))

        # Select the columns by name so that every value lands in the slot of
        # its own attribute, in self.attributes order, regardless of the
        # column order of the CSV.
        columns = [H_BIN, self.idx[0], self.idx[1]] + list(self.attributes)
        in_window = binned[H_BIN].between(start_from, duration - 1)
        records = binned.loc[in_window, columns]
        for record in records.itertuples(index=False, name=None):
            # Cast to int: the frame may hold these as floats, which are not
            # valid sparse-matrix indices.
            t, src, dst = int(record[0]), int(record[1]), int(record[2])
            for attribute, value in enumerate(record[3:]):
                col = self.src_dst_att_to_idx(src, dst, attribute)
                data[t - start_from, col] = value

        data = sp.csr_matrix(data)
        if old_array is not None:
            data = sp.vstack((old_array, data))
        return data

    def _gen_save_fname(self, res_small, res_big):
        """Return the ``.npz`` cache file name for the given resolutions."""
        # splitext instead of a fixed [:-4] slice, so extensions of any
        # length are stripped correctly.
        stem = os.path.splitext(self.fname)[0]
        return '_'.join(
            [stem, self.idx[0], str(res_small), str(res_big)]) + '.npz'

    def _load_npz_if_exists(self, res_small, res_big):
        """Return the cached sparse matrix, or None if it cannot be loaded."""
        fname = self._gen_save_fname(res_small, res_big)
        try:
            return sp.load_npz(fname)
        except Exception:
            # Best-effort cache: a missing or unreadable file is a cache miss.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return None

    def src_dst_att_to_idx(self, src, dst, a):
        """
        Convert 3 dimensional index (source, destination, attribute) to 1
        dimension.

        Every (src, dst, a) triple with ``0 <= src, dst < num_nodes`` and
        ``0 <= a < len(attributes)`` maps to a distinct column in
        ``range(num_nodes ** 2 * len(attributes))``.
        """
        return (self.num_nodes * src + dst) * len(self.attributes) + a
if __name__ == '__main__':
    # Script: build the per-time-bin feature matrices for the address pair and
    # for the port pair of one CSV, smooth and scale them, project each (and
    # their concatenation) to two dimensions with kernel PCA, and plot the
    # result through utils.plot_transformed_eachwindow.

    # Imported explicitly: sklearn.preprocessing.scale is called below, but
    # the module header only imports sklearn.decomposition.
    import sklearn.preprocessing

    # FNAME = '/afm01/Q1/Q1002/comscentre/CSV_files/customer_cbcp_1712_1713_1month.csv'
    # FNAME = '/afm01/Q1/Q1002/comscentre/attack_files/attack_injected_csvs/customer_ipsw_742_743_1month_IPs_76092_[Port_Scan_Nmap_sS]_injected_D10H5.csv'
    # FNAME = '/afm01/Q1/Q1002/comscentre/CSV_files/customer_pecv_773_1month_first.csv'
    # FNAME2 = '/afm01/Q1/Q1002/comscentre/CSV_files/customer_pecv_773_1month_last.csv'
    FNAME = PARAMETERS.UNSW_Niloo
    # FNAME2 = PARAMETERS.REAL2
    RES_SMALL = 3600  # 60 * 60; temporal resolution of averaging (in seconds)
    RES_BIG = 10  # Number of temporal resolution units per timestep
    consistient_scale = True
    Heading = f'IAT_BoT_4June_{RES_SMALL}'
    if not consistient_scale:
        Heading += '_NoScale'

    csv_processor_ip = CsvProcessor(FNAME, idx=[H_IP_SRC, H_IP_DST])
    # csv_processor_ip.plot_by_resolution(RES_SMALL*RES_BIG, name='ip')
    data_ip = csv_processor_ip.get_sparse_array(RES_SMALL, RES_BIG)

    # Same process with ports instead of IPs
    csv_processor_port = CsvProcessor(FNAME, idx=[H_PORT_SRC, H_PORT_DST])
    # csv_processor_port.plot_by_resolution(RES_SMALL*RES_BIG, name='port')
    data_port = csv_processor_port.get_sparse_array(RES_SMALL, RES_BIG)

    # Uncomment these to add extra data (the first file becomes the history):
    # data_ip = csv_processor_ip.add_new_data(FNAME2, data_ip, RES_SMALL, RES_BIG)
    # data_port = csv_processor_port.add_new_data(FNAME2, data_port, RES_SMALL, RES_BIG)

    # Feature counts of the full matrices, taken before any column is
    # dropped; they set the RBF kernel width (gamma) below.
    n_features_ip = \
        csv_processor_ip.num_nodes ** 2 * len(csv_processor_ip.attributes)
    n_features_port = \
        csv_processor_port.num_nodes ** 2 * len(csv_processor_port.attributes)

    data_ip = utils.drop_sparse_cols(data_ip)  # Drop zeros
    data_ip = utils.moving_average(data_ip, RES_BIG)
    data_port = utils.drop_sparse_cols(data_port)
    data_port = utils.moving_average(data_port, RES_BIG)

    # with_mean=False because centering is not supported for sparse input.
    data_ip = sklearn.preprocessing.scale(data_ip, with_mean=False)
    data_port = sklearn.preprocessing.scale(data_port, with_mean=False)
    data_all = sp.hstack((data_ip, data_port))

    output_dir = rf"D:\Frames\LIAM\{Heading}"

    # Execute KPCA and plot each step: (sub-directory, data, gamma denominator).
    runs = [
        ('ip', data_ip, n_features_ip, csv_processor_ip.start_time),
        ('port', data_port, n_features_port, csv_processor_port.start_time),
        ('all', data_all, n_features_ip + n_features_port,
         csv_processor_ip.start_time),
    ]
    for subdir, matrix, n_features, start_time in runs:
        kpca = sklearn.decomposition.KernelPCA(
            n_components=2, kernel='rbf', gamma=1 / n_features)
        transformed = kpca.fit_transform(matrix)
        utils.plot_transformed_eachwindow(
            transformed, start_time, RES_SMALL,
            os.path.join(output_dir, subdir), consistient_scale)