-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathfilter.py
86 lines (74 loc) · 2.91 KB
/
filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import numpy as np
import pickle
import os
import random
def find_key(input_dict, value):
    """Return the set of keys in `input_dict` whose value equals `value`.

    An empty set is returned when no entry matches.
    """
    matching_keys = set()
    for key, candidate in input_dict.items():
        if candidate == value:
            matching_keys.add(key)
    return matching_keys
def _count_packets_in_window(path, interval):
    """Count the lines of `path` whose leading field lies in `interval`.

    Each line is split on tabs and its first field is parsed as a float.
    Lines below ``interval[0]`` are skipped, and reading stops at the first
    line above ``interval[1]``; both bounds are inclusive.
    NOTE(review): the early stop is only correct if the lines are ordered by
    that leading field (presumably a timestamp) -- confirm against the data.
    """
    start, end = interval[0], interval[1]
    count = 0
    with open(path) as f:
        # Stream the file so nothing past the end of the window is read.
        for line in f:
            timestamp = float(line.split('\t')[0])
            if timestamp > end:
                break
            if timestamp < start:
                continue
            count += 1
    return count


def parse_csv(csv_path, interval, final_names, threshold):
    """Tally the flows that have enough packets on both ends in one window.

    Every file name found in ``csv_path + 'inflow'`` is looked up under the
    same name in ``csv_path + 'outflow'``. When both files hold strictly more
    than `threshold` lines inside `interval`, the entry for that file name in
    `final_names` is incremented (created at 1 if absent).

    Args:
        csv_path: Dataset root. The directory names are appended by plain
            string concatenation, so it must end with a path separator.
        interval: Two-element sequence ``[start, end]`` compared against the
            first tab-separated field of each line.
        final_names: Dict mapping file name to a hit count; mutated in place.
        threshold: A flow counts only when both sides exceed this line count.

    Returns:
        None. The result is delivered through `final_names`.

    Raises:
        FileNotFoundError: If a directory, or the outflow counterpart of an
            inflow file, does not exist.
        ValueError: If the first field of a line is not a number.
    """
    HERE_PATH = csv_path + 'inflow'
    THERE_PATH = csv_path + 'outflow'
    print(HERE_PATH, THERE_PATH, interval)

    for file_name in os.listdir(HERE_PATH):
        # Both sides are always read, so a missing outflow file still raises
        # even when the inflow side alone would already reject the flow.
        here_count = _count_packets_in_window(HERE_PATH + '/' + file_name, interval)
        there_count = _count_packets_in_window(THERE_PATH + '/' + file_name, interval)
        if here_count > threshold and there_count > threshold:
            final_names[file_name] = final_names.get(file_name, 0) + 1

    # Progress dump of the running tallies after this window.
    for x in final_names:
        print(x, final_names[x])
def create_overlap_window_csv(csv_path, out_path, threshold, interval, num_windows, addnum):
    """Write the names of the flows that pass the threshold in every window.

    `parse_csv` is run once per window; window ``win`` covers
    ``[win * addnum, win * addnum + interval]``. A file name is written to
    `out_path` (one per line) only when its tally equals `num_windows`,
    i.e. it passed in all of them.

    Args:
        csv_path: Dataset root handed to `parse_csv`.
        out_path: Text file to create or overwrite with the surviving names.
        threshold: Per-window minimum handed to `parse_csv`.
        interval: Length of each window.
        num_windows: Number of windows to evaluate.
        addnum: Offset between the starts of consecutive windows.

    Side effects:
        Rebinds the module-level `final_names` dict with the per-file tallies.
    """
    # Kept as a module-level global so code that reads `final_names` after
    # the call keeps working.
    global final_names
    final_names = {}
    # The context manager closes the file even if parse_csv raises midway.
    with open(out_path, 'w+') as fw:
        for win in range(num_windows):
            window_start = win * addnum
            parse_csv(csv_path, [window_start, window_start + interval], final_names, threshold)
        # Sorted so the output does not depend on set iteration order.
        for name in sorted(find_key(final_names, num_windows)):
            fw.write(name)
            fw.write('\n')
# Dataset root. It must end with '/', because the 'inflow' and 'outflow'
# directory names are appended to it by plain string concatenation.
data_path = '/data/website-fingerprinting/datasets/CrawlE_Proc_100/'
# Text file that receives the names of the flows that survive the filter.
out_file_path = '/data/seoh/CrawlE_Proc_100_files.txt'
# Minimum number of packets per window on both ends (30 was used for 500).
# A flow pair is dropped when either side fails to exceed this count.
threshold = 10
# Eleven windows of length 5, each starting 2 after the previous one.
create_overlap_window_csv(
    csv_path=data_path,
    out_path=out_file_path,
    threshold=threshold,
    interval=5,
    num_windows=11,
    addnum=2,
)