# data_generator.py (forked from Minqi824/ADBench)
import numpy as np
import pandas as pd
import random
import scipy.io
import os
import mat73
from math import ceil
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from itertools import combinations
from sklearn.mixture import GaussianMixture
from copulas.multivariate import VineCopula
from copulas.univariate import GaussianKDE
from myutils import Utils
# to do: leverage the categorical features in the current datasets
# currently, the data generator only supports generating binary classification datasets
class DataGenerator():
def __init__(self, seed:int=42, dataset:str=None, test_size:float=0.3,
generate_duplicates=False, n_samples_threshold=1000, show_statistic=False):
'''
Only global parameters should be provided in the DataGenerator instantiation
seed: seed for reproducible experimental results
dataset: dataset name
test_size: proportion of the data used as the testing set
'''
self.seed = seed
self.dataset = dataset
self.test_size = test_size
# whether to generate duplicated samples when the dataset size is insufficient
self.generate_duplicates = generate_duplicates
self.n_samples_threshold = n_samples_threshold
# whether to show the statistics
self.show_statistic = show_statistic
# myutils function
self.utils = Utils()
'''
By default, the generator returns real-world datasets.
Realistic synthetic data can also be generated, following the paper
"Benchmarking Unsupervised Outlier Detection with Realistic Synthetic Data".
realistic_synthetic_mode can be local, cluster, dependency, or global, as in the original paper;
alpha and percentage are the two parameters defined in that paper.
'''
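# A minimal usage sketch of the synthetic-outlier path (a non-authoritative example: it assumes
# X and y are numpy arrays already loaded by the caller, and 'cardio' plus the parameter values
# are purely illustrative):
#   dg = DataGenerator(seed=42, dataset='cardio')
#   X_syn, y_syn = dg.generate_realistic_synthetic(X, y, realistic_synthetic_mode='local', alpha=5, percentage=0.1)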
def generate_realistic_synthetic(self, X, y, realistic_synthetic_mode, alpha:int, percentage:float):
'''
Currently, four types of realistic synthetic outliers can be generated:
1. local outliers: normal data follows a GMM distribution, and anomalies follow the same GMM with scaled-up covariances
2. cluster outliers: normal data follows a GMM distribution, and anomalies follow the same GMM with scaled means
3. dependency outliers: normal data follows a vine copula distribution, and anomalies follow independent per-feature distributions captured by GaussianKDE
4. global outliers: normal data follows a GMM distribution, and anomalies follow a uniform distribution
'''
if realistic_synthetic_mode in ['local', 'cluster', 'dependency', 'global']:
pass
else:
raise NotImplementedError
# the number of normal data and anomalies
pts_n = len(np.where(y == 0)[0])
pts_a = len(np.where(y == 1)[0])
# only use the normal data to fit the model
X = X[y == 0]
y = y[y == 0]
# generate the synthetic normal data
if realistic_synthetic_mode in ['local', 'cluster', 'global']:
# select the best n_components based on the BIC value
metric_list = []
n_components_list = list(np.arange(1, 10))
for n_components in n_components_list:
gm = GaussianMixture(n_components=n_components, random_state=self.seed).fit(X)
metric_list.append(gm.bic(X))
best_n_components = n_components_list[np.argmin(metric_list)]
# refit based on the best n_components
gm = GaussianMixture(n_components=best_n_components, random_state=self.seed).fit(X)
# generate the synthetic normal data
X_synthetic_normal = gm.sample(pts_n)[0]
# we found that the copula function may raise errors on some datasets
elif realistic_synthetic_mode == 'dependency':
# subsample the features since the copula method may take too long to fit
if X.shape[1] > 50:
idx = np.random.choice(np.arange(X.shape[1]), 50, replace=False)
X = X[:, idx]
copula = VineCopula('center') # default is the C-vine copula
copula.fit(pd.DataFrame(X))
# sample to generate synthetic normal data
X_synthetic_normal = copula.sample(pts_n).values
else:
pass
# generate the synthetic abnormal data
if realistic_synthetic_mode == 'local':
# generate the synthetic anomalies (local outliers)
gm.covariances_ = alpha * gm.covariances_
X_synthetic_anomalies = gm.sample(pts_a)[0]
elif realistic_synthetic_mode == 'cluster':
# generate the clustering synthetic anomalies
gm.means_ = alpha * gm.means_
X_synthetic_anomalies = gm.sample(pts_a)[0]
elif realistic_synthetic_mode == 'dependency':
X_synthetic_anomalies = np.zeros((pts_a, X.shape[1]))
# use GaussianKDE to generate each feature independently
for i in range(X.shape[1]):
kde = GaussianKDE()
kde.fit(X[:, i])
X_synthetic_anomalies[:, i] = kde.sample(pts_a)
elif realistic_synthetic_mode == 'global':
# generate the synthetic anomalies (global outliers)
X_synthetic_anomalies = []
for i in range(X_synthetic_normal.shape[1]):
low = np.min(X_synthetic_normal[:, i]) * (1 + percentage)
high = np.max(X_synthetic_normal[:, i]) * (1 + percentage)
X_synthetic_anomalies.append(np.random.uniform(low=low, high=high, size=pts_a))
X_synthetic_anomalies = np.array(X_synthetic_anomalies).T
else:
pass
X = np.concatenate((X_synthetic_normal, X_synthetic_anomalies), axis=0)
y = np.append(np.repeat(0, X_synthetic_normal.shape[0]),
np.repeat(1, X_synthetic_anomalies.shape[0]))
return X, y
'''
Here we also consider the robustness of baseline models, where four types of noise can be added:
1. Duplicated anomalies, which are added to the training and testing sets separately
2. Irrelevant features, which are added to both the training and testing sets
3. Anomaly contamination, which is only added to the training set
4. Label contamination, which is only added to the training set
'''
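# A hedged sketch of how these noise types are typically requested through generator() below
# ('cardio' and the parameter values are illustrative assumptions, not prescriptions):
#   data = DataGenerator(dataset='cardio').generator(la=0.1, noise_type='duplicated_anomalies', duplicate_times=2)
#   data = DataGenerator(dataset='cardio').generator(la=0.1, noise_type='irrelevant_features', noise_ratio=0.1)
#   data = DataGenerator(dataset='cardio').generator(la=0.1, noise_type='label_contamination', noise_ratio=0.05)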
def add_duplicated_anomalies(self, X, y, duplicate_times:int):
if duplicate_times <= 1:
pass
else:
# index of normal and anomaly data
idx_n = np.where(y==0)[0]
idx_a = np.where(y==1)[0]
# # generate duplicated anomalies
# pts_a = len(idx_a)
# idx_a = np.random.choice(idx_a, ceil(pts_a / duplicate_times))
# idx_a = np.random.choice(idx_a, pts_a)
#
# idx = np.append(idx_n, idx_a); random.shuffle(idx)
# X = X[idx]; y = y[idx]
# generate duplicated anomalies
idx_a = np.random.choice(idx_a, int(len(idx_a) * duplicate_times))
idx = np.append(idx_n, idx_a); random.shuffle(idx)
X = X[idx]; y = y[idx]
return X, y
def add_irrelevant_features(self, X, y, noise_ratio:float):
# adding uniform noise
if noise_ratio == 0.0:
pass
else:
noise_dim = int(noise_ratio / (1 - noise_ratio) * X.shape[1])
if noise_dim > 0:
X_noise = []
for i in range(noise_dim):
idx = np.random.choice(np.arange(X.shape[1]), 1)
X_min = np.min(X[:, idx])
X_max = np.max(X[:, idx])
X_noise.append(np.random.uniform(X_min, X_max, size=(X.shape[0], 1)))
# concat the irrelevant noise feature
X_noise = np.hstack(X_noise)
X = np.concatenate((X, X_noise), axis=1)
# shuffle the dimension
idx = np.random.choice(np.arange(X.shape[1]), X.shape[1], replace=False)
X = X[:, idx]
return X, y
def remove_anomaly_contamination(self, idx_unlabeled_anomaly, contam_ratio:float):
idx_unlabeled_anomaly = np.random.choice(idx_unlabeled_anomaly, int(contam_ratio * len(idx_unlabeled_anomaly)), replace=False)
return idx_unlabeled_anomaly
def add_label_contamination(self, X, y, noise_ratio:float):
if noise_ratio == 0.0:
pass
else:
# # here we mainly consider the situation where labeled anomalies have contamination, i.e., unlabeled normal data
# idx_n = np.where(y==0)[0]
# idx_a = np.where(y==1)[0]
#
# noise_num = int(noise_ratio / (1 - noise_ratio) * len(idx_a))
# # here replace is set to True because in some datasets anomalies outnumber normal data
# idx_n_noise = np.random.choice(idx_n, noise_num, replace=True)
# y[idx_n_noise] = 1
# here we consider the label-flip situation: a label is randomly flipped to the other class with probability p (i.e., the noise ratio)
idx_flips = np.random.choice(np.arange(len(y)), int(len(y) * noise_ratio), replace=False)
y[idx_flips] = 1 - y[idx_flips] # change 0 to 1 and 1 to 0
return X, y
def generator(self, la=None, at_least_one_labeled=False,
realistic_synthetic_mode=None, alpha:int=5, percentage:float=0.1,
noise_type=None, duplicate_times:int=2, contam_ratio=1.00, noise_ratio:float=0.05):
'''
la: labeled anomalies, either the ratio or the number of labeled anomalies
at_least_one_labeled: whether to guarantee at least one labeled anomaly in the training set
'''
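# Worked example of the la semantics (illustrative numbers): with 100 anomalies in the training split,
# la=0.1 labels int(0.1 * 100) = 10 of them, while la=10 labels exactly 10 anomalies;
# at_least_one_labeled=True uses ceil() instead of int(), so a small ratio never rounds down to zero.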
# set seed for reproducible results
self.utils.set_seed(self.seed)
# transfer different file format to the numpy array
if self.dataset in ['annthyroid', 'cardio', 'mammography', 'musk', 'optdigits', 'pendigits',
'satimage-2', 'speech', 'thyroid', 'vowels', 'cover', 'http', 'letter',
'mnist', 'satellite', 'shuttle', 'smtp', 'breastw', 'vertebral',
'wine']:
if self.dataset in ['http', 'smtp']:
data = mat73.loadmat(os.path.join('datasets', self.dataset + '.mat'))
else:
data = scipy.io.loadmat(os.path.join('datasets', self.dataset + '.mat'))
X = data['X']
y = data['y'].squeeze().astype('int64')
elif self.dataset in ['Waveform_withoutdupl_v10', 'InternetAds_withoutdupl_norm_19', 'PageBlocks_withoutdupl_09',
'SpamBase_withoutdupl_40', 'Wilt_withoutdupl_05', 'Cardiotocography_withoutdupl_22',
'WBC_withoutdupl_v10', 'WDBC_withoutdupl_v10', 'WPBC_withoutdupl_norm',
'Arrhythmia_withoutdupl_46', 'HeartDisease_withoutdupl_44', 'Hepatitis_withoutdupl_16',
'Parkinson_withoutdupl_75', 'Pima_withoutdupl_35', 'Stamps_withoutdupl_09']:
data = pd.read_csv(os.path.join('datasets', self.dataset + '.csv'))
data.columns = [_.split("'")[1] for _ in data.columns]
X = data.drop(['outlier', 'id'], axis=1).values
y = [_.split("'")[1] for _ in data['outlier'].values]
y = np.array([0 if _ == 'no' else 1 for _ in y])
elif self.dataset in ['ALOI_withoutdupl', 'glass_withoutduplicates_normalized',
'Ionosphere_withoutdupl_norm', 'Lymphography_withoutdupl_idf']:
data = pd.read_csv(os.path.join('datasets', self.dataset + '.csv'))
X = data.drop(['outlier','id'], axis=1).values
y = np.array([0 if _ == 'no' else 1 for _ in data['outlier'].values])
elif self.dataset in ['abalone.diff', 'comm.and.crime.diff', 'concrete.diff', 'fault.diff', 'imgseg.diff',
'landsat.diff', 'magic.gamma.diff', 'skin.diff', 'yeast.diff']:
data = pd.read_csv(os.path.join('datasets', self.dataset + '.csv'))
X = data.drop(['point.id', 'motherset', 'origin', 'original.label', 'diff.score', 'ground.truth'],
axis=1).values
y = np.array([0 if _ == 'nominal' else 1 for _ in data['ground.truth'].values])
# Credit Card Fraud Detection (CCFD) dataset
elif self.dataset == 'CCFD':
data = pd.read_csv(os.path.join('datasets', self.dataset + '.csv'))
X = data.drop(['Time', 'Class'], axis=1)
y = data['Class'].values
# Taiwan Bankruptcy Prediction (TBP) dataset
elif self.dataset == 'TBP':
data = pd.read_csv(os.path.join('datasets', self.dataset + '.csv'))
X = data.drop(['Flag'], axis=1)
y = data['Flag'].values
elif self.dataset in ['amazon', 'yelp', 'imdb'] + \
['agnews_' + str(i) for i in range(4)] +\
['FashionMNIST_' + str(i) for i in range(10)] +\
['CIFAR10_' + str(i) for i in range(10)] +\
['SVHN_' + str(i) for i in range(10)]:
data = np.load(os.path.join('datasets', 'NLPCV', self.dataset + '.npz'))
X = data['X']
y = data['y']
else:
raise NotImplementedError
# to array
X = np.array(X)
y = np.array(y)
# number of labeled anomalies in the original data
if type(la) == float:
if at_least_one_labeled:
n_labeled_anomalies = ceil(sum(y) * (1 - self.test_size) * la)
else:
n_labeled_anomalies = int(sum(y) * (1 - self.test_size) * la)
elif type(la) == int:
n_labeled_anomalies = la
else:
raise NotImplementedError
################################
# if the anomaly ratio of the dataset is too high (over 5%), subsample the anomalies
# if (sum(y) / len(y)) > 0.05:
# print(f'Anomaly ratio of dataset {self.dataset} is too high, subsampling...')
# self.utils.set_seed(self.seed)
# idx_n = np.where(y==0)[0]
# idx_a = np.where(y==1)[0]
# idx_a = np.random.choice(idx_a, int(0.05 * len(idx_n) / 0.95), replace=False)
#
# idx = np.append(idx_n, idx_a)
# random.shuffle(idx)
#
# X = X[idx]
# y = y[idx]
# if the dataset is too small, generate duplicate samples:
if len(y) < self.n_samples_threshold and self.generate_duplicates:
print(f'Generating duplicate samples for dataset {self.dataset}...')
self.utils.set_seed(self.seed)
idx_duplicate = np.random.choice(np.arange(len(y)), self.n_samples_threshold, replace=True)
X = X[idx_duplicate]
y = y[idx_duplicate]
# if the dataset is too large, subsample the whole dataset
if len(y) > 10000:
print(f'Dataset {self.dataset} is too large, subsampling...')
self.utils.set_seed(self.seed)
idx_sample = np.random.choice(np.arange(len(y)), 10000, replace=False)
X = X[idx_sample]
y = y[idx_sample]
################################
# whether to generate realistic synthetic outliers
if realistic_synthetic_mode is not None:
# since generating dependency outliers takes too long, pre-generated samples are loaded here (datasets with more than 3000 samples were subsampled to 3000)
if realistic_synthetic_mode == 'dependency':
print('Using pre-generated dependency outliers...')
dataset_dict = np.load(os.path.join('datasets', 'Dependency_outlier', 'dependency_outlier_large.npz'), allow_pickle=True)
dataset_dict = dataset_dict['dataset'].item()
if self.dataset in dataset_dict.keys():
X, y = dataset_dict[self.dataset]
else:
raise NotImplementedError
else:
X, y = self.generate_realistic_synthetic(X, y,
realistic_synthetic_mode=realistic_synthetic_mode,
alpha=alpha, percentage=percentage)
# whether to add different types of noise for testing the robustness of benchmark models
if noise_type is None:
pass
elif noise_type == 'duplicated_anomalies':
# X, y = self.add_duplicated_anomalies(X, y, duplicate_times=duplicate_times)
pass
elif noise_type == 'irrelevant_features':
X, y = self.add_irrelevant_features(X, y, noise_ratio=noise_ratio)
elif noise_type == 'anomaly_contamination':
pass
elif noise_type == 'label_contamination':
pass
else:
raise NotImplementedError
print(f'current noise type: {noise_type}')
# show the statistic
self.utils.data_description(X=X, y=y)
# splitting the current data into the training set and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, shuffle=True, stratify=y)
# we respectively generate the duplicated anomalies for the training and testing set
if noise_type == 'duplicated_anomalies':
X_train, y_train = self.add_duplicated_anomalies(X_train, y_train, duplicate_times=duplicate_times)
X_test, y_test = self.add_duplicated_anomalies(X_test, y_test, duplicate_times=duplicate_times)
# notice that label contamination can only be added in the training set
elif noise_type == 'label_contamination':
X_train, y_train = self.add_label_contamination(X_train, y_train, noise_ratio=noise_ratio)
# minmax scaling
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# idx of normal samples and unlabeled/labeled anomalies
idx_normal = np.where(y_train == 0)[0]
idx_anomaly = np.where(y_train == 1)[0]
if type(la) == float:
if at_least_one_labeled:
idx_labeled_anomaly = np.random.choice(idx_anomaly, ceil(la * len(idx_anomaly)), replace=False)
else:
idx_labeled_anomaly = np.random.choice(idx_anomaly, int(la * len(idx_anomaly)), replace=False)
elif type(la) == int:
if la > len(idx_anomaly):
raise AssertionError(f'The specified number of labeled anomalies exceeds the total number of anomalies: {len(idx_anomaly)}')
else:
idx_labeled_anomaly = np.random.choice(idx_anomaly, la, replace=False)
else:
raise NotImplementedError
idx_unlabeled_anomaly = np.setdiff1d(idx_anomaly, idx_labeled_anomaly)
# whether to remove the anomaly contamination in the unlabeled data
if noise_type == 'anomaly_contamination':
idx_unlabeled_anomaly = self.remove_anomaly_contamination(idx_unlabeled_anomaly, contam_ratio)
# unlabeled data = normal data + unlabeled anomalies (which are considered as contamination)
idx_unlabeled = np.append(idx_normal, idx_unlabeled_anomaly)
del idx_anomaly, idx_unlabeled_anomaly
# the label of unlabeled data is 0, and that of labeled anomalies is 1
y_train[idx_unlabeled] = 0
y_train[idx_labeled_anomaly] = 1
return {'X_train':X_train, 'y_train':y_train, 'X_test':X_test, 'y_test':y_test,
'n_labeled_anomalies':n_labeled_anomalies}
# generate out-of-distribution dataset (to do)
def generator_OOD(self,
ID_num: int=2, OOD_ratio: float=0.5,
n_samples_train=7000, n_samples_test=3000, anomaly_ratio=0.01):
'''
ID_num: number of in-distribution anomaly classes in the training set
OOD_ratio: the out-of-distribution anomaly ratio in the testing set
n_samples_train: the training set size
n_samples_test: the testing set size
'''
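# A minimal usage sketch (an assumption-laden example: it presumes datasets_unprocessed/kddcup99.csv
# is available and that the default sizes below are acceptable):
#   dg = DataGenerator(seed=42, show_statistic=True)
#   data_ood = dg.generator_OOD(ID_num=2, OOD_ratio=0.5, n_samples_train=7000, n_samples_test=3000)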
# kddcup99 data
data_org = pd.read_csv(os.path.join('datasets_unprocessed', 'kddcup99.csv'))
# inspect the class distribution of all labels
if self.show_statistic:
classes = list(data_org['label'].drop_duplicates())
for i in range(len(classes)):
print(f"class: {classes[i]}, count: {data_org[data_org['label']==classes[i]].shape[0]}")
# set seed and shuffle
self.utils.set_seed(self.seed)
data = data_org.sample(frac=1).reset_index(drop=True)
# global parameters
anomaly_class_pool = data['label'].drop_duplicates().values
anomaly_class_pool = [_ for _ in anomaly_class_pool if _ != 'normal.' and sum(data['label']==_) >=
int((n_samples_train+n_samples_test)*anomaly_ratio)]
# ID anomaly class pool (select several of all anomaly classes as the in-distribution anomalies)
anomaly_class_ID_pool = list(combinations(anomaly_class_pool, ID_num))
# ID anomaly classes: randomly pick one combination from all candidate ID combinations
anomaly_class_ID = list(anomaly_class_ID_pool[np.random.choice(np.arange(len(anomaly_class_ID_pool)), 1)[0]])
# OOD anomaly class
anomaly_class_OOD = list(set(anomaly_class_pool) - set(anomaly_class_ID))
if self.show_statistic:
print(f'ID anomaly classes: {anomaly_class_ID}; OOD anomaly classes: {anomaly_class_OOD}')
# sampling for the normal samples
idx_normal = np.where(data['label'] == 'normal.')[0]
# normal samples of training set
idx_normal_train = np.random.choice(idx_normal, int(n_samples_train * (1-anomaly_ratio)), replace=False)
# normal samples of testing set
idx_normal_test = np.random.choice(np.setdiff1d(idx_normal, idx_normal_train),
int(n_samples_test * (1-anomaly_ratio)), replace=False)
# in the training set, all anomalies belong to the in-distribution anomaly classes
# in the testing set, a fraction OOD_ratio of the anomalies are out-of-distribution and the rest are in-distribution
# the index of in-distribution anomalies
idx_anomalies_ID = []
for c in anomaly_class_ID:
idx_anomalies_ID.extend(list(np.where(data['label'] == c)[0]))
# the index of out-of-distribution anomalies
idx_anomalies_OOD = []
for c in anomaly_class_OOD:
idx_anomalies_OOD.extend(list(np.where(data['label'] == c)[0]))
# to array
idx_anomalies_ID = np.array(idx_anomalies_ID)
idx_anomalies_OOD = np.array(idx_anomalies_OOD)
# training set anomalies
idx_anomalies_ID_train = np.random.choice(idx_anomalies_ID, int(n_samples_train*anomaly_ratio), replace=False)
idx_anomalies_ID_test = np.random.choice(np.setdiff1d(idx_anomalies_ID, idx_anomalies_ID_train),
int(n_samples_test * anomaly_ratio * (1 - OOD_ratio)), replace=False)
idx_anomalies_OOD_test = np.random.choice(idx_anomalies_OOD,
int(n_samples_test * anomaly_ratio * OOD_ratio), replace=False)
idx_train = np.concatenate([idx_normal_train, idx_anomalies_ID_train])
idx_test = np.concatenate([idx_normal_test, idx_anomalies_ID_test, idx_anomalies_OOD_test])
# combine
y_train = np.append(np.repeat(0, len(idx_normal_train)),
np.repeat(1, len(idx_anomalies_ID_train)))
X_train = data.drop(['label'], axis=1).values[idx_train]
y_test = np.append(np.repeat(0, len(idx_normal_test)),
np.repeat(1, len(idx_anomalies_ID_test) + len(idx_anomalies_OOD_test)))
X_test = data.drop(['label'], axis=1).values[idx_test]
# shuffle idx
self.utils.set_seed(self.seed)
idx_train = np.random.choice(np.arange(len(idx_train)), len(idx_train), replace=False)
idx_test = np.random.choice(np.arange(len(idx_test)), len(idx_test), replace=False)
X_train = X_train[idx_train]; y_train = y_train[idx_train]
X_test = X_test[idx_test]; y_test = y_test[idx_test]
# minmax scaling
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
return {'X_train': X_train, 'y_train': y_train, 'X_test': X_test, 'y_test': y_test}
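# A hedged end-to-end example of the common workflow; it is a usage sketch rather than part of the
# original benchmark pipeline, and it assumes the 'cardio' dataset file exists under ./datasets.
if __name__ == '__main__':
    data_generator = DataGenerator(seed=42, dataset='cardio', test_size=0.3)
    # label 10% of the anomalies in the training set
    data = data_generator.generator(la=0.1, at_least_one_labeled=True)
    print('train shape:', data['X_train'].shape, 'test shape:', data['X_test'].shape)
    print('labeled anomalies in the original data:', data['n_labeled_anomalies'])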