diff --git a/baseline/DeepSAD/src/base/odds_dataset.py b/baseline/DeepSAD/src/base/odds_dataset.py index 7455ed2..d94bf59 100644 --- a/baseline/DeepSAD/src/base/odds_dataset.py +++ b/baseline/DeepSAD/src/base/odds_dataset.py @@ -22,19 +22,14 @@ class ODDSDataset(Dataset): def __init__(self, data, train=True): super(Dataset, self).__init__() - X_train = data['X_train'] - y_train = data['y_train'] - X_test = data['X_test'] - y_test = data['y_test'] - self.train = train if self.train: - self.data = torch.tensor(X_train, dtype=torch.float32) - self.targets = torch.tensor(y_train, dtype=torch.int64) + self.data = torch.tensor(data['X_train'], dtype=torch.float32) + self.targets = torch.tensor(data['y_train'], dtype=torch.int64) else: - self.data = torch.tensor(X_test, dtype=torch.float32) - self.targets = torch.tensor(y_test, dtype=torch.int64) + self.data = torch.tensor(data['X_test'], dtype=torch.float32) + self.targets = torch.tensor(data['y_test'], dtype=torch.int64) # self.semi_targets = torch.zeros_like(self.targets) self.semi_targets = self.targets diff --git a/baseline/DeepSAD/src/datasets/main.py b/baseline/DeepSAD/src/datasets/main.py index e69f62b..1ce36ad 100644 --- a/baseline/DeepSAD/src/datasets/main.py +++ b/baseline/DeepSAD/src/datasets/main.py @@ -4,38 +4,10 @@ from .odds import ODDSADDataset -def load_dataset(data): +def load_dataset(data, train=True): """Loads the dataset.""" - #代码中暂不支持DeepSAD部署在CV数据集中,之后会更新 - # if dataset == 'mnist': - # dataset = MNIST_Dataset(root=data_path, - # normal_class=normal_class, - # known_outlier_class=known_outlier_class, - # n_known_outlier_classes=n_known_outlier_classes, - # ratio_known_normal=ratio_known_normal, - # ratio_known_outlier=ratio_known_outlier, - # ratio_pollution=ratio_pollution) - # - # elif dataset == 'fmnist': - # dataset = FashionMNIST_Dataset(root=data_path, - # normal_class=normal_class, - # known_outlier_class=known_outlier_class, - # n_known_outlier_classes=n_known_outlier_classes, - # ratio_known_normal=ratio_known_normal, - # ratio_known_outlier=ratio_known_outlier, - # ratio_pollution=ratio_pollution) - # - # elif dataset == 'cifar10': - # dataset = CIFAR10_Dataset(root=data_path, - # normal_class=normal_class, - # known_outlier_class=known_outlier_class, - # n_known_outlier_classes=n_known_outlier_classes, - # ratio_known_normal=ratio_known_normal, - # ratio_known_outlier=ratio_known_outlier, - # ratio_pollution=ratio_pollution) - - #tabular data - dataset = ODDSADDataset(data=data) + # for tabular data + dataset = ODDSADDataset(data=data, train=train) return dataset diff --git a/baseline/DeepSAD/src/datasets/odds.py b/baseline/DeepSAD/src/datasets/odds.py index 64fc51d..9553014 100644 --- a/baseline/DeepSAD/src/datasets/odds.py +++ b/baseline/DeepSAD/src/datasets/odds.py @@ -8,7 +8,7 @@ class ODDSADDataset(BaseADDataset): - def __init__(self, data): + def __init__(self, data, train): super().__init__(self) # Define normal and outlier classes @@ -16,17 +16,24 @@ def __init__(self, data): self.normal_classes = (0,) self.outlier_classes = (1,) - # Get training set - self.train_set = ODDSDataset(data=data, train=True) - # Get testing set - self.test_set = ODDSDataset(data=data, train=False) + # training or testing dataset + self.train = train + + if self.train: + # Get training set + self.train_set = ODDSDataset(data=data, train=True) + else: + # Get testing set + self.test_set = ODDSDataset(data=data, train=False) def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> ( DataLoader, DataLoader): - train_loader = DataLoader(dataset=self.train_set, batch_size=batch_size, shuffle=shuffle_train, - num_workers=num_workers, drop_last=True) - test_loader = DataLoader(dataset=self.test_set, batch_size=batch_size, shuffle=shuffle_test, - num_workers=num_workers, drop_last=False) - - return train_loader, test_loader + if self.train: + train_loader = DataLoader(dataset=self.train_set, batch_size=batch_size, shuffle=shuffle_train, + num_workers=num_workers, drop_last=True) + return train_loader + else: + test_loader = DataLoader(dataset=self.test_set, batch_size=batch_size, shuffle=shuffle_test, + num_workers=num_workers, drop_last=False) + return test_loader \ No newline at end of file diff --git a/baseline/DeepSAD/src/deepsad.py b/baseline/DeepSAD/src/deepsad.py index f773ceb..2445313 100644 --- a/baseline/DeepSAD/src/deepsad.py +++ b/baseline/DeepSAD/src/deepsad.py @@ -80,14 +80,16 @@ def test(self, dataset: BaseADDataset, device: str = 'cuda', n_jobs_dataloader: if self.trainer is None: self.trainer = DeepSADTrainer(self.c, self.eta, device=device, n_jobs_dataloader=n_jobs_dataloader) - self.trainer.test(dataset, self.net) + score = self.trainer.test(dataset, self.net) # Get results - self.results['test_aucroc'] = self.trainer.test_aucroc - self.results['test_aucpr'] = self.trainer.test_aucpr + # self.results['test_aucroc'] = self.trainer.test_aucroc + # self.results['test_aucpr'] = self.trainer.test_aucpr self.results['test_time'] = self.trainer.test_time self.results['test_scores'] = self.trainer.test_scores + return score + def pretrain(self, dataset: BaseADDataset, input_size ,optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 100, lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', n_jobs_dataloader: int = 0): diff --git a/baseline/DeepSAD/src/optim/DeepSAD_trainer.py b/baseline/DeepSAD/src/optim/DeepSAD_trainer.py index c48df0e..613ccdb 100644 --- a/baseline/DeepSAD/src/optim/DeepSAD_trainer.py +++ b/baseline/DeepSAD/src/optim/DeepSAD_trainer.py @@ -36,7 +36,7 @@ def train(self, dataset: BaseADDataset, net: BaseNet): logger = logging.getLogger() # Get train data loader - train_loader, _ = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) + train_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) # Set device for network net = net.to(self.device) @@ -101,7 +101,7 @@ def test(self, dataset: BaseADDataset, net: BaseNet): logger = logging.getLogger() # Get test data loader - _, test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) + test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) # Set device for network net = net.to(self.device) @@ -141,18 +141,20 @@ def test(self, dataset: BaseADDataset, net: BaseNet): # Compute AUC _, labels, scores = zip(*idx_label_score) - labels = np.array(labels) + # labels = np.array(labels) scores = np.array(scores) - self.test_aucroc = roc_auc_score(labels, scores) - self.test_aucpr = average_precision_score(labels, scores, pos_label = 1) + # self.test_aucroc = roc_auc_score(labels, scores) + # self.test_aucpr = average_precision_score(labels, scores, pos_label = 1) # Log results logger.info('Test Loss: {:.6f}'.format(epoch_loss / n_batches)) - logger.info('Test AUCROC: {:.2f}%'.format(100. * self.test_aucroc)) - logger.info('Test AUCPR: {:.2f}%'.format(100. * self.test_aucpr)) + # logger.info('Test AUCROC: {:.2f}%'.format(100. * self.test_aucroc)) + # logger.info('Test AUCPR: {:.2f}%'.format(100. * self.test_aucpr)) logger.info('Test Time: {:.3f}s'.format(self.test_time)) logger.info('Finished testing.') + return scores + def init_center_c(self, train_loader: DataLoader, net: BaseNet, eps=0.1): """Initialize hypersphere center c as the mean from an initial forward pass on the data.""" n_samples = 0 diff --git a/baseline/DeepSAD/src/optim/ae_trainer.py b/baseline/DeepSAD/src/optim/ae_trainer.py index 43116ca..057fd22 100644 --- a/baseline/DeepSAD/src/optim/ae_trainer.py +++ b/baseline/DeepSAD/src/optim/ae_trainer.py @@ -27,7 +27,7 @@ def train(self, dataset: BaseADDataset, ae_net: BaseNet): logger = logging.getLogger() # Get train data loader - train_loader, _ = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) + train_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) # Set loss criterion = nn.MSELoss(reduction='none') @@ -86,7 +86,7 @@ def test(self, dataset: BaseADDataset, ae_net: BaseNet): logger = logging.getLogger() # Get test data loader - _, test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) + test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) # Set loss criterion = nn.MSELoss(reduction='none') diff --git a/baseline/DeepSAD/src/run.py b/baseline/DeepSAD/src/run.py index e3ae66e..ce12a8d 100644 --- a/baseline/DeepSAD/src/run.py +++ b/baseline/DeepSAD/src/run.py @@ -37,7 +37,7 @@ def __init__(self, seed, model_name='DeepSAD'): self.num_threads = 0 self.n_jobs_dataloader = 0 - def fit2test(self, data): + def fit(self, X_train, y_train, ratio=None): """ Deep SAD, a method for deep semi-supervised anomaly detection. @@ -56,23 +56,24 @@ def fit2test(self, data): logging.info('Number of threads: %d' % self.num_threads) logging.info('Number of dataloader workers: %d' % self.n_jobs_dataloader) - #Load data (modified) - dataset = load_dataset(data=data) + # Load data + data = {'X_train': X_train, 'y_train': y_train} + dataset = load_dataset(data=data, train=True) input_size = dataset.train_set.data.size(1) #input size # Initialize DeepSAD model and set neural network phi - deepSAD = deepsad(self.eta) - deepSAD.set_network(self.net_name, input_size) + self.deepSAD = deepsad(self.eta) + self.deepSAD.set_network(self.net_name, input_size) # If specified, load Deep SAD model (center c, network weights, and possibly autoencoder weights) if self.load_model: - deepSAD.load_model(model_path=self.load_model, load_ae=True, map_location=self.device) + self.deepSAD.load_model(model_path=self.load_model, load_ae=True, map_location=self.device) logging.info('Loading model from %s.' % self.load_model) logging.info('Pretraining: %s' % self.pretrain) if self.pretrain: # Pretrain model on dataset (via autoencoder) - deepSAD.pretrain(dataset, + self.deepSAD.pretrain(dataset, input_size, optimizer_name=self.ae_optimizer_name, lr=self.ae_lr, @@ -84,18 +85,15 @@ def fit2test(self, data): n_jobs_dataloader=self.n_jobs_dataloader) # Train model on dataset - deepSAD.train(dataset, - optimizer_name=self.optimizer_name, - lr=self.lr, - n_epochs=self.n_epochs, - lr_milestones=self.lr_milestone, - batch_size=self.batch_size, - weight_decay=self.weight_decay, - device=self.device, - n_jobs_dataloader=self.n_jobs_dataloader) - - # Test model - deepSAD.test(dataset, device=self.device, n_jobs_dataloader=self.n_jobs_dataloader) + self.deepSAD.train(dataset, + optimizer_name=self.optimizer_name, + lr=self.lr, + n_epochs=self.n_epochs, + lr_milestones=self.lr_milestone, + batch_size=self.batch_size, + weight_decay=self.weight_decay, + device=self.device, + n_jobs_dataloader=self.n_jobs_dataloader) # Save results, model, and configuration # deepSAD.save_results(export_json=xp_path + '/results.json') @@ -108,23 +106,11 @@ def fit2test(self, data): # idx_all_sorted = indices[np.argsort(scores)] # from lowest to highest score # idx_normal_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # from lowest to highest score - if dataset in ('mnist', 'fmnist', 'cifar10'): + return self - if dataset in ('mnist', 'fmnist'): - X_all_low = dataset.test_set.data[idx_all_sorted[:32], ...].unsqueeze(1) - X_all_high = dataset.test_set.data[idx_all_sorted[-32:], ...].unsqueeze(1) - X_normal_low = dataset.test_set.data[idx_normal_sorted[:32], ...].unsqueeze(1) - X_normal_high = dataset.test_set.data[idx_normal_sorted[-32:], ...].unsqueeze(1) + def predict_score(self, X): + # input randomly generated y label for consistence + dataset = load_dataset(data={'X_test': X, 'y_test': np.random.choice([0, 1], X.shape[0])}, train=False) + score = self.deepSAD.test(dataset, device=self.device, n_jobs_dataloader=self.n_jobs_dataloader) - if dataset == 'cifar10': - X_all_low = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[:32], ...], (0,3,1,2))) - X_all_high = torch.tensor(np.transpose(dataset.test_set.data[idx_all_sorted[-32:], ...], (0,3,1,2))) - X_normal_low = torch.tensor(np.transpose(dataset.test_set.data[idx_normal_sorted[:32], ...], (0,3,1,2))) - X_normal_high = torch.tensor(np.transpose(dataset.test_set.data[idx_normal_sorted[-32:], ...], (0,3,1,2))) - - plot_images_grid(X_all_low, export_img=self.xp_path + '/all_low', padding=2) - plot_images_grid(X_all_high, export_img=self.xp_path + '/all_high', padding=2) - plot_images_grid(X_normal_low, export_img=self.xp_path + '/normals_low', padding=2) - plot_images_grid(X_normal_high, export_img=self.xp_path + '/normals_high', padding=2) - - return {'aucroc':deepSAD.results['test_aucroc'], 'aucpr':deepSAD.results['test_aucpr']} \ No newline at end of file + return score \ No newline at end of file diff --git a/baseline/DevNet/run.py b/baseline/DevNet/run.py index 3429037..cd3d975 100644 --- a/baseline/DevNet/run.py +++ b/baseline/DevNet/run.py @@ -3,8 +3,8 @@ @author: Guansong Pang The algorithm was implemented using Python 3.6.6, Keras 2.2.2 and TensorFlow 1.10.1. More details can be found in our KDD19 paper. -Guansong Pang, Chunhua Shen, and Anton van den Hengel. 2019. -Deep Anomaly Detection with Deviation Networks. +Guansong Pang, Chunhua Shen, and Anton van den Hengel. 2019. +Deep Anomaly Detection with Deviation Networks. In The 25th ACM SIGKDDConference on Knowledge Discovery and Data Mining (KDD ’19), August4–8, 2019, Anchorage, AK, USA.ACM, New York, NY, USA, 10 pages. https://doi.org/10.1145/3292500.3330871 """ @@ -16,9 +16,13 @@ from keras import backend as K from keras.models import Model, load_model from keras.layers import Input, Dense -from keras.optimizers import RMSprop from keras.callbacks import ModelCheckpoint, TensorBoard +try: + from keras.optimizers import RMSprop # old tf version +except: + from tensorflow.keras.optimizers import RMSprop + import argparse import numpy as np import pandas as pd @@ -67,6 +71,7 @@ def __init__(self, seed, model_name='DevNet', save_suffix=None): # random_seed = args.ramdn_seed self.save_suffix = save_suffix + self.ref = None # normal distribution reference, created for reusing across subsequent function calls def dev_network_d(self,input_shape): ''' @@ -105,12 +110,15 @@ def deviation_loss(self, y_true, y_pred): ''' z-score-based deviation loss ''' + confidence_margin = 5. ## size=5000 is the setting of l in algorithm 1 in the paper - ref = K.variable(np.random.normal(loc = 0., scale= 1.0, size = 5000) , dtype='float32') - dev = (y_pred - K.mean(ref)) / K.std(ref) + if self.ref is None: + self.ref = K.variable(np.random.normal(loc = 0., scale= 1.0, size = 5000), dtype='float32') + dev = (y_pred - K.mean(self.ref)) / K.std(self.ref) inlier_loss = K.abs(dev) outlier_loss = K.abs(K.maximum(confidence_margin - dev, 0.)) + return K.mean((1 - y_true) * inlier_loss + y_true * outlier_loss) def deviation_network(self, input_shape, network_depth): @@ -160,7 +168,7 @@ def input_batch_generation_sup(self, X_train, outlier_indices, inlier_indices, b sid = rng.choice(n_outliers, 1) ref[i] = X_train[outlier_indices[sid]] training_labels += [1] - return np.array(ref), np.array(training_labels) + return np.array(ref), np.array(training_labels, dtype=float) def input_batch_generation_sup_sparse(self, X_train, outlier_indices, inlier_indices, batch_size, rng): ''' @@ -218,9 +226,9 @@ def fit(self, X_train, y_train, ratio=None): save_best_only = True, save_weights_only = True) self.model.fit_generator(self.batch_generator_sup(X_train, outlier_indices, inlier_indices, batch_size, nb_batch, rng), - steps_per_epoch = nb_batch, - epochs = epochs, - callbacks=[checkpointer]) + steps_per_epoch = nb_batch, + epochs = epochs, + callbacks=[checkpointer]) return self diff --git a/baseline/FEAWAD/run.py b/baseline/FEAWAD/run.py index 48819e0..8f8c2b3 100644 --- a/baseline/FEAWAD/run.py +++ b/baseline/FEAWAD/run.py @@ -25,10 +25,18 @@ from keras import backend as K from keras.models import Model from keras.layers import Input, Dense, Subtract,concatenate,Lambda,Reshape -from keras.optimizers import Adam, RMSprop from keras.callbacks import ModelCheckpoint from keras.losses import mean_squared_error +try: + from keras.optimizers import Adam, RMSprop +except: + from tensorflow.keras.optimizers import Adam, RMSprop + +# Disable TF eager execution mode for avoid the errors caused by the custom loss function +from tensorflow.python.framework.ops import disable_eager_execution +disable_eager_execution() + class FEAWAD(): def __init__(self, seed, model_name='FEAWAD', save_suffix=None): self.utils = Utils() @@ -103,7 +111,7 @@ def dev_network_d(self, input_shape, modelname, testflag): cal_norm2 = Lambda(lambda x: tf.norm(x,ord = 2,axis=1)) sub_norm2 = cal_norm2(sub_result) sub_norm2 = Reshape((1,))(sub_norm2) - division = Lambda(lambda x:tf.div(x[0],x[1])) + division = Lambda(lambda x:tf.divide(x[0],x[1])) sub_result = division([sub_result,sub_norm2]) # normalized reconstruction residual error conca_tensor = concatenate([sub_result,en2],axis=1) # [hidden representation, normalized reconstruction residual error] @@ -113,7 +121,7 @@ def dev_network_d(self, input_shape, modelname, testflag): cal_norm2 = Lambda(lambda x: tf.norm(x,ord = 2,axis=1)) sub_norm2 = cal_norm2(sub_result) sub_norm2 = Reshape((1,))(sub_norm2) - division = Lambda(lambda x:tf.div(x[0],x[1])) + division = Lambda(lambda x:tf.divide(x[0],x[1])) sub_result = division([sub_result,sub_norm2]) conca_tensor = concatenate([sub_result,en2],axis=1) @@ -125,6 +133,7 @@ def dev_network_d(self, input_shape, modelname, testflag): intermediate = concatenate([intermediate,sub_norm2],axis=1) # again, concat the intermediate vector with the residual error output_pre = Dense(1, kernel_initializer='glorot_normal',use_bias=True,activation='linear', name = 'score')(intermediate) dev_model = Model(x_input, output_pre) + def multi_loss(y_true,y_pred): confidence_margin = 5. @@ -188,7 +197,7 @@ def AE_input_batch_generation_sup(self, train_x, inlier_indices, batch_size, rng sid = rng.choice(n_inliers, 1) ref[i] = train_x[inlier_indices[sid]] training_labels[i] = train_x[inlier_indices[sid]] - return np.array(ref), np.array(training_labels) + return np.array(ref), np.array(training_labels, dtype=float) def batch_generator_sup(self, x, outlier_indices, inlier_indices, batch_size, nb_batch, rng): """batch generator @@ -227,7 +236,7 @@ def input_batch_generation_sup(self, train_x, outlier_indices, inlier_indices, b sid = rng.choice(n_outliers, 1) ref[i] = train_x[outlier_indices[sid]] training_labels += [1] - return np.array(ref), np.array(training_labels) + return np.array(ref), np.array(training_labels, dtype=float) def input_batch_generation_sup_sparse(self, train_x, outlier_indices, inlier_indices, batch_size, rng): ''' @@ -249,7 +258,7 @@ def input_batch_generation_sup_sparse(self, train_x, outlier_indices, inlier_ind ref[i] = outlier_indices[sid] training_labels += [1] ref = train_x[ref, :].toarray() - return ref, np.array(training_labels) + return ref, np.array(training_labels, dtype=float) def load_model_weight_predict(self, model_name, input_shape, network_depth, test_x): ''' diff --git a/baseline/FTTransformer/run.py b/baseline/FTTransformer/run.py index 909ed88..5d64964 100644 --- a/baseline/FTTransformer/run.py +++ b/baseline/FTTransformer/run.py @@ -26,7 +26,11 @@ def __init__(self, seed:int, model_name:str, n_epochs=100, batch_size=64): self.utils = Utils() # device - self.device = torch.device('cuda:0') if model_name == 'FTTransformer' else torch.device('cpu') + if model_name == 'FTTransformer': + self.device = self.utils.get_device(gpu_specific=True) + else: + self.device = self.utils.get_device(gpu_specific=False) + # Docs: https://yura52.github.io/zero/0.0.4/reference/api/zero.improve_reproducibility.html zero.improve_reproducibility(seed=self.seed) @@ -34,26 +38,55 @@ def __init__(self, seed:int, model_name:str, n_epochs=100, batch_size=64): self.n_epochs = n_epochs # default is 1000 self.batch_size = batch_size # default is 256 - def fit2test(self, data): + def apply_model(self, x_num, x_cat=None): + if isinstance(self.model, rtdl.FTTransformer): + return self.model(x_num, x_cat) + elif isinstance(self.model, (rtdl.MLP, rtdl.ResNet)): + assert x_cat is None + return self.model(x_num) + else: + raise NotImplementedError( + f'Looks like you are using a custom model: {type(self.model)}.' + ' Then you have to implement this branch first.' + ) + + @torch.no_grad() + def evaluate(self, X, y=None): + self.model.eval() + score = [] + # for batch in zero.iter_batches(X[part], 1024): + for batch in zero.iter_batches(X, self.batch_size): + score.append(self.apply_model(batch)) + score = torch.cat(score).squeeze(1).cpu().numpy() + score = scipy.special.expit(score) + + # calculate the metric + if y is not None: + target = y.cpu().numpy() + metric = self.utils.metric(y_true=target, y_score=score) + else: + metric = {'aucroc': None, 'aucpr': None} + + return score, metric['aucpr'] + + def fit(self, X_train, y_train, ratio=None): # set seed self.utils.set_seed(self.seed) # training set is used as the validation set in the anomaly detection task - X = {'train': torch.from_numpy(data['X_train']).float().to(self.device), - 'val': torch.from_numpy(data['X_train']).float().to(self.device), - 'test': torch.from_numpy(data['X_test']).float().to(self.device)} + X = {'train': torch.from_numpy(X_train).float().to(self.device), + 'val': torch.from_numpy(X_train).float().to(self.device)} - y = {'train': torch.from_numpy(data['y_train']).float().to(self.device), - 'val': torch.from_numpy(data['y_train']).float().to(self.device), - 'test': torch.from_numpy(data['y_test']).float().to(self.device)} + y = {'train': torch.from_numpy(y_train).float().to(self.device), + 'val': torch.from_numpy(y_train).float().to(self.device)} task_type = 'binclass' n_classes = None d_out = n_classes or 1 if self.model_name == 'ResNet': - model = rtdl.ResNet.make_baseline( - d_in=data['X_train'].shape[1], + self.model = rtdl.ResNet.make_baseline( + d_in=X_train.shape[1], d_main=128, d_hidden=256, dropout_first=0.2, @@ -65,8 +98,8 @@ def fit2test(self, data): weight_decay = 0.0 elif self.model_name == 'FTTransformer': - model = rtdl.FTTransformer.make_default( - n_num_features=data['X_train'].shape[1], + self.model = rtdl.FTTransformer.make_default( + n_num_features=X_train.shape[1], cat_cardinalities=None, last_layer_query_idx=[-1], # it makes the model faster and does NOT affect its output d_out=d_out, @@ -75,11 +108,11 @@ def fit2test(self, data): else: raise NotImplementedError - model.to(self.device) + self.model.to(self.device) optimizer = ( - model.make_default_optimizer() - if isinstance(model, rtdl.FTTransformer) - else torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay) + self.model.make_default_optimizer() + if isinstance(self.model, rtdl.FTTransformer) + else torch.optim.AdamW(self.model.parameters(), lr=lr, weight_decay=weight_decay) ) loss_fn = ( F.binary_cross_entropy_with_logits @@ -89,34 +122,6 @@ def fit2test(self, data): else F.mse_loss ) - def apply_model(x_num, x_cat=None): - if isinstance(model, rtdl.FTTransformer): - return model(x_num, x_cat) - elif isinstance(model, (rtdl.MLP, rtdl.ResNet)): - assert x_cat is None - return model(x_num) - else: - raise NotImplementedError( - f'Looks like you are using a custom model: {type(model)}.' - ' Then you have to implement this branch first.' - ) - - @torch.no_grad() - def evaluate(part): - model.eval() - score = [] - # for batch in zero.iter_batches(X[part], 1024): - for batch in zero.iter_batches(X[part], self.batch_size): - score.append(apply_model(batch)) - score = torch.cat(score).squeeze(1).cpu().numpy() - score = scipy.special.expit(score) - - # calculate the metric - target = y[part].cpu().numpy() - metric = self.utils.metric(y_true=target, y_score=score) - - return score, metric['aucpr'] - # Create a dataloader for batches of indices # Docs: https://yura52.github.io/zero/reference/api/zero.data.IndexLoader.html train_loader = zero.data.IndexLoader(len(X['train']), self.batch_size, device=self.device) @@ -125,39 +130,36 @@ def evaluate(part): # Docs: https://yura52.github.io/zero/reference/api/zero.ProgressTracker.html progress = zero.ProgressTracker(patience=100) - _, metric = evaluate("test") - print(f'Test metric before training: {metric:.4f}') - - # training # report_frequency = len(X['train']) // self.batch_size // 5 - score_test = None for epoch in range(1, self.n_epochs + 1): for iteration, batch_idx in enumerate(train_loader): - model.train() + self.model.train() optimizer.zero_grad() x_batch = X['train'][batch_idx] y_batch = y['train'][batch_idx] - loss = loss_fn(apply_model(x_batch).squeeze(1), y_batch) + loss = loss_fn(self.apply_model(x_batch).squeeze(1), y_batch) loss.backward() optimizer.step() # if iteration % report_frequency == 0: # print(f'(epoch) {epoch} (batch) {iteration} (loss) {loss.item():.4f}') - _, val_metric = evaluate('val') - score_test_epoch, test_metric = evaluate('test') - print(f'Epoch {epoch:03d} | Validation metric: {val_metric:.4f} | Test metric: {test_metric:.4f}', end='') + _, val_metric = self.evaluate(X=X['val'], y=y['val']) + print(f'Epoch {epoch:03d} | Validation metric: {val_metric:.4f}', end='') progress.update((-1 if task_type == 'regression' else 1) * val_metric) if progress.success: print(' <<< BEST VALIDATION EPOCH', end='') - score_test = score_test_epoch.copy() print() if progress.fail: break - result = self.utils.metric(y_true=data['y_test'], y_score=score_test) + return self + + def predict_score(self, X): + X = torch.from_numpy(X).float().to(self.device) + score, _ = self.evaluate(X=X, y=None) + return score - return result diff --git a/data_generator.py b/data_generator.py index e811b10..cf79feb 100644 --- a/data_generator.py +++ b/data_generator.py @@ -208,71 +208,10 @@ def generator(self, la=None, at_least_one_labeled=False, # set seed for reproducible results self.utils.set_seed(self.seed) - # transfer different file format to the numpy array - if self.dataset in ['annthyroid', 'cardio', 'mammography', 'musk', 'optdigits', 'pendigits', - 'satimage-2', 'speech', 'thyroid', 'vowels', 'cover', 'http', 'letter', - 'mnist', 'satellite', 'shuttle', 'smtp', 'breastw', 'vertebral', - 'wine']: - if self.dataset in ['http', 'smtp']: - data = mat73.loadmat(os.path.join('datasets', self.dataset + '.mat')) - else: - data = scipy.io.loadmat(os.path.join('datasets', self.dataset + '.mat')) - X = data['X'] - y = data['y'].squeeze().astype('int64') - - elif self.dataset in ['Waveform_withoutdupl_v10', 'InternetAds_withoutdupl_norm_19', 'PageBlocks_withoutdupl_09', - 'SpamBase_withoutdupl_40', 'Wilt_withoutdupl_05', 'Cardiotocography_withoutdupl_22', - 'WBC_withoutdupl_v10', 'WDBC_withoutdupl_v10', 'WPBC_withoutdupl_norm', - 'Arrhythmia_withoutdupl_46', 'HeartDisease_withoutdupl_44', 'Hepatitis_withoutdupl_16', - 'Parkinson_withoutdupl_75', 'Pima_withoutdupl_35', 'Stamps_withoutdupl_09']: - data = pd.read_csv(os.path.join('datasets', self.dataset + '.csv')) - - data.columns = [_.split("'")[1] for _ in data.columns] - X = data.drop(['outlier', 'id'], axis=1).values - y = [_.split("'")[1] for _ in data['outlier'].values] - y = np.array([0 if _ == 'no' else 1 for _ in y]) - - elif self.dataset in ['ALOI_withoutdupl', 'glass_withoutduplicates_normalized', - 'Ionosphere_withoutdupl_norm', 'Lymphography_withoutdupl_idf']: - data = pd.read_csv(os.path.join('datasets', self.dataset + '.csv')) - - X = data.drop(['outlier','id'], axis=1).values - y = np.array([0 if _ == 'no' else 1 for _ in data['outlier'].values]) - - elif self.dataset in ['abalone.diff', 'comm.and.crime.diff', 'concrete.diff', 'fault.diff', 'imgseg.diff', - 'landsat.diff', 'magic.gamma.diff', 'skin.diff', 'yeast.diff']: - data = pd.read_csv(os.path.join('datasets', self.dataset + '.csv')) - X = data.drop(['point.id', 'motherset', 'origin', 'original.label', 'diff.score', 'ground.truth'], - axis=1).values - y = np.array([0 if _ == 'nominal' else 1 for _ in data['ground.truth'].values]) - - # Credit Card Fraud Detection (CCFD) dataset - elif self.dataset == 'CCFD': - data = pd.read_csv(os.path.join('datasets', self.dataset + '.csv')) - X = data.drop(['Time', 'Class'], axis=1) - y = data['Class'].values - - # Taiwan Bankruptcy Prediction (TBP) dataset - elif self.dataset == 'TBP': - data = pd.read_csv(os.path.join('datasets', self.dataset + '.csv')) - X = data.drop(['Flag'], axis=1) - y = data['Flag'].values - - elif self.dataset in ['amazon', 'yelp', 'imdb'] + \ - ['agnews_' + str(i) for i in range(4)] +\ - ['FashionMNIST_' + str(i) for i in range(10)] +\ - ['CIFAR10_' + str(i) for i in range(10)] +\ - ['SVHN_' + str(i) for i in range(10)]: - data = np.load(os.path.join('datasets', 'NLPCV', self.dataset + '.npz')) - X = data['X'] - y = data['y'] - - else: - raise NotImplementedError - - # to array - X = np.array(X) - y = np.array(y) + # load dataset + data = np.load(os.path.join('datasets', self.dataset), allow_pickle=True) + X = data['X'] + y = data['y'] # number of labeled anomalies in the original data if type(la) == float: diff --git a/myutils.py b/myutils.py index 2121f7d..f4c4600 100644 --- a/myutils.py +++ b/myutils.py @@ -40,18 +40,19 @@ def set_seed(self, seed): torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False - def get_device(self): - # if torch.cuda.is_available(): - # n_gpu = torch.cuda.device_count() - # print(f'number of gpu: {n_gpu}') - # print(f'cuda name: {torch.cuda.get_device_name(0)}') - # print('GPU is on') - # else: - # print('GPU is off') - # - # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - device = torch.device("cpu") + def get_device(self, gpu_specific=False): + if gpu_specific: + if torch.cuda.is_available(): + n_gpu = torch.cuda.device_count() + print(f'number of gpu: {n_gpu}') + print(f'cuda name: {torch.cuda.get_device_name(0)}') + print('GPU is on') + else: + print('GPU is off') + + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + else: + device = torch.device("cpu") return device # generate unique value diff --git a/run.py b/run.py index 9329edb..e023656 100644 --- a/run.py +++ b/run.py @@ -14,6 +14,7 @@ # unsupervised models from baseline.PyOD import PYOD from baseline.DAGMM.run import DAGMM + # semi-supervised models from baseline.GANomaly.run import GANomaly from baseline.DeepSAD.src.run import DeepSAD @@ -21,12 +22,13 @@ from baseline.DevNet.run import DevNet from baseline.PReNet.run import PReNet from baseline.FEAWAD.run import FEAWAD -# fully-supervised models + +# # fully-supervised models from baseline.Supervised import supervised from baseline.FTTransformer.run import FTTransformer class RunPipeline(): - def __init__(self, suffix:str=None, mode:str='rla', parallel:str=None, NLP_CV=False, + def __init__(self, suffix:str=None, mode:str='rla', parallel:str=None, generate_duplicates=True, n_samples_threshold=1000, realistic_synthetic_mode:str=None, noise_type=None): @@ -34,7 +36,6 @@ def __init__(self, suffix:str=None, mode:str='rla', parallel:str=None, NLP_CV=Fa :param suffix: saved file suffix (including the model performance result and model weights) :param mode: rla or nla —— ratio of labeled anomalies or number of labeled anomalies :param parallel: unsupervise, semi-supervise or supervise, choosing to parallelly run the code - :param NLP_CV: whether to test on the NLP and CV datasets, which are transformed by the pretrained Bert and ResNet18 model :param generate_duplicates: whether to generate duplicated samples when sample size is too small :param n_samples_threshold: threshold for generating the above duplicates, if generate_duplicates is False, then datasets with sample size smaller than n_samples_threshold will be dropped :param realistic_synthetic_mode: local, global, dependency or cluster —— whether to generate the realistic synthetic anomalies to test different algorithms @@ -55,17 +56,8 @@ def __init__(self, suffix:str=None, mode:str='rla', parallel:str=None, NLP_CV=Fa self.noise_type = noise_type # the suffix of all saved files - if NLP_CV: - self.suffix = suffix + '_NLP_CV_' + str(realistic_synthetic_mode) + '_' + str(noise_type) + '_' + self.parallel - else: - self.suffix = suffix + '_Tabular_' + str(realistic_synthetic_mode) + '_' + str(noise_type) + '_' + self.parallel - - # whether to test on the NLP and CV datasets - self.NLP_CV = NLP_CV - - if self.NLP_CV: - assert self.realistic_synthetic_mode is None - assert self.noise_type is None + self.suffix = suffix + '_' + 'type(' + str(realistic_synthetic_mode) + ')_' + 'noise(' + str(noise_type) + ')_'\ + + self.parallel # data generator instantiation self.data_generator = DataGenerator(generate_duplicates=self.generate_duplicates, @@ -110,9 +102,9 @@ def __init__(self, suffix:str=None, mode:str='rla', parallel:str=None, NLP_CV=Fa # DAGMM self.model_dict['DAGMM'] = DAGMM - # # DeepSVDD (if necessary, the DeepSVDD is only for tensorflow 2.0+) - # for _ in ['DeepSVDD']: - # self.model_dict[_] = PYOD + # DeepSVDD (if necessary, the DeepSVDD is only for tensorflow 2.0+) + for _ in ['DeepSVDD']: + self.model_dict[_] = PYOD # semi-supervised algorithms elif self.parallel == 'semi-supervise': @@ -129,6 +121,7 @@ def __init__(self, suffix:str=None, mode:str='rla', parallel:str=None, NLP_CV=Fa # from sklearn for _ in ['LR', 'NB', 'SVM', 'MLP', 'RF', 'LGB', 'XGB', 'CatB']: self.model_dict[_] = supervised + # ResNet and FTTransformer for tabular data for _ in ['ResNet', 'FTTransformer']: self.model_dict[_] = FTTransformer else: @@ -142,8 +135,7 @@ def __init__(self, suffix:str=None, mode:str='rla', parallel:str=None, NLP_CV=Fa # dataset filter for delelting those datasets that do not satisfy the experimental requirement def dataset_filter(self): # dataset list in the current folder - dataset_list_org = [os.path.splitext(_)[0] for _ in os.listdir(os.path.join(os.getcwd(), 'datasets')) - if os.path.splitext(_)[1] != ''] + dataset_list_org = os.listdir('datasets') dataset_list = [] dataset_size = [] @@ -158,9 +150,6 @@ def dataset_filter(self): if not self.generate_duplicates and len(data['y_train']) + len(data['y_test']) < self.n_samples_threshold: add = False - # elif len(data['y_train']) + len(data['y_test']) > 50000: - # add = False - else: if self.mode == 'nla' and sum(data['y_train']) >= self.nla_list[-1]: pass @@ -171,6 +160,11 @@ def dataset_filter(self): else: add = False + # remove high-dimensional CV and NLP datasets if generating synthetic anomalies or robustness test + if self.realistic_synthetic_mode is not None or self.noise_type is not None: + if any([_ in dataset for _ in ['CIFAR10, FashionMNIST', 'SVHN', 'agnews', 'amazon', 'imdb', 'yelp']]): + add = False + if add: dataset_list.append(dataset) dataset_size.append(len(data['y_train']) + len(data['y_test'])) @@ -196,20 +190,17 @@ def model_fit(self): pass try: - # model fitting, currently most of models are implemented to output the anomaly score - if self.model_name not in ['DeepSAD', 'ResNet', 'FTTransformer']: - # fitting - self.clf = self.clf.fit(X_train=self.data['X_train'], y_train=self.data['y_train'], - ratio=sum(self.data['y_test']) / len(self.data['y_test'])) - # predicting score - if self.model_name == 'DAGMM': - score_test = self.clf.predict_score(self.data['X_train'], self.data['X_test']) - else: - score_test = self.clf.predict_score(self.data['X_test']) - # performance - result = self.utils.metric(y_true=self.data['y_test'], y_score=score_test, pos_label=1) + # fitting + self.clf = self.clf.fit(X_train=self.data['X_train'], y_train=self.data['y_train'], + ratio=sum(self.data['y_test']) / len(self.data['y_test'])) + # predicting score + if self.model_name == 'DAGMM': + score_test = self.clf.predict_score(self.data['X_train'], self.data['X_test']) else: - result = self.clf.fit2test(self.data) + score_test = self.clf.predict_score(self.data['X_test']) + + # performance + result = self.utils.metric(y_true=self.data['y_test'], y_score=score_test, pos_label=1) K.clear_session() print(f"Model: {self.model_name}, AUC-ROC: {result['aucroc']}, AUC-PR: {result['aucpr']}") @@ -227,11 +218,7 @@ def model_fit(self): # run the experiment def run(self): # filteting dataset that do not meet the experimental requirements - if self.NLP_CV: - dataset_list = [os.path.splitext(_)[0] for _ in os.listdir(os.path.join(os.getcwd(), 'datasets_NLP_CV')) - if os.path.splitext(_)[-1] == '.npz'] - else: - dataset_list = self.dataset_filter() + dataset_list = self.dataset_filter() # experimental parameters if self.mode == 'nla': @@ -262,9 +249,6 @@ def run(self): if self.parallel == 'unsupervise' and la != 0.0 and self.noise_type is None: continue - if self.NLP_CV and any([_ in dataset for _ in ['agnews', 'FashionMNIST', 'CIFAR10', 'SVHN']]) and self.seed > 1: - continue - print(f'Current experiment parameters: {params}') # generate data @@ -315,6 +299,5 @@ def run(self): df_time.to_csv(os.path.join(os.getcwd(), 'result', 'Time_' + self.suffix + '.csv'), index=True) # run the above pipeline for reproducing the results in the paper -pipeline = RunPipeline(suffix='DB_test', parallel='unsupervise', NLP_CV=False, - realistic_synthetic_mode=None, noise_type=None) +pipeline = RunPipeline(suffix='ADBench_test', parallel='semi-supervise', realistic_synthetic_mode=None, noise_type=None) pipeline.run() \ No newline at end of file