From 909b84f6fa0e9938d40c2fe3e9ae08f81a996d94 Mon Sep 17 00:00:00 2001 From: yzhao062 Date: Tue, 24 Oct 2023 22:35:51 -0700 Subject: [PATCH] add DIF (#506) --- CHANGES.txt | 3 +- README.rst | 7 +- docs/index.rst | 3 +- docs/pyod.models.rst | 11 + examples/dif_example.py | 65 ++-- pyod/models/dif.py | 816 ++++++++++++++++++++-------------------- pyod/test/test_dif.py | 208 +++++----- pyod/version.py | 2 +- 8 files changed, 566 insertions(+), 549 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 6e98df804..83eed4930 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -178,4 +178,5 @@ v<1.0.8>, <03/08/2023> -- Add QMCD detector (#452). v<1.0.8>, <03/08/2023> -- Optimized ECDF and drop Statsmodels dependency (#467). v<1.0.9>, <03/19/2023> -- Hot fix for errors in ECOD and COPOD due to the issue of scipy. v<1.1.0>, <06/19/2023> -- Further integration of PyThresh. -v<1.1.1>, <07/03/2023> -- Bump up sklearn requirement and some hot fixes. \ No newline at end of file +v<1.1.1>, <07/03/2023> -- Bump up sklearn requirement and some hot fixes. +v<1.1.1>, <10/24/2023> -- Add deep isolation forest (#506) \ No newline at end of file diff --git a/README.rst b/README.rst index 39ad4487b..1a01529a1 100644 --- a/README.rst +++ b/README.rst @@ -58,7 +58,7 @@ Python Outlier Detection (PyOD) ----- -**News**: We just released a 45-page, the most comprehensive `anomaly detection benchmark paper `_. +**News**: We have a 45-page, the most comprehensive `anomaly detection benchmark paper `_. The fully `open-sourced ADBench `_ compares 30 anomaly detection algorithms on 57 benchmark datasets. **For time-series outlier detection**, please use `TODS `_. @@ -70,7 +70,7 @@ multivariate data. This exciting yet challenging field is commonly referred as or `Anomaly Detection `_. PyOD includes more than 40 detection algorithms, from classical LOF (SIGMOD 2000) to -the latest ECOD (TKDE 2022). Since 2017, PyOD has been successfully used in numerous academic researches and +the latest ECOD and DIF (TKDE 2022 and 2023). Since 2017, PyOD has been successfully used in numerous academic researches and commercial products with more than `10 million downloads `_. It is also well acknowledged by the machine learning community with various dedicated posts/tutorials, including `Analytics Vidhya `_, @@ -199,9 +199,10 @@ Alternatively, you could clone and run setup.py file: * numpy>=1.19 * numba>=0.51 * scipy>=1.5.1 -* scikit_learn>=0.20.0 +* scikit_learn>=0.22.0 * six + **Optional Dependencies (see details below)**\ : * combo (optional, required for models/combination.py and FeatureBagging) diff --git a/docs/index.rst b/docs/index.rst index a59079402..9519607e6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -76,7 +76,7 @@ multivariate data. This exciting yet challenging field is commonly referred as or `Anomaly Detection `_. PyOD includes more than 40 detection algorithms, from classical LOF (SIGMOD 2000) to -the latest ECOD (TKDE 2022). Since 2017, PyOD :cite:`a-zhao2019pyod` has been successfully used in numerous +the latest ECOD and DIF (TKDE 2022 and 2023). Since 2017, PyOD :cite:`a-zhao2019pyod` has been successfully used in numerous academic researches and commercial products with more than `10 million downloads `_. It is also well acknowledged by the machine learning community with various dedicated posts/tutorials, including `Analytics Vidhya `_, @@ -209,6 +209,7 @@ Proximity-Based SOD Subspace Outlier Detection Proximity-Based ROD Rotation-based Outlier Detection 2020 :class:`pyod.models.rod.ROD` :cite:`a-almardeny2020novel` Outlier Ensembles IForest Isolation Forest 2008 :class:`pyod.models.iforest.IForest` :cite:`a-liu2008isolation,a-liu2012isolation` Outlier Ensembles INNE Isolation-based Anomaly Detection Using Nearest-Neighbor Ensembles 2018 :class:`pyod.models.inne.INNE` :cite:`a-bandaragoda2018isolation` +Outlier Ensembles DIF Deep Isolation Forest for Anomaly Detection 2023 :class:`pyod.models.dif.DIF` :cite:`a-Xu2023Deep` Outlier Ensembles FB Feature Bagging 2005 :class:`pyod.models.feature_bagging.FeatureBagging` :cite:`a-lazarevic2005feature` Outlier Ensembles LSCP LSCP: Locally Selective Combination of Parallel Outlier Ensembles 2019 :class:`pyod.models.lscp.LSCP` :cite:`a-zhao2019lscp` Outlier Ensembles XGBOD Extreme Boosting Based Outlier Detection **(Supervised)** 2018 :class:`pyod.models.xgbod.XGBOD` :cite:`a-zhao2018xgbod` diff --git a/docs/pyod.models.rst b/docs/pyod.models.rst index fe40d0d8a..76792a8a1 100644 --- a/docs/pyod.models.rst +++ b/docs/pyod.models.rst @@ -105,6 +105,17 @@ pyod.models.deep\_svdd module :show-inheritance: :inherited-members: +pyod.models.dif module +----------------------------- + +.. automodule:: pyod.models.dif + :members: + :exclude-members: + :undoc-members: + :show-inheritance: + :inherited-members: + + pyod.models.ecod module ------------------------ diff --git a/examples/dif_example.py b/examples/dif_example.py index 8d64ed6b1..55fffa9ce 100644 --- a/examples/dif_example.py +++ b/examples/dif_example.py @@ -13,42 +13,41 @@ # temporary solution for relative imports in case pyod is not installed # if pyod is installed, no need to use the following line sys.path.append( - os.path.abspath(os.path.join(os.path.dirname("__file__"), '..'))) + os.path.abspath(os.path.join(os.path.dirname("__file__"), '..'))) from pyod.models.dif import DIF from pyod.utils.data import generate_data from pyod.utils.data import evaluate_print - if __name__ == "__main__": - contamination = 0.1 # percentage of outliers - n_train = 20000 # number of training points - n_test = 2000 # number of testing points - n_features = 300 # number of features - - # Generate sample data - X_train, X_test, y_train, y_test = \ - generate_data(n_train=n_train, - n_test=n_test, - n_features=n_features, - contamination=contamination, - random_state=42) - - # train AutoEncoder detector - clf_name = 'DIF' - clf = DIF() - clf.fit(X_train) - - # get the prediction labels and outlier scores of the training data - y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) - y_train_scores = clf.decision_scores_ # raw outlier scores - - # get the prediction on the test data - y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) - y_test_scores = clf.decision_function(X_test) # outlier scores - - # evaluate and print the results - print("\nOn Training Data:") - evaluate_print(clf_name, y_train, y_train_scores) - print("\nOn Test Data:") - evaluate_print(clf_name, y_test, y_test_scores) + contamination = 0.1 # percentage of outliers + n_train = 1000 # number of training points + n_test = 200 # number of testing points + n_features = 30 # number of features + + # Generate sample data + X_train, X_test, y_train, y_test = \ + generate_data(n_train=n_train, + n_test=n_test, + n_features=n_features, + contamination=contamination, + random_state=42) + + # train deep isolation forest detector + clf_name = 'DIF' + clf = DIF() + clf.fit(X_train) + + # get the prediction labels and outlier scores of the training data + y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) + y_train_scores = clf.decision_scores_ # raw outlier scores + + # get the prediction on the test data + y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) + y_test_scores = clf.decision_function(X_test) # outlier scores + + # evaluate and print the results + print("\nOn Training Data:") + evaluate_print(clf_name, y_train, y_train_scores) + print("\nOn Test Data:") + evaluate_print(clf_name, y_test, y_test_scores) diff --git a/pyod/models/dif.py b/pyod/models/dif.py index 1128f18bf..4e6025941 100644 --- a/pyod/models/dif.py +++ b/pyod/models/dif.py @@ -16,433 +16,437 @@ from torch.utils.data import DataLoader from .base import BaseDetector -from ..utils.utility import check_parameter from ..utils.torch_utility import get_activation_by_name class DIF(BaseDetector): - """Deep Isolation Forest (DIF) is an extension of iForest. It uses deep - representation ensemble to achieve non-linear isolation on original data - space. See :cite:`xu2023dif` for details. - - Parameters - ---------- - batch_size : int, optional (default=1000) - Number of samples per gradient update. - - representation_dim, int, optional (default=20) - Dimensionality of the representation space. - - hidden_neurons, list, optional (default=[64, 32]) - The number of neurons per hidden layers. So the network has the - structure as [n_features, hidden_neurons[0], hidden_neurons[1], - ..., representation_dim] - - hidden_activation, str, optional (default='tanh') - Activation function to use for hidden layers. - All hidden layers are forced to use the same type of activation. - See https://pytorch.org/docs/stable/nn.html for details. - Currently only - 'relu': nn.ReLU() - 'sigmoid': nn.Sigmoid() - 'tanh': nn.Tanh() - are supported. See pyod/utils/torch_utility.py for details. - - skip_connection, boolean, optional (default=False) - If True, apply skip-connection in the neural network structure. - - n_ensemble, int, optional (default=50) - The number of deep representation ensemble members. - - n_estimators, int, optional (default=6) - The number of isolation forest of each representation. - - max_samples, int, optional (default=256) - The number of samples to draw from X to train each base isolation tree. - - contamination : float in (0., 0.5), optional (default=0.1) - The amount of contamination of the data set, - i.e. the proportion of outliers in the data set. Used when fitting to - define the threshold on the decision function. - - random_state : int or None, optional (default=None) - If int, random_state is the seed used by the random - number generator; - If None, the random number generator is the - RandomState instance used by `np.random`. - - device, 'cuda', 'cpu', or None, optional (default=None) - if 'cuda', use GPU acceleration in torch - if 'cpu', use cpu in torch - if None, automatically determine whether GPU is available - - - Attributes - ---------- - net_lst : list of torch.Module - The list of representation neural networks. - - iForest_lst : list of iForest - The list of instantiated iForest model. - - x_reduced_lst: list of numpy array - The list of training data representations - - decision_scores_ : numpy array of shape (n_samples,) - The outlier scores of the training data. - The higher, the more abnormal. Outliers tend to have higher - scores. This value is available once the detector is fitted. - - threshold_ : float - The threshold is based on ``contamination``. It is the - ``n_samples * contamination`` most abnormal samples in - ``decision_scores_``. The threshold is calculated for generating - binary outlier labels. - - labels_ : int, either 0 or 1 - The binary labels of the training data. 0 stands for inliers - and 1 for outliers/anomalies. It is generated by applying - ``threshold_`` on ``decision_scores_``. - """ - - def __init__(self, - batch_size=1000, - representation_dim=20, - hidden_neurons=None, - hidden_activation='tanh', - skip_connection=False, - n_ensemble=50, - n_estimators=6, - max_samples=256, - contamination=0.1, - random_state=None, - device=None): - super(DIF, self).__init__(contamination=contamination) - self.batch_size = batch_size - self.representation_dim = representation_dim - self.hidden_activation = hidden_activation - self.skip_connection = skip_connection - self.hidden_neurons = hidden_neurons - - self.n_ensemble = n_ensemble - self.n_estimators = n_estimators - self.max_samples = max_samples - - self.random_state = random_state - self.device = device - - self.minmax_scaler = None - - # create default calculation device (support GPU if available) - if self.device is None: - self.device = torch.device( - "cuda:0" if torch.cuda.is_available() else "cpu") - - # set random seed - if self.random_state is not None: - torch.manual_seed(self.random_state) - torch.cuda.manual_seed(self.random_state) - torch.cuda.manual_seed_all(self.random_state) - np.random.seed(self.random_state) - - # default values for the amount of hidden neurons - if self.hidden_neurons is None: - self.hidden_neurons = [500, 100] - - def fit(self, X, y=None): - """Fit detector. y is ignored in unsupervised methods. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - y : Ignored - Not used, present for API consistency by convention. - - Returns - ------- - self : object - Fitted estimator. - """ - # validate inputs X and y (optional) - X = check_array(X) - self._set_n_classes(y) - - n_samples, n_features = X.shape[0], X.shape[1] - - # conduct min-max normalization before feeding into neural networks - self.minmax_scaler = MinMaxScaler() - self.minmax_scaler.fit(X) - X = self.minmax_scaler.transform(X) - - # prepare neural network parameters - network_params = { - 'n_features': n_features, - 'n_hidden': self.hidden_neurons, - 'n_output': self.representation_dim, - 'activation': self.hidden_activation, - 'skip_connection': self.skip_connection - } - - # iteration - self.net_lst = [] - self.iForest_lst = [] - self.x_reduced_lst = [] - ensemble_seeds = np.random.randint(0, 100000, self.n_ensemble) - for i in range(self.n_ensemble): - # instantiate network class and seed random seed - net = MLPnet(**network_params).to(self.device) - torch.manual_seed(ensemble_seeds[i]) - - # initialize network parameters - for name, param in net.named_parameters(): - if name.endswith('weight'): - torch.nn.init.normal_(param, mean=0., std=1.) - - x_reduced = self._deep_representation(net, X) - - # save network and representations - self.x_reduced_lst.append(x_reduced) - self.net_lst.append(net) - - # perform iForest upon representations - self.iForest_lst.append( - IsolationForest(n_estimators=self.n_estimators, - max_samples=self.max_samples, - random_state=ensemble_seeds[i]) - ) - self.iForest_lst[i].fit(x_reduced) - - self.decision_scores_ = self.decision_function(X) - self._process_decision_scores() - return self - - def decision_function(self, X): - """Predict raw anomaly score of X using the fitted detector. - - The anomaly score of an input sample is computed based on different - detector algorithms. For consistency, outliers are assigned with - larger anomaly scores. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The training input samples. Sparse matrices are accepted only - if they are supported by the base estimator. - - Returns - ------- - anomaly_scores : numpy array of shape (n_samples,) - The anomaly score of the input samples. - """ - check_is_fitted(self, ['net_lst', 'iForest_lst', 'x_reduced_lst']) - X = check_array(X) - - # conduct min-max normalization before feeding into neural networks - X = self.minmax_scaler.transform(X) - - testing_n_samples = X.shape[0] - score_lst = np.zeros([self.n_ensemble, testing_n_samples]) - - # iteration - for i in range(self.n_ensemble): - # transform testing data to representation - x_reduced = self._deep_representation(self.net_lst[i], X) - - # calculate outlier scores - scores = _cal_score(x_reduced, self.iForest_lst[i]) - score_lst[i] = scores - - final_scores = np.average(score_lst, axis=0) - return final_scores - - def _deep_representation(self, net, X): - x_reduced = [] - - with torch.no_grad(): - loader = DataLoader(X, batch_size=self.batch_size, - drop_last=False, pin_memory=True, - shuffle=False) - for batch_x in loader: - batch_x = batch_x.float().to(self.device) - batch_x_reduced = net(batch_x) - x_reduced.append(batch_x_reduced) - - x_reduced = torch.cat(x_reduced).data.cpu().numpy() - x_reduced = StandardScaler().fit_transform(x_reduced) - x_reduced = np.tanh(x_reduced) - return x_reduced + """Deep Isolation Forest (DIF) is an extension of iForest. It uses deep + representation ensemble to achieve non-linear isolation on original data + space. See :cite:`xu2023dif` for details. + + Parameters + ---------- + batch_size : int, optional (default=1000) + Number of samples per gradient update. + + representation_dim, int, optional (default=20) + Dimensionality of the representation space. + + hidden_neurons, list, optional (default=[64, 32]) + The number of neurons per hidden layers. So the network has the + structure as [n_features, hidden_neurons[0], hidden_neurons[1], + ..., representation_dim] + + hidden_activation, str, optional (default='tanh') + Activation function to use for hidden layers. + All hidden layers are forced to use the same type of activation. + See https://pytorch.org/docs/stable/nn.html for details. + Currently only + 'relu': nn.ReLU() + 'sigmoid': nn.Sigmoid() + 'tanh': nn.Tanh() + are supported. See pyod/utils/torch_utility.py for details. + + skip_connection, boolean, optional (default=False) + If True, apply skip-connection in the neural network structure. + + n_ensemble, int, optional (default=50) + The number of deep representation ensemble members. + + n_estimators, int, optional (default=6) + The number of isolation forest of each representation. + + max_samples, int, optional (default=256) + The number of samples to draw from X to train each base isolation tree. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + random_state : int or None, optional (default=None) + If int, random_state is the seed used by the random + number generator; + If None, the random number generator is the + RandomState instance used by `np.random`. + + device, 'cuda', 'cpu', or None, optional (default=None) + if 'cuda', use GPU acceleration in torch + if 'cpu', use cpu in torch + if None, automatically determine whether GPU is available + + + Attributes + ---------- + net_lst : list of torch.Module + The list of representation neural networks. + + iForest_lst : list of iForest + The list of instantiated iForest model. + + x_reduced_lst: list of numpy array + The list of training data representations + + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + def __init__(self, + batch_size=1000, + representation_dim=20, + hidden_neurons=None, + hidden_activation='tanh', + skip_connection=False, + n_ensemble=50, + n_estimators=6, + max_samples=256, + contamination=0.1, + random_state=None, + device=None): + super(DIF, self).__init__(contamination=contamination) + self.batch_size = batch_size + self.representation_dim = representation_dim + self.hidden_activation = hidden_activation + self.skip_connection = skip_connection + self.hidden_neurons = hidden_neurons + + self.n_ensemble = n_ensemble + self.n_estimators = n_estimators + self.max_samples = max_samples + + self.random_state = random_state + self.device = device + + self.minmax_scaler = None + + # create default calculation device (support GPU if available) + if self.device is None: + self.device = torch.device( + "cuda:0" if torch.cuda.is_available() else "cpu") + + # set random seed + if self.random_state is not None: + torch.manual_seed(self.random_state) + torch.cuda.manual_seed(self.random_state) + torch.cuda.manual_seed_all(self.random_state) + np.random.seed(self.random_state) + + # default values for the amount of hidden neurons + if self.hidden_neurons is None: + self.hidden_neurons = [500, 100] + + def fit(self, X, y=None): + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + # validate inputs X and y (optional) + X = check_array(X) + self._set_n_classes(y) + + n_samples, n_features = X.shape[0], X.shape[1] + + # conduct min-max normalization before feeding into neural networks + self.minmax_scaler = MinMaxScaler() + self.minmax_scaler.fit(X) + X = self.minmax_scaler.transform(X) + + # prepare neural network parameters + network_params = { + 'n_features': n_features, + 'n_hidden': self.hidden_neurons, + 'n_output': self.representation_dim, + 'activation': self.hidden_activation, + 'skip_connection': self.skip_connection + } + + # iteration + self.net_lst = [] + self.iForest_lst = [] + self.x_reduced_lst = [] + ensemble_seeds = np.random.randint(0, 100000, self.n_ensemble) + for i in range(self.n_ensemble): + # instantiate network class and seed random seed + net = MLPnet(**network_params).to(self.device) + torch.manual_seed(ensemble_seeds[i]) + + # initialize network parameters + for name, param in net.named_parameters(): + if name.endswith('weight'): + torch.nn.init.normal_(param, mean=0., std=1.) + + x_reduced = self._deep_representation(net, X) + + # save network and representations + self.x_reduced_lst.append(x_reduced) + self.net_lst.append(net) + + # perform iForest upon representations + self.iForest_lst.append( + IsolationForest(n_estimators=self.n_estimators, + max_samples=self.max_samples, + random_state=ensemble_seeds[i]) + ) + self.iForest_lst[i].fit(x_reduced) + + self.decision_scores_ = self.decision_function(X) + self._process_decision_scores() + return self + + def decision_function(self, X): + """Predict raw anomaly score of X using the fitted detector. + + The anomaly score of an input sample is computed based on different + detector algorithms. For consistency, outliers are assigned with + larger anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. + """ + check_is_fitted(self, ['net_lst', 'iForest_lst', 'x_reduced_lst']) + X = check_array(X) + + # conduct min-max normalization before feeding into neural networks + X = self.minmax_scaler.transform(X) + + testing_n_samples = X.shape[0] + score_lst = np.zeros([self.n_ensemble, testing_n_samples]) + + # iteration + for i in range(self.n_ensemble): + # transform testing data to representation + x_reduced = self._deep_representation(self.net_lst[i], X) + + # calculate outlier scores + scores = _cal_score(x_reduced, self.iForest_lst[i]) + score_lst[i] = scores + + final_scores = np.average(score_lst, axis=0) + return final_scores + + def _deep_representation(self, net, X): + x_reduced = [] + + with torch.no_grad(): + loader = DataLoader(X, batch_size=self.batch_size, + drop_last=False, pin_memory=True, + shuffle=False) + for batch_x in loader: + batch_x = batch_x.float().to(self.device) + batch_x_reduced = net(batch_x) + x_reduced.append(batch_x_reduced) + + x_reduced = torch.cat(x_reduced).data.cpu().numpy() + x_reduced = StandardScaler().fit_transform(x_reduced) + x_reduced = np.tanh(x_reduced) + return x_reduced class MLPnet(torch.nn.Module): - def __init__(self, n_features, n_hidden=[500, 100], n_output=20, - activation='ReLU', bias=False, batch_norm=False, - skip_connection=False): - super(MLPnet, self).__init__() - self.skip_connection = skip_connection - self.n_output = n_output - - num_layers = len(n_hidden) - - if type(activation) == str: - activation = [activation] * num_layers - activation.append(None) - - assert len(activation) == len(n_hidden)+1, 'activation and n_hidden are not matched' - - self.layers = [] - for i in range(num_layers+1): - in_channels, out_channels = \ - self.get_in_out_channels(i, num_layers, n_features, - n_hidden, n_output, skip_connection) - self.layers += [ - LinearBlock(in_channels, out_channels, - bias=bias, batch_norm=batch_norm, - activation=activation[i], - skip_connection=skip_connection if i != num_layers else False) - ] - self.network = torch.nn.Sequential(*self.layers) - - - def forward(self, x): - x = self.network(x) - return x - - @staticmethod - def get_in_out_channels(i, num_layers, n_features, n_hidden, n_output, skip_connection): - if skip_connection is False: - in_channels = n_features if i == 0 else n_hidden[i-1] - out_channels = n_output if i == num_layers else n_hidden[i] - else: - in_channels = n_features if i == 0 else np.sum(n_hidden[:i])+n_features - out_channels = n_output if i == num_layers else n_hidden[i] - return in_channels, out_channels + def __init__(self, n_features, n_hidden=[500, 100], n_output=20, + activation='ReLU', bias=False, batch_norm=False, + skip_connection=False): + super(MLPnet, self).__init__() + self.skip_connection = skip_connection + self.n_output = n_output + + num_layers = len(n_hidden) + + if type(activation) == str: + activation = [activation] * num_layers + activation.append(None) + + assert len(activation) == len( + n_hidden) + 1, 'activation and n_hidden are not matched' + + self.layers = [] + for i in range(num_layers + 1): + in_channels, out_channels = \ + self.get_in_out_channels(i, num_layers, n_features, + n_hidden, n_output, skip_connection) + self.layers += [ + LinearBlock(in_channels, out_channels, + bias=bias, batch_norm=batch_norm, + activation=activation[i], + skip_connection=skip_connection if i != num_layers else False) + ] + self.network = torch.nn.Sequential(*self.layers) + + def forward(self, x): + x = self.network(x) + return x + + @staticmethod + def get_in_out_channels(i, num_layers, n_features, n_hidden, n_output, + skip_connection): + if skip_connection is False: + in_channels = n_features if i == 0 else n_hidden[i - 1] + out_channels = n_output if i == num_layers else n_hidden[i] + else: + in_channels = n_features if i == 0 else np.sum( + n_hidden[:i]) + n_features + out_channels = n_output if i == num_layers else n_hidden[i] + return in_channels, out_channels class LinearBlock(torch.nn.Module): - def __init__(self, in_channels, out_channels, - activation='Tanh', bias=False, batch_norm=False, - skip_connection=False): - super(LinearBlock, self).__init__() + def __init__(self, in_channels, out_channels, + activation='Tanh', bias=False, batch_norm=False, + skip_connection=False): + super(LinearBlock, self).__init__() - self.skip_connection = skip_connection + self.skip_connection = skip_connection - self.linear = torch.nn.Linear(in_channels, out_channels, bias=bias) + self.linear = torch.nn.Linear(in_channels, out_channels, bias=bias) - if activation is not None: - # self.act_layer = _instantiate_class("torch.nn.modules.activation", activation) - self.act_layer = get_activation_by_name(activation) - else: - self.act_layer = torch.nn.Identity() + if activation is not None: + # self.act_layer = _instantiate_class("torch.nn.modules.activation", activation) + self.act_layer = get_activation_by_name(activation) + else: + self.act_layer = torch.nn.Identity() - self.batch_norm = batch_norm - if batch_norm is True: - dim = out_channels - self.bn_layer = torch.nn.BatchNorm1d(dim, affine=bias) + self.batch_norm = batch_norm + if batch_norm is True: + dim = out_channels + self.bn_layer = torch.nn.BatchNorm1d(dim, affine=bias) - def forward(self, x): - x1 = self.linear(x) - x1 = self.act_layer(x1) + def forward(self, x): + x1 = self.linear(x) + x1 = self.act_layer(x1) - if self.batch_norm is True: - x1 = self.bn_layer(x1) + if self.batch_norm is True: + x1 = self.bn_layer(x1) - if self.skip_connection: - x1 = torch.cat([x, x1], axis=1) + if self.skip_connection: + x1 = torch.cat([x, x1], axis=1) - return x1 + return x1 def _cal_score(xx, clf): - depths = np.zeros((xx.shape[0], len(clf.estimators_))) - depth_sum = np.zeros(xx.shape[0]) - deviations = np.zeros((xx.shape[0], len(clf.estimators_))) - leaf_samples = np.zeros((xx.shape[0], len(clf.estimators_))) - - for ii, estimator_tree in enumerate(clf.estimators_): - tree = estimator_tree.tree_ - n_node = tree.node_count - - if n_node == 1: - continue - - # get feature and threshold of each node in the iTree - # in feature_lst, -2 indicates the leaf node - feature_lst, threshold_lst = tree.feature.copy(), tree.threshold.copy() - - # compute depth and score - leaves_index = estimator_tree.apply(xx) - node_indicator = estimator_tree.decision_path(xx) - - # The number of training samples in each test sample leaf - n_node_samples = estimator_tree.tree_.n_node_samples - - # node_indicator is a sparse matrix with shape (n_samples, n_nodes), - # indicating the path of input data samples - # each layer would result in a non-zero element in this matrix, - # and then the row-wise summation is the depth of data sample - n_samples_leaf = estimator_tree.tree_.n_node_samples[leaves_index] - d = (np.ravel(node_indicator.sum(axis=1)) + _average_path_length(n_samples_leaf) - 1.0) - depths[:, ii] = d - depth_sum += d - - # decision path of data matrix XX - node_indicator = np.array(node_indicator.todense()) - - # set a matrix with shape [n_sample, n_node], - # representing the feature value of each sample on each node - # set the leaf node as -2 - value_mat = np.array([xx[i][feature_lst] for i in range(xx.shape[0])]) - value_mat[:, np.where(feature_lst == -2)[0]] = -2 - th_mat = np.array([threshold_lst for _ in range(xx.shape[0])]) - - mat = np.abs(value_mat - th_mat) * node_indicator - - exist = (mat != 0) - dev = mat.sum(axis=1)/(exist.sum(axis=1)+1e-6) - deviations[:, ii] = dev - - scores = 2 ** (-depth_sum / (len(clf.estimators_) * _average_path_length([clf.max_samples_]))) - deviation = np.mean(deviations, axis=1) - leaf_sample = (clf.max_samples_ - np.mean(leaf_samples, axis=1)) / clf.max_samples_ - - scores = scores * deviation - # scores = scores * deviation * leaf_sample - return scores + depths = np.zeros((xx.shape[0], len(clf.estimators_))) + depth_sum = np.zeros(xx.shape[0]) + deviations = np.zeros((xx.shape[0], len(clf.estimators_))) + leaf_samples = np.zeros((xx.shape[0], len(clf.estimators_))) + + for ii, estimator_tree in enumerate(clf.estimators_): + tree = estimator_tree.tree_ + n_node = tree.node_count + + if n_node == 1: + continue + + # get feature and threshold of each node in the iTree + # in feature_lst, -2 indicates the leaf node + feature_lst, threshold_lst = tree.feature.copy(), tree.threshold.copy() + + # compute depth and score + leaves_index = estimator_tree.apply(xx) + node_indicator = estimator_tree.decision_path(xx) + + # The number of training samples in each test sample leaf + n_node_samples = estimator_tree.tree_.n_node_samples + + # node_indicator is a sparse matrix with shape (n_samples, n_nodes), + # indicating the path of input data samples + # each layer would result in a non-zero element in this matrix, + # and then the row-wise summation is the depth of data sample + n_samples_leaf = estimator_tree.tree_.n_node_samples[leaves_index] + d = (np.ravel(node_indicator.sum(axis=1)) + _average_path_length( + n_samples_leaf) - 1.0) + depths[:, ii] = d + depth_sum += d + + # decision path of data matrix XX + node_indicator = np.array(node_indicator.todense()) + + # set a matrix with shape [n_sample, n_node], + # representing the feature value of each sample on each node + # set the leaf node as -2 + value_mat = np.array([xx[i][feature_lst] for i in range(xx.shape[0])]) + value_mat[:, np.where(feature_lst == -2)[0]] = -2 + th_mat = np.array([threshold_lst for _ in range(xx.shape[0])]) + + mat = np.abs(value_mat - th_mat) * node_indicator + + exist = (mat != 0) + dev = mat.sum(axis=1) / (exist.sum(axis=1) + 1e-6) + deviations[:, ii] = dev + + scores = 2 ** (-depth_sum / (len(clf.estimators_) * _average_path_length( + [clf.max_samples_]))) + deviation = np.mean(deviations, axis=1) + leaf_sample = (clf.max_samples_ - np.mean(leaf_samples, + axis=1)) / clf.max_samples_ + + scores = scores * deviation + # scores = scores * deviation * leaf_sample + return scores def _average_path_length(n_samples_leaf): - """ - The average path length in a n_samples iTree, which is equal to - the average path length of an unsuccessful BST search since the - latter has the same structure as an isolation tree. - Parameters - ---------- - n_samples_leaf : array-like of shape (n_samples,) - The number of training samples in each test sample leaf, for - each estimators. - - Returns - ------- - average_path_length : ndarray of shape (n_samples,) - """ - - n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False) - - n_samples_leaf_shape = n_samples_leaf.shape - n_samples_leaf = n_samples_leaf.reshape((1, -1)) - average_path_length = np.zeros(n_samples_leaf.shape) - - mask_1 = n_samples_leaf <= 1 - mask_2 = n_samples_leaf == 2 - not_mask = ~np.logical_or(mask_1, mask_2) - - average_path_length[mask_1] = 0. - average_path_length[mask_2] = 1. - average_path_length[not_mask] = ( - 2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma) - - 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask] - ) - - return average_path_length.reshape(n_samples_leaf_shape) + """ + The average path length in a n_samples iTree, which is equal to + the average path length of an unsuccessful BST search since the + latter has the same structure as an isolation tree. + Parameters + ---------- + n_samples_leaf : array-like of shape (n_samples,) + The number of training samples in each test sample leaf, for + each estimators. + + Returns + ------- + average_path_length : ndarray of shape (n_samples,) + """ + + n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False) + + n_samples_leaf_shape = n_samples_leaf.shape + n_samples_leaf = n_samples_leaf.reshape((1, -1)) + average_path_length = np.zeros(n_samples_leaf.shape) + + mask_1 = n_samples_leaf <= 1 + mask_2 = n_samples_leaf == 2 + not_mask = ~np.logical_or(mask_1, mask_2) + + average_path_length[mask_1] = 0. + average_path_length[mask_2] = 1. + average_path_length[not_mask] = ( + 2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma) + - 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask] + ) + + return average_path_length.reshape(n_samples_leaf_shape) diff --git a/pyod/test/test_dif.py b/pyod/test/test_dif.py index 14bf52c01..d00998844 100644 --- a/pyod/test/test_dif.py +++ b/pyod/test/test_dif.py @@ -24,110 +24,110 @@ class TestDIF(unittest.TestCase): - def setUp(self): - self.n_train = 3000 - self.n_test = 1000 - self.n_features = 200 - self.contamination = 0.1 - self.roc_floor = 0.8 - self.X_train, self.X_test, self.y_train, self.y_test = generate_data( - n_train=self.n_train, n_test=self.n_test, - n_features=self.n_features, contamination=self.contamination, - random_state=42) - - self.clf = DIF(skip_connection=True, contamination=self.contamination) - self.clf.fit(self.X_train) - - self.clf2 = DIF(skip_connection=False, contamination=self.contamination) - self.clf2.fit(self.X_train) - - def test_parameters(self): - assert (hasattr(self.clf, 'decision_scores_') and - self.clf.decision_scores_ is not None) - assert (hasattr(self.clf, 'labels_') and - self.clf.labels_ is not None) - assert (hasattr(self.clf, 'threshold_') and - self.clf.threshold_ is not None) - - def test_train_scores(self): - assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0]) - assert_equal(len(self.clf2.decision_scores_), self.X_train.shape[0]) - - def test_prediction_scores(self): - pred_scores = self.clf.decision_function(self.X_test) - pred_scores2 = self.clf2.decision_function(self.X_test) - - # check score shapes - assert_equal(pred_scores.shape[0], self.X_test.shape[0]) - assert_equal(pred_scores2.shape[0], self.X_test.shape[0]) - - # check performance - assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor) - assert (roc_auc_score(self.y_test, pred_scores2) >= self.roc_floor) - - def test_prediction_labels(self): - pred_labels = self.clf.predict(self.X_test) - assert_equal(pred_labels.shape, self.y_test.shape) - - def test_prediction_proba(self): - pred_proba = self.clf.predict_proba(self.X_test) - assert (pred_proba.min() >= 0) - assert (pred_proba.max() <= 1) - - def test_prediction_proba_linear(self): - pred_proba = self.clf.predict_proba(self.X_test, method='linear') - assert (pred_proba.min() >= 0) - assert (pred_proba.max() <= 1) - - def test_prediction_proba_unify(self): - pred_proba = self.clf.predict_proba(self.X_test, method='unify') - assert (pred_proba.min() >= 0) - assert (pred_proba.max() <= 1) - - def test_prediction_proba_parameter(self): - with assert_raises(ValueError): - self.clf.predict_proba(self.X_test, method='something') - - def test_prediction_labels_confidence(self): - pred_labels, confidence = self.clf.predict(self.X_test, - return_confidence=True) - assert_equal(pred_labels.shape, self.y_test.shape) - assert_equal(confidence.shape, self.y_test.shape) - assert (confidence.min() >= 0) - assert (confidence.max() <= 1) - - def test_prediction_proba_linear_confidence(self): - pred_proba, confidence = self.clf.predict_proba(self.X_test, - method='linear', - return_confidence=True) - assert (pred_proba.min() >= 0) - assert (pred_proba.max() <= 1) - - assert_equal(confidence.shape, self.y_test.shape) - assert (confidence.min() >= 0) - assert (confidence.max() <= 1) - - def test_fit_predict(self): - pred_labels = self.clf.fit_predict(self.X_train) - assert_equal(pred_labels.shape, self.y_train.shape) - - def test_fit_predict_score(self): - self.clf.fit_predict_score(self.X_test, self.y_test) - self.clf.fit_predict_score(self.X_test, self.y_test, - scoring='roc_auc_score') - self.clf.fit_predict_score(self.X_test, self.y_test, - scoring='prc_n_score') - with assert_raises(NotImplementedError): - self.clf.fit_predict_score(self.X_test, self.y_test, - scoring='something') - - def test_model_clone(self): - pass - # clone_clf = clone(self.clf) - - def tearDown(self): - pass + def setUp(self): + self.n_train = 1000 + self.n_test = 100 + self.n_features = 20 + self.contamination = 0.1 + self.roc_floor = 0.8 + self.X_train, self.X_test, self.y_train, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + n_features=self.n_features, contamination=self.contamination, + random_state=42) + + self.clf = DIF(skip_connection=True, contamination=self.contamination) + self.clf.fit(self.X_train) + + self.clf2 = DIF(skip_connection=False, contamination=self.contamination) + self.clf2.fit(self.X_train) + + def test_parameters(self): + assert (hasattr(self.clf, 'decision_scores_') and + self.clf.decision_scores_ is not None) + assert (hasattr(self.clf, 'labels_') and + self.clf.labels_ is not None) + assert (hasattr(self.clf, 'threshold_') and + self.clf.threshold_ is not None) + + def test_train_scores(self): + assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0]) + assert_equal(len(self.clf2.decision_scores_), self.X_train.shape[0]) + + def test_prediction_scores(self): + pred_scores = self.clf.decision_function(self.X_test) + pred_scores2 = self.clf2.decision_function(self.X_test) + + # check score shapes + assert_equal(pred_scores.shape[0], self.X_test.shape[0]) + assert_equal(pred_scores2.shape[0], self.X_test.shape[0]) + + # check performance + assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor) + assert (roc_auc_score(self.y_test, pred_scores2) >= self.roc_floor) + + def test_prediction_labels(self): + pred_labels = self.clf.predict(self.X_test) + assert_equal(pred_labels.shape, self.y_test.shape) + + def test_prediction_proba(self): + pred_proba = self.clf.predict_proba(self.X_test) + assert (pred_proba.min() >= 0) + assert (pred_proba.max() <= 1) + + def test_prediction_proba_linear(self): + pred_proba = self.clf.predict_proba(self.X_test, method='linear') + assert (pred_proba.min() >= 0) + assert (pred_proba.max() <= 1) + + def test_prediction_proba_unify(self): + pred_proba = self.clf.predict_proba(self.X_test, method='unify') + assert (pred_proba.min() >= 0) + assert (pred_proba.max() <= 1) + + def test_prediction_proba_parameter(self): + with assert_raises(ValueError): + self.clf.predict_proba(self.X_test, method='something') + + def test_prediction_labels_confidence(self): + pred_labels, confidence = self.clf.predict(self.X_test, + return_confidence=True) + assert_equal(pred_labels.shape, self.y_test.shape) + assert_equal(confidence.shape, self.y_test.shape) + assert (confidence.min() >= 0) + assert (confidence.max() <= 1) + + def test_prediction_proba_linear_confidence(self): + pred_proba, confidence = self.clf.predict_proba(self.X_test, + method='linear', + return_confidence=True) + assert (pred_proba.min() >= 0) + assert (pred_proba.max() <= 1) + + assert_equal(confidence.shape, self.y_test.shape) + assert (confidence.min() >= 0) + assert (confidence.max() <= 1) + + def test_fit_predict(self): + pred_labels = self.clf.fit_predict(self.X_train) + assert_equal(pred_labels.shape, self.y_train.shape) + + def test_fit_predict_score(self): + self.clf.fit_predict_score(self.X_test, self.y_test) + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='roc_auc_score') + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='prc_n_score') + with assert_raises(NotImplementedError): + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='something') + + def test_model_clone(self): + pass + # clone_clf = clone(self.clf) + + def tearDown(self): + pass if __name__ == '__main__': - unittest.main() + unittest.main() diff --git a/pyod/version.py b/pyod/version.py index 56acab0f1..0cdbd9ac2 100644 --- a/pyod/version.py +++ b/pyod/version.py @@ -20,4 +20,4 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '1.1.0' # pragma: no cover +__version__ = '1.1.1' # pragma: no cover