From 992c53c19aae8dbe6d3f9ff08f77824cf688a79f Mon Sep 17 00:00:00 2001 From: hackeT <40039738+Tatsuya-hasegawa@users.noreply.github.com> Date: Sat, 23 Nov 2024 18:44:58 +0900 Subject: [PATCH] Add multi dimension plots to analysis.outliers module. (#805) * add multi dimension subplots and chosen 2 features scatter from many features * add multi dimension subplots and chosen 2 features scatter from many features * add multi dimension subplots and chosen 2 features scatter from many features --------- Co-authored-by: Ian Hellen --- .pylintrc | 2 + msticpy/analysis/outliers.py | 179 ++++++++++++++++++++++++++--------- 2 files changed, 134 insertions(+), 47 deletions(-) diff --git a/.pylintrc b/.pylintrc index ffd2524c..94f08182 100644 --- a/.pylintrc +++ b/.pylintrc @@ -69,6 +69,8 @@ disable=raw-checker-failed, deprecated-pragma, use-symbolic-message-instead, too-many-positional-arguments, + too-many-arguments, + too-many-statements, # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/msticpy/analysis/outliers.py b/msticpy/analysis/outliers.py index 069c5658..26fff8c4 100644 --- a/msticpy/analysis/outliers.py +++ b/msticpy/analysis/outliers.py @@ -14,7 +14,7 @@ """ import math -from typing import List, Tuple +from typing import List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -33,12 +33,15 @@ ) from imp_err __version__ = VERSION -__author__ = "Ian Hellen" +__author__ = "Ian Hellen, Tatsuya Hasegawa" # pylint: disable=invalid-name def identify_outliers( - x: np.ndarray, x_predict: np.ndarray, contamination: float = 0.05 + x: np.ndarray, + x_predict: np.ndarray, + contamination: float = 0.05, + max_features: Optional[Union[int, float]] = None, ) -> Tuple[IsolationForest, np.ndarray, np.ndarray]: """ Identify outlier items using SkLearn IsolationForest. @@ -51,6 +54,10 @@ def identify_outliers( Model contamination : float Percentage contamination (default: {0.05}) + max_features : int or float, optional + Specifies max num or max rate of features + to be randomly selected when building each tree. + default: None => {math.floor(math.sqrt(cols))} Returns ------- @@ -64,8 +71,10 @@ def identify_outliers( # fit the model rows, cols = x.shape - max_samples = min(100, cols) - max_features = math.floor(math.sqrt(rows)) + max_samples = min(100, rows) + if not max_features: + max_features = math.floor(math.sqrt(cols)) + clf = IsolationForest( max_samples=max_samples, max_features=max_features, @@ -109,52 +118,128 @@ def plot_outlier_results( list of feature columns to display plt_title : str Plot title - """ - # plot the line, the samples, and the nearest vectors to the plane - x_max_x = x[:, 0].max() + (x[:, 0].max() / 10) - x_min_x = -x[:, 0].max() / 10 - x_max_y = x[:, 1].max() + (x[:, 1].max() / 10) - x_min_y = -x[:, 1].max() / 10 - xx, yy = np.meshgrid( - np.linspace(x_min_x, x_max_x, 100), np.linspace(x_min_y, x_max_y, 100) - ) - z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) - z = z.reshape(xx.shape) + if len(feature_columns) == 2: + # two dimension plot: mostly remain original codes from msticpy 2.14.0 + # plot the line, the samples, and the nearest vectors to the plane + x_max_x = x[:, 0].max() + (x[:, 0].max() / 10) + x_min_x = -x[:, 0].max() / 10 + x_max_y = x[:, 1].max() + (x[:, 1].max() / 10) + x_min_y = -x[:, 1].max() / 10 + xx, yy = np.meshgrid( + np.linspace(x_min_x, x_max_x, 100), np.linspace(x_min_y, x_max_y, 100) + ) + z = clf.decision_function( + np.c_[ + xx.ravel(), + yy.ravel(), + np.zeros( + (xx.ravel().shape[0], clf.n_features_in_ - len(feature_columns)) + ), + ] + ) + z = z.reshape(xx.shape) - plt.rcParams["figure.figsize"] = (20, 10) + plt.rcParams["figure.figsize"] = (20, 10) - plt.title(plt_title) - # pylint: disable=no-member - plt.contourf(xx, yy, z, cmap=plt.cm.Blues_r) # type: ignore + plt.title(plt_title) + # pylint: disable=no-member + plt.contourf(xx, yy, z, cmap=plt.cm.Blues_r) # type: ignore - b1 = plt.scatter(x[:, 0], x[:, 1], c="white", s=20, edgecolor="k") - b2 = plt.scatter(x_predict[:, 0], x_predict[:, 1], c="green", s=40, edgecolor="k") - c = plt.scatter( - x_outliers[:, 0], x_outliers[:, 1], c="red", marker="x", s=200, edgecolor="k" - ) - plt.axis("tight") - - xp_max_x = x_predict[:, 0].max() + (x_predict[:, 0].max() / 10) - xp_min_x = -x_predict[:, 0].max() / 10 - xp_max_y = x_predict[:, 1].max() + (x_predict[:, 1].max() / 10) - xp_min_y = -x_predict[:, 1].max() / 10 - - plt.xlim((xp_min_x, xp_max_x)) - plt.ylim((xp_min_y, xp_max_y)) - plt.xlabel(feature_columns[0]) # type: ignore - plt.ylabel(feature_columns[1]) # type: ignore - - plt.legend( - [b1, b2, c], - [ - "training observations", - "new regular observations", - "new abnormal observations", - ], - loc="upper right", - ) - plt.show() + b1 = plt.scatter(x[:, 0], x[:, 1], c="white", s=20, edgecolor="k") + b2 = plt.scatter( + x_predict[:, 0], x_predict[:, 1], c="green", s=40, edgecolor="k" + ) + c = plt.scatter(x_outliers[:, 0], x_outliers[:, 1], c="red", marker="x", s=200) + plt.axis("tight") + + xp_max_x = x_predict[:, 0].max() + (x_predict[:, 0].max() / 10) + xp_min_x = -x_predict[:, 0].max() / 10 + xp_max_y = x_predict[:, 1].max() + (x_predict[:, 1].max() / 10) + xp_min_y = -x_predict[:, 1].max() / 10 + + plt.xlim((xp_min_x, xp_max_x)) + plt.ylim((xp_min_y, xp_max_y)) + plt.xlabel(feature_columns[0]) # type: ignore + plt.ylabel(feature_columns[1]) # type: ignore + + plt.legend( + [b1, b2, c], + [ + "training observations", + "new regular observations", + "new abnormal observations", + ], + loc="upper right", + ) + plt.show() + + elif len(feature_columns) > 2: # multi dimension subplots + dimension_num = x.shape[1] + fig, axes = plt.subplots( + dimension_num, dimension_num, figsize=(20, 20), constrained_layout=True + ) + for i in range(dimension_num): + for j in range(dimension_num): + if i != j: + # plot the line, the samples, and the nearest vectors to the plane + x_max_x = x[:, j].max() + (x[:, j].max() / 10) + x_min_x = -x[:, j].max() / 10 + x_max_y = x[:, i].max() + (x[:, i].max() / 10) + x_min_y = -x[:, i].max() / 10 + xx, yy = np.meshgrid( + np.linspace(x_min_x, x_max_x, 100), + np.linspace(x_min_y, x_max_y, 100), + ) + z = clf.decision_function( + np.c_[ + xx.ravel(), + yy.ravel(), + np.zeros((xx.ravel().shape[0], len(feature_columns) - 2)), + ] + ) + z = z.reshape(xx.shape) + + # pylint: disable=no-member + axes[i, j].contourf(xx, yy, z, cmap=plt.cm.Blues_r) # type: ignore + + b1 = axes[i, j].scatter(x[:, j], x[:, i], c="white", edgecolor="k") + b2 = axes[i, j].scatter( + x_predict[:, j], x_predict[:, i], c="green", edgecolor="k" + ) + c = axes[i, j].scatter( + x_outliers[:, j], x_outliers[:, i], c="red", marker="x" + ) + + xp_max_x = x_predict[:, 0].max() + (x_predict[:, 0].max() / 10) + xp_min_x = -x_predict[:, 0].max() / 10 + xp_max_y = x_predict[:, 1].max() + (x_predict[:, 1].max() / 10) + xp_min_y = -x_predict[:, 1].max() / 10 + + axes[i, j].axis(xmin=xp_min_x, xmax=xp_max_x) + axes[i, j].axis(ymin=xp_min_y, ymax=xp_max_y) + axes[i, j].set_xlabel(f"{feature_columns[j]}") + axes[i, j].set_ylabel(f"{feature_columns[i]}") + + else: + # do not show the same features x,y each other. + axes[i, j].axis("off") + + fig.suptitle(plt_title) + plt.legend( + [b1, b2, c], + [ + "training observations", + "new regular observations", + "new abnormal observations", + ], + facecolor="#0072BD", + framealpha=0.3, + ) + plt.show() + + else: + raise ValueError("plot_outlier_results function needs more than two features.") def remove_common_items(data: pd.DataFrame, columns: List[str]) -> pd.DataFrame: