diff --git a/doc/api.rst b/doc/api.rst index cdd56c6ce..3ffe85f8d 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -133,7 +133,8 @@ Metrics Functions --------- .. autosummary:: -:toctree: generated/ + :toctree: generated/ + metrics.sensitivity_specificity_support metrics.sensitivity_score metrics.specificity_score diff --git a/examples/applications/plot_multi_class_under_sampling.py b/examples/applications/plot_multi_class_under_sampling.py new file mode 100644 index 000000000..c3cad21a0 --- /dev/null +++ b/examples/applications/plot_multi_class_under_sampling.py @@ -0,0 +1,40 @@ +""" +============================================= +Multiclass classification with under-sampling +============================================= + +Some balancing methods allow for balancing datasets with multiple classes. +We provide an example to illustrate the use of those methods which do +not differ from the binary case. + +""" + +from sklearn.datasets import load_iris +from sklearn.svm import LinearSVC +from sklearn.model_selection import train_test_split + +from imblearn.under_sampling import NearMiss +from imblearn.pipeline import make_pipeline +from imblearn.metrics import classification_report_imbalanced + +print(__doc__) + +RANDOM_STATE = 42 + +# Load the iris dataset +iris = load_iris() +# Make the dataset imbalanced +# Keep only half of the first class (the slice also drops the last sample) +iris.data = iris.data[25:-1, :] +iris.target = iris.target[25:-1] + +X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, + random_state=RANDOM_STATE) + +# Create a pipeline +pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE), + LinearSVC(random_state=RANDOM_STATE)) +pipeline.fit(X_train, y_train) + +# Classify and report the results +print(classification_report_imbalanced(y_test, pipeline.predict(X_test))) diff --git a/examples/datasets/README.txt b/examples/datasets/README.txt new file mode 100644 index 000000000..8b733a738 --- /dev/null +++ 
b/examples/datasets/README.txt @@ -0,0 +1,6 @@ +.. _dataset_examples: + +Dataset examples +----------------------- + +Examples concerning the :mod:`imblearn.datasets` module. diff --git a/examples/evaluation/plot_classification_report.py b/examples/evaluation/plot_classification_report.py new file mode 100644 index 000000000..b5ef3b113 --- /dev/null +++ b/examples/evaluation/plot_classification_report.py @@ -0,0 +1,45 @@ +""" +============================================= +Evaluate classification by compiling a report +============================================= + +Specific metrics have been developed to evaluate classifiers which have been +trained using imbalanced data. `imblearn` provides a classification +report similar to `sklearn`, with additional metrics specific to imbalanced +learning problems. +""" + +from sklearn import datasets +from sklearn.svm import LinearSVC +from sklearn.model_selection import train_test_split + +from imblearn import over_sampling as os +from imblearn import pipeline as pl +from imblearn.metrics import classification_report_imbalanced + +print(__doc__) + +RANDOM_STATE = 42 + +# Generate a dataset +X, y = datasets.make_classification(n_classes=2, class_sep=2, + weights=[0.1, 0.9], n_informative=10, + n_redundant=1, flip_y=0, n_features=20, + n_clusters_per_class=4, n_samples=5000, + random_state=RANDOM_STATE) + +pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE), + LinearSVC(random_state=RANDOM_STATE)) + +# Split the data +X_train, X_test, y_train, y_test = train_test_split(X, y, + random_state=RANDOM_STATE) + +# Train the classifier with balancing +pipeline.fit(X_train, y_train) + +# Test the classifier and get the prediction +y_pred_bal = pipeline.predict(X_test) + +# Show the classification report +print(classification_report_imbalanced(y_test, y_pred_bal)) diff --git a/examples/evaluation/plot_metrics.py b/examples/evaluation/plot_metrics.py new file mode 100644 index 000000000..f39813dd6 --- /dev/null +++ 
b/examples/evaluation/plot_metrics.py @@ -0,0 +1,74 @@ +""" +======================================= +Metrics specific to imbalanced learning +======================================= + +Specific metrics have been developed to evaluate classifiers which +have been trained using imbalanced data. `imblearn` provides mainly +two additional metrics which are not implemented in `sklearn`: (i) +geometric mean and (ii) index balanced accuracy. +""" + +from sklearn import datasets +from sklearn.svm import LinearSVC +from sklearn.model_selection import train_test_split + +from imblearn import over_sampling as os +from imblearn import pipeline as pl +from imblearn.metrics import (geometric_mean_score, + make_index_balanced_accuracy) + +print(__doc__) + +RANDOM_STATE = 42 + +# Generate a dataset +X, y = datasets.make_classification(n_classes=3, class_sep=2, + weights=[0.1, 0.9], n_informative=10, + n_redundant=1, flip_y=0, n_features=20, + n_clusters_per_class=4, n_samples=5000, + random_state=RANDOM_STATE) + +pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE), + LinearSVC(random_state=RANDOM_STATE)) + +# Split the data +X_train, X_test, y_train, y_test = train_test_split(X, y, + random_state=RANDOM_STATE) + +# Train the classifier with balancing +pipeline.fit(X_train, y_train) + +# Test the classifier and get the prediction +y_pred_bal = pipeline.predict(X_test) + +############################################################################### +# The geometric mean corresponds to the square root of the product of the +# sensitivity and specificity. Combining the two metrics should account for +# the balancing of the dataset. + +print('The geometric mean is {}'.format(geometric_mean_score( + y_test, + y_pred_bal))) + +############################################################################### +# The index balanced accuracy can transform any metric to be used in +# imbalanced learning problems. 
+ +alpha = 0.1 +geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)( + geometric_mean_score) + +print('The IBA using alpha = {} and the geometric mean: {}'.format( + alpha, geo_mean( + y_test, + y_pred_bal))) + +alpha = 0.5 +geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)( + geometric_mean_score) + +print('The IBA using alpha = {} and the geometric mean: {}'.format( + alpha, geo_mean( + y_test, + y_pred_bal))) diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py index df43c3a55..abfeb32dc 100644 --- a/examples/model_selection/plot_validation_curve.py +++ b/examples/model_selection/plot_validation_curve.py @@ -29,7 +29,7 @@ weights=[0.1, 0.9], n_informative=10, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=4, n_samples=5000, - random_state=10) + random_state=RANDOM_STATE) smote = os.SMOTE(random_state=RANDOM_STATE) cart = tree.DecisionTreeClassifier(random_state=RANDOM_STATE) pipeline = pl.make_pipeline(smote, cart) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 6937c892c..d9a24d2fc 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -14,6 +14,8 @@ import logging import functools +from inspect import getcallargs + import numpy as np from sklearn.metrics.classification import (_check_targets, _prf_divide, @@ -22,6 +24,12 @@ from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels +try: + from inspect import signature +except ImportError: + from sklearn.externals.funcsigs import signature + + LOGGER = logging.getLogger(__name__) @@ -563,10 +571,10 @@ def geometric_mean_score(y_true, def make_index_balanced_accuracy(alpha=0.1, squared=True): - """Balance any scoring function using the indexed balanced accuracy + """Balance any scoring function using the index balanced accuracy This factory function wraps scoring function to express it as the - 
indexed balanced accuracy (IBA). You need to use this function to + index balanced accuracy (IBA). You need to use this function to decorate any scoring function. Parameters @@ -582,7 +590,7 @@ def make_index_balanced_accuracy(alpha=0.1, squared=True): ------- iba_scoring_func : callable, Returns the scoring metric decorated which will automatically compute - the indexed balanced accuracy. + the index balanced accuracy. Examples -------- @@ -603,21 +611,16 @@ def compute_score(*args, **kwargs): # Square if desired if squared: _score = np.power(_score, 2) - # args will contain the y_pred and y_true - # kwargs will contain the other parameters - labels = kwargs.get('labels', None) - pos_label = kwargs.get('pos_label', 1) - average = kwargs.get('average', 'binary') - sample_weight = kwargs.get('sample_weight', None) - # Compute the sensitivity and specificity - dict_sen_spe = { - 'labels': labels, - 'pos_label': pos_label, - 'average': average, - 'sample_weight': sample_weight - } - sen, spe, _ = sensitivity_specificity_support(*args, - **dict_sen_spe) + # Create the list of tags + tags_scoring_func = getcallargs(scoring_func, *args, **kwargs) + # Get the signature of the sens/spec function + sens_spec_sig = signature(sensitivity_specificity_support) + # Filter the inputs required by the sens/spec function + tags_sens_spec = sens_spec_sig.bind(**tags_scoring_func) + # Call the sens/spec function + sen, spe, _ = sensitivity_specificity_support( + *tags_sens_spec.args, + **tags_sens_spec.kwargs) # Compute the dominance dom = sen - spe return (1. + alpha * dom) * _score @@ -640,7 +643,7 @@ def classification_report_imbalanced(y_true, Specific metrics have been proposed to evaluate the classification performed on imbalanced dataset. This report compiles the state-of-the-art metrics: precision/recall/specificity, geometric - mean, and indexed balanced accuracy of the + mean, and index balanced accuracy of the geometric mean. 
Parameters @@ -674,7 +677,7 @@ def classification_report_imbalanced(y_true, ------- report : string Text summary of the precision, recall, specificity, geometric mean, - and indexed balanced accuracy. + and index balanced accuracy. Examples -------- @@ -746,7 +749,7 @@ class 2 1.00 0.67 1.00 0.80 0.82 0.69\ labels=labels, average=None, sample_weight=sample_weight) - # Indexed balanced accuracy + # Index balanced accuracy iba_gmean = make_index_balanced_accuracy( alpha=alpha, squared=True)(geometric_mean_score) iba = iba_gmean(