Skip to content

Commit

Permalink
[MRG] Example for JMLR (#211)
Browse files Browse the repository at this point in the history
* Add a classification report example

* add an example for multiclass

* finish the example

* Use signature instead of popping kwargs

* Solve the issue with the doc

* Correct misspelling

* Add readme for dataset examples
  • Loading branch information
glemaitre authored Dec 31, 2016
1 parent ac16d91 commit ca8e7f4
Show file tree
Hide file tree
Showing 7 changed files with 192 additions and 23 deletions.
3 changes: 2 additions & 1 deletion doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ Metrics
Functions
---------
.. autosummary::
:toctree: generated/
:toctree: generated/

metrics.sensitivity_specificity_support
metrics.sensitivity_score
metrics.specificity_score
Expand Down
40 changes: 40 additions & 0 deletions examples/applications/plot_multi_class_under_sampling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
=============================================
Multiclass classification with under-sampling
=============================================
Some balancing methods allow for balancing datasets with multiple classes.
We provide an example to illustrate the use of those methods, which does
not differ from the binary case.
"""

from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load the iris dataset
iris = load_iris()
# Make the dataset imbalanced by dropping the first 25 samples, i.e. keeping
# only half of the first class. The slice is open-ended so that no sample at
# the end of the dataset (last class) is discarded by accident.
# NOTE(review): this relies on the iris samples being ordered by class with
# 50 samples per class -- confirm against the load_iris documentation.
iris.data = iris.data[25:, :]
iris.target = iris.target[25:]

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=RANDOM_STATE)

# Create a pipeline chaining the NearMiss under-sampler and a linear SVM
pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
6 changes: 6 additions & 0 deletions examples/datasets/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.. _dataset_examples:

Dataset examples
-----------------------

Examples concerning the :mod:`imblearn.datasets` module.
45 changes: 45 additions & 0 deletions examples/evaluation/plot_classification_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
=============================================
Evaluate classification by compiling a report
=============================================
Specific metrics have been developed to evaluate classifiers which have been
trained using imbalanced data. `imblearn` provides a classification
report similar to `sklearn`, with additional metrics specific to imbalanced
learning problems.
"""

from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Build a synthetic two-class dataset; the class proportions are requested
# through ``weights``
X, y = datasets.make_classification(
    n_samples=5000,
    n_features=20,
    n_informative=10,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=4,
    weights=[0.1, 0.9],
    flip_y=0,
    class_sep=2,
    random_state=RANDOM_STATE)

# Hold out a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

# Chain the SMOTE over-sampler with a linear SVM and train the resulting
# pipeline on the training split
smote = os.SMOTE(random_state=RANDOM_STATE)
svc = LinearSVC(random_state=RANDOM_STATE)
pipeline = pl.make_pipeline(smote, svc)
pipeline.fit(X_train, y_train)

# Predict on the held-out split
y_pred_bal = pipeline.predict(X_test)

# Print the report dedicated to imbalanced problems
print(classification_report_imbalanced(y_test, y_pred_bal))
74 changes: 74 additions & 0 deletions examples/evaluation/plot_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
=======================================
Metrics specific to imbalanced learning
=======================================
Specific metrics have been developed to evaluate classifiers which
have been trained using imbalanced data. `imblearn` provides mainly
two additional metrics which are not implemented in `sklearn`: (i)
geometric mean and (ii) index balanced accuracy.
"""

from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import (geometric_mean_score,
make_index_balanced_accuracy)

print(__doc__)

RANDOM_STATE = 42

# Build a synthetic dataset
# NOTE(review): ``n_classes=3`` is paired with only two entries in
# ``weights`` -- confirm how the third class's weight is inferred and that
# this is the intended setup.
X, y = datasets.make_classification(
    n_samples=5000,
    n_features=20,
    n_informative=10,
    n_redundant=1,
    n_classes=3,
    n_clusters_per_class=4,
    weights=[0.1, 0.9],
    flip_y=0,
    class_sep=2,
    random_state=RANDOM_STATE)

# Hold out a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

# Chain the SMOTE over-sampler with a linear SVM and train the resulting
# pipeline on the training split
smote = os.SMOTE(random_state=RANDOM_STATE)
svc = LinearSVC(random_state=RANDOM_STATE)
pipeline = pl.make_pipeline(smote, svc)
pipeline.fit(X_train, y_train)

# Predict on the held-out split
y_pred_bal = pipeline.predict(X_test)

###############################################################################
# The geometric mean corresponds to the square root of the product of the
# sensitivity and specificity. Combining the two metrics should account for
# the balancing of the dataset.

g_mean = geometric_mean_score(y_test, y_pred_bal)
print('The geometric mean is {}'.format(g_mean))

###############################################################################
# The index balanced accuracy can transform any metric to be used in
# imbalanced learning problems. The geometric mean is wrapped once per value
# of ``alpha`` and the resulting score is printed.

for alpha in (0.1, 0.5):
    iba_decorator = make_index_balanced_accuracy(alpha=alpha, squared=True)
    geo_mean = iba_decorator(geometric_mean_score)
    iba_score = geo_mean(y_test, y_pred_bal)
    print('The IBA using alpha = {} and the geometric mean: {}'.format(
        alpha, iba_score))
2 changes: 1 addition & 1 deletion examples/model_selection/plot_validation_curve.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
weights=[0.1, 0.9], n_informative=10,
n_redundant=1, flip_y=0, n_features=20,
n_clusters_per_class=4, n_samples=5000,
random_state=10)
random_state=RANDOM_STATE)
smote = os.SMOTE(random_state=RANDOM_STATE)
cart = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
pipeline = pl.make_pipeline(smote, cart)
Expand Down
45 changes: 24 additions & 21 deletions imblearn/metrics/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import logging
import functools

from inspect import getcallargs

import numpy as np

from sklearn.metrics.classification import (_check_targets, _prf_divide,
Expand All @@ -22,6 +24,12 @@
from sklearn.utils.fixes import bincount
from sklearn.utils.multiclass import unique_labels

try:
from inspect import signature
except ImportError:
from sklearn.externals.funcsigs import signature


LOGGER = logging.getLogger(__name__)


Expand Down Expand Up @@ -563,10 +571,10 @@ def geometric_mean_score(y_true,


def make_index_balanced_accuracy(alpha=0.1, squared=True):
"""Balance any scoring function using the indexed balanced accuracy
"""Balance any scoring function using the index balanced accuracy
This factory function wraps scoring function to express it as the
indexed balanced accuracy (IBA). You need to use this function to
index balanced accuracy (IBA). You need to use this function to
decorate any scoring function.
Parameters
Expand All @@ -582,7 +590,7 @@ def make_index_balanced_accuracy(alpha=0.1, squared=True):
-------
iba_scoring_func : callable,
Returns the scoring metric decorated which will automatically compute
the indexed balanced accuracy.
the index balanced accuracy.
Examples
--------
Expand All @@ -603,21 +611,16 @@ def compute_score(*args, **kwargs):
# Square if desired
if squared:
_score = np.power(_score, 2)
# args will contain the y_pred and y_true
# kwargs will contain the other parameters
labels = kwargs.get('labels', None)
pos_label = kwargs.get('pos_label', 1)
average = kwargs.get('average', 'binary')
sample_weight = kwargs.get('sample_weight', None)
# Compute the sensitivity and specificity
dict_sen_spe = {
'labels': labels,
'pos_label': pos_label,
'average': average,
'sample_weight': sample_weight
}
sen, spe, _ = sensitivity_specificity_support(*args,
**dict_sen_spe)
# Create the list of tags
tags_scoring_func = getcallargs(scoring_func, *args, **kwargs)
# Get the signature of the sens/spec function
sens_spec_sig = signature(sensitivity_specificity_support)
# Filter the inputs required by the sens/spec function
tags_sens_spec = sens_spec_sig.bind(**tags_scoring_func)
# Call the sens/spec function
sen, spe, _ = sensitivity_specificity_support(
*tags_sens_spec.args,
**tags_sens_spec.kwargs)
# Compute the dominance
dom = sen - spe
return (1. + alpha * dom) * _score
Expand All @@ -640,7 +643,7 @@ def classification_report_imbalanced(y_true,
Specific metrics have been proposed to evaluate the classification
performed on imbalanced dataset. This report compiles the
state-of-the-art metrics: precision/recall/specificity, geometric
mean, and indexed balanced accuracy of the
mean, and index balanced accuracy of the
geometric mean.
Parameters
Expand Down Expand Up @@ -674,7 +677,7 @@ def classification_report_imbalanced(y_true,
-------
report : string
Text summary of the precision, recall, specificity, geometric mean,
and indexed balanced accuracy.
and index balanced accuracy.
Examples
--------
Expand Down Expand Up @@ -746,7 +749,7 @@ class 2 1.00 0.67 1.00 0.80 0.82 0.69\
labels=labels,
average=None,
sample_weight=sample_weight)
# Indexed balanced accuracy
# Index balanced accuracy
iba_gmean = make_index_balanced_accuracy(
alpha=alpha, squared=True)(geometric_mean_score)
iba = iba_gmean(
Expand Down

0 comments on commit ca8e7f4

Please sign in to comment.