
2.3. Clustering

Clustering of unlabeled data can be performed with the module sklearn.cluster.

Each clustering algorithm comes in two variants: a class that implements the fit method to learn the clusters on train data, and a function that, given train data, returns an array of integer labels corresponding to the different clusters. For the class, the labels over the training data can be found in the labels_ attribute.
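As a minimal sketch of the two variants (the toy data from make_blobs below is purely illustrative):

>>> from sklearn.cluster import KMeans, k_means
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=30, centers=3, random_state=0)
>>> # class variant: fit, then read the labels from the labels_ attribute
>>> km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
>>> labels = km.labels_
>>> # function variant: returns the centroids, the labels and the final inertia
>>> centroids, labels, inertia = k_means(X, n_clusters=3, n_init=10, random_state=0)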


2.3.1. Overview of clustering methods

[Figure: sphx_glr_plot_cluster_comparison_001.png]

A comparison of the clustering algorithms in scikit-learn

  • K-Means. Parameters: number of clusters. Scalability: very large n_samples, medium n_clusters with MiniBatch code. Usecase: general-purpose, even cluster size, flat geometry, not too many clusters, inductive. Geometry (metric used): distances between points.

  • Affinity propagation. Parameters: damping, sample preference. Scalability: not scalable with n_samples. Usecase: many clusters, uneven cluster size, non-flat geometry, inductive. Geometry (metric used): graph distance (e.g. nearest-neighbor graph).

  • Mean-shift. Parameters: bandwidth. Scalability: not scalable with n_samples. Usecase: many clusters, uneven cluster size, non-flat geometry, inductive. Geometry (metric used): distances between points.

  • Spectral clustering. Parameters: number of clusters. Scalability: medium n_samples, small n_clusters. Usecase: few clusters, even cluster size, non-flat geometry, transductive. Geometry (metric used): graph distance (e.g. nearest-neighbor graph).

  • Ward hierarchical clustering. Parameters: number of clusters or distance threshold. Scalability: large n_samples and n_clusters. Usecase: many clusters, possibly connectivity constraints, transductive. Geometry (metric used): distances between points.

  • Agglomerative clustering. Parameters: number of clusters or distance threshold, linkage type, distance. Scalability: large n_samples and n_clusters. Usecase: many clusters, possibly connectivity constraints, non Euclidean distances, transductive. Geometry (metric used): any pairwise distance.

  • DBSCAN. Parameters: neighborhood size. Scalability: very large n_samples, medium n_clusters. Usecase: non-flat geometry, uneven cluster sizes, outlier removal, transductive. Geometry (metric used): distances between nearest points.

  • HDBSCAN. Parameters: minimum cluster membership, minimum point neighbors. Scalability: large n_samples, medium n_clusters. Usecase: non-flat geometry, uneven cluster sizes, outlier removal, transductive, hierarchical, variable cluster density. Geometry (metric used): distances between nearest points.

  • OPTICS. Parameters: minimum cluster membership. Scalability: very large n_samples, large n_clusters. Usecase: non-flat geometry, uneven cluster sizes, variable cluster density, outlier removal, transductive. Geometry (metric used): distances between points.

  • Gaussian mixtures. Parameters: many. Scalability: not scalable. Usecase: flat geometry, good for density estimation, inductive. Geometry (metric used): Mahalanobis distances to centers.

  • BIRCH. Parameters: branching factor, threshold, optional global clusterer. Scalability: large n_clusters and n_samples. Usecase: large dataset, outlier removal, data reduction, inductive. Geometry (metric used): Euclidean distance between points.

  • Bisecting K-Means. Parameters: number of clusters. Scalability: very large n_samples, medium n_clusters. Usecase: general-purpose, even cluster size, flat geometry, no empty clusters, inductive, hierarchical. Geometry (metric used): distances between points.

Non-flat geometry clustering is useful when the clusters have a specific shape, i.e. a non-flat manifold, and the standard Euclidean distance is not the right metric. This case arises in the two top rows of the figure above.

Gaussian mixture models, useful for clustering, are described in another chapter of the documentation dedicated to mixture models. KMeans can be seen as a special case of Gaussian mixture model with equal covariance per component.

Transductive clustering methods (in contrast to inductive clustering methods) are not designed to be applied to new, unseen data.


2.3.2. K-means

The KMeans algorithm clusters data by trying to separate samples in n groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares (see below). This algorithm requires the number of clusters to be specified. It scales well to large numbers of samples and has been used across a large range of application areas in many different fields.

The k-means algorithm divides a set of \(N\) samples \(X\) into \(K\) disjoint clusters \(C\), each described by the mean \(\mu_j\) of the samples in the cluster. The means are commonly called the cluster “centroids”; note that they are not, in general, points from \(X\), although they live in the same space.

The K-means algorithm aims to choose centroids that minimise the inertia, or within-cluster sum-of-squares criterion:

\[\sum_{i=0}^{n}\min_{\mu_j \in C}(||x_i - \mu_j||^2)\]

Inertia can be recognized as a measure of how internally coherent clusters are. It suffers from various drawbacks:

  • Inertia makes the assumption that clusters are convex and isotropic, which is not always the case. It responds poorly to elongated clusters, or manifolds with irregular shapes.

  • Inertia is not a normalized metric: we just know that lower values are better and zero is optimal. But in very high-dimensional spaces, Euclidean distances tend to become inflated (this is an instance of the so-called “curse of dimensionality”). Running a dimensionality reduction algorithm such as Principal component analysis (PCA) prior to k-means clustering can alleviate this problem and speed up the computations.

[Figure: sphx_glr_plot_kmeans_assumptions_002.png]

K-means is often referred to as Lloyd’s algorithm. In basic terms, the algorithm has three steps. The first step chooses the initial centroids, with the most basic method being to choose \(k\) samples from the dataset \(X\). After initialization, K-means consists of looping between the two other steps. The first of these assigns each sample to its nearest centroid. The second creates new centroids by taking the mean value of all of the samples assigned to each previous centroid. The difference between the old and the new centroids is computed and the algorithm repeats these last two steps until this value is less than a threshold. In other words, it repeats until the centroids do not move significantly.

[Figure: sphx_glr_plot_kmeans_digits_001.png]

K-means is equivalent to the expectation-maximization algorithm with a small, all-equal, diagonal covariance matrix.

The algorithm can also be understood through the concept of Voronoi diagrams. First the Voronoi diagram of the points is calculated using the current centroids. Each segment in the Voronoi diagram becomes a separate cluster. Secondly, the centroids are updated to the mean of each segment. The algorithm then repeats this until a stopping criterion is fulfilled. Usually, the algorithm stops when the relative decrease in the objective function between iterations is less than the given tolerance value. This is not the case in this implementation: iteration stops when centroids move less than the tolerance.

Given enough time, K-means will always converge, however this may be to a local minimum. This is highly dependent on the initialization of the centroids. As a result, the computation is often done several times, with different initializations of the centroids. One method to help address this issue is the k-means++ initialization scheme, which has been implemented in scikit-learn (use the init='k-means++' parameter). This initializes the centroids to be (generally) distant from each other, typically leading to better results than random initialization, as shown in the reference.

K-means++ can also be called independently to select seeds for other clustering algorithms, see sklearn.cluster.kmeans_plusplus for details and example usage.
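A minimal sketch of this standalone usage (the toy data is illustrative only):

>>> from sklearn.cluster import kmeans_plusplus
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=100, centers=4, random_state=0)
>>> # pick 4 seed points that are (generally) distant from each other
>>> centers_init, indices = kmeans_plusplus(X, n_clusters=4, random_state=0)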

The algorithm supports sample weights, which can be given by a parameter sample_weight. This makes it possible to assign more weight to some samples when computing cluster centers and values of inertia. For example, assigning a weight of 2 to a sample is equivalent to adding a duplicate of that sample to the dataset \(X\).
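A rough sketch of this equivalence on hypothetical 1D data (whether the fitted centers match exactly also depends on the initialization, so the check below is only indicative):

>>> import numpy as np
>>> from sklearn.cluster import KMeans
>>> X = np.array([[1.0], [2.0], [10.0], [11.0], [12.0]])
>>> # weight the last sample by 2 ...
>>> km_w = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X, sample_weight=[1, 1, 1, 1, 2])
>>> # ... which behaves like duplicating that sample
>>> km_d = KMeans(n_clusters=2, n_init=10, random_state=0).fit(np.vstack([X, [[12.0]]]))
>>> same = np.allclose(np.sort(km_w.cluster_centers_, axis=0),
...                    np.sort(km_d.cluster_centers_, axis=0))  # expected True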

K-means can be used for vector quantization. This is achieved using the transform method of a trained model of KMeans.
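A minimal sketch (illustrative data): transform maps each sample to its distances to the learned centroids, and predict gives the index of the closest centroid, which can serve as the quantized code.

>>> from sklearn.cluster import KMeans
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=50, centers=4, random_state=0)
>>> km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X)
>>> distances = km.transform(X)   # shape (50, 4): distances to each centroid
>>> codes = km.predict(X)         # index of the nearest centroid per sample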


2.3.2.1. Low-level parallelism

KMeans benefits from OpenMP based parallelism through Cython. Small chunks of data (256 samples) are processed in parallel, which in addition yields a low memory footprint. For more details on how to control the number of threads, please refer to our Parallelism notes.


2.3.2.2. Mini Batch K-Means

The MiniBatchKMeans is a variant of the KMeans algorithm which uses mini-batches to reduce the computation time, while still attempting to optimise the same objective function. Mini-batches are subsets of the input data, randomly sampled in each training iteration. These mini-batches drastically reduce the amount of computation required to converge to a local solution. In contrast to other algorithms that reduce the convergence time of k-means, mini-batch k-means produces results that are generally only slightly worse than the standard algorithm.

The algorithm iterates between two major steps, similar to vanilla k-means. In the first step, \(b\) samples are drawn randomly from the dataset, to form a mini-batch. These are then assigned to the nearest centroid. In the second step, the centroids are updated. In contrast to k-means, this is done on a per-sample basis. For each sample in the mini-batch, the assigned centroid is updated by taking the streaming average of the sample and all previous samples assigned to that centroid. This has the effect of decreasing the rate of change for a centroid over time. These steps are performed until convergence or a predetermined number of iterations is reached.

MiniBatchKMeans converges faster than KMeans, but the quality of the results is reduced. In practice this difference in quality can be quite small, as shown in the example and cited reference.
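A minimal sketch, either fitting in one call or feeding the data in chunks with partial_fit (the toy data, batch size and chunking are illustrative):

>>> import numpy as np
>>> from sklearn.cluster import MiniBatchKMeans
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=3000, centers=3, random_state=42)
>>> mbk = MiniBatchKMeans(n_clusters=3, batch_size=256, n_init=3, random_state=42).fit(X)
>>> # alternatively, stream the data in mini-batches
>>> mbk2 = MiniBatchKMeans(n_clusters=3, n_init=3, random_state=42)
>>> for chunk in np.array_split(X, 10):
...     mbk2 = mbk2.partial_fit(chunk)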

[Figure: sphx_glr_plot_mini_batch_kmeans_001.png]

2.3.3. Affinity Propagation

AffinityPropagation creates clusters by sending messages between pairs of samples until convergence. A dataset is then described using a small number of exemplars, which are identified as those most representative of other samples. The messages sent between pairs represent the suitability for one sample to be the exemplar of the other, which is updated in response to the values from other pairs. This updating happens iteratively until convergence, at which point the final exemplars are chosen, and hence the final clustering is given.

[Figure: sphx_glr_plot_affinity_propagation_001.png]

Affinity Propagation can be interesting as it chooses the number of clusters based on the data provided. For this purpose, the two important parameters are the preference, which controls how many exemplars are used, and the damping factor which damps the responsibility and availability messages to avoid numerical oscillations when updating these messages.
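A minimal sketch exposing these two parameters (the preference and damping values below are arbitrary illustrations, not recommendations):

>>> from sklearn.cluster import AffinityPropagation
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
>>> af = AffinityPropagation(preference=-50, damping=0.9, random_state=0).fit(X)
>>> exemplars = af.cluster_centers_indices_   # indices of the chosen exemplars
>>> labels = af.labels_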

The main drawback of Affinity Propagation is its complexity. The algorithm has a time complexity of the order \(O(N^2 T)\), where \(N\) is the number of samples and \(T\) is the number of iterations until convergence. Further, the memory complexity is of the order \(O(N^2)\) if a dense similarity matrix is used, but reducible if a sparse similarity matrix is used. This makes Affinity Propagation most appropriate for small to medium sized datasets.

Algorithm description: The messages sent between points belong to one of two categories. The first is the responsibility \(r(i, k)\), which is the accumulated evidence that sample \(k\) should be the exemplar for sample \(i\). The second is the availability \(a(i, k)\) which is the accumulated evidence that sample \(i\) should choose sample \(k\) to be its exemplar, and considers the values for all other samples that \(k\) should be an exemplar. In this way, exemplars are chosen by samples if they are (1) similar enough to many samples and (2) chosen by many samples to be representative of themselves.

More formally, the responsibility of a sample \(k\) to be the exemplar of sample \(i\) is given by:

\[r(i, k) \leftarrow s(i, k) - \max_{k' \neq k} [ a(i, k') + s(i, k') ]\]

Where \(s(i, k)\) is the similarity between samples \(i\) and \(k\). The availability of sample \(k\) to be the exemplar of sample \(i\) is given by:

\[a(i, k) \leftarrow \min [0, r(k, k) + \sum_{i'~s.t.~i' \notin \{i, k\}}{r(i', k)}]\]

To begin with, all values for \(r\) and \(a\) are set to zero, and the calculation of each iterates until convergence. As discussed above, in order to avoid numerical oscillations when updating the messages, the damping factor \(\lambda\) is introduced to the iteration process:

\[r_{t+1}(i, k) = \lambda\cdot r_{t}(i, k) + (1-\lambda)\cdot r_{t+1}(i, k)\]

\[a_{t+1}(i, k) = \lambda\cdot a_{t}(i, k) + (1-\lambda)\cdot a_{t+1}(i, k)\]

where \(t\) indicates the iteration times.


2.3.4. Mean Shift

MeanShift clustering aims to discover blobs in a smooth density of samples. It is a centroid based algorithm, which works by updating candidates for centroids to be the mean of the points within a given region. These candidates are then filtered in a post-processing stage to eliminate near-duplicates to form the final set of centroids.

The position of centroid candidates is iteratively adjusted using a technique called hill climbing, which finds local maxima of the estimated probability density. Given a candidate centroid \(x\) for iteration \(t\), the candidate is updated according to the following equation:

\[x^{t+1} = x^t + m(x^t)\]

Where \(m\) is the mean shift vector that is computed for each centroid that points towards a region of the maximum increase in the density of points. To compute \(m\) we define \(N(x)\) as the neighborhood of samples within a given distance around \(x\). Then \(m\) is computed using the following equation, effectively updating a centroid to be the mean of the samples within its neighborhood:

\[m(x) = \frac{1}{|N(x)|} \sum_{x_j \in N(x)}x_j - x\]

In general, the equation for \(m\) depends on a kernel used for density estimation. The generic formula is:

\[m(x) = \frac{\sum_{x_j \in N(x)}K(x_j - x)x_j}{\sum_{x_j \in N(x)}K(x_j - x)} - x\]

In our implementation, \(K(x)\) is equal to 1 if \(x\) is small enough and is equal to 0 otherwise. Effectively \(K(y - x)\) indicates whether \(y\) is in the neighborhood of \(x\).

The algorithm automatically sets the number of clusters, relying instead on a parameter bandwidth, which dictates the size of the region to search through. This parameter can be set manually, but can be estimated using the provided estimate_bandwidth function, which is called if the bandwidth is not set.
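A minimal sketch (the quantile used for the bandwidth estimate is an arbitrary illustration):

>>> from sklearn.cluster import MeanShift, estimate_bandwidth
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=500, centers=3, random_state=0)
>>> bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=300)
>>> ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)
>>> centers = ms.cluster_centers_
>>> labels = ms.labels_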

The algorithm is not highly scalable, as it requires multiple nearest neighbor searches during its execution. It is guaranteed to converge; however, it will stop iterating when the change in centroids is small.

Labelling a new sample is performed by finding the nearest centroid for a given sample.

[Figure: sphx_glr_plot_mean_shift_001.png]

2.3.5. Spectral clustering

SpectralClustering performs a low-dimension embedding of the affinity matrix between samples, followed by clustering, e.g., by KMeans, of the components of the eigenvectors in the low dimensional space. It is especially computationally efficient if the affinity matrix is sparse and the amg solver is used for the eigenvalue problem (note that the amg solver requires that the pyamg module is installed).

The present version of SpectralClustering requires the number of clusters to be specified in advance. It works well for a small number of clusters, but is not advised for many clusters.

For two clusters, SpectralClustering solves a convex relaxation of the normalized cuts problem on the similarity graph: cutting the graph in two so that the weight of the edges cut is small compared to the weights of the edges inside each cluster. This criterion is especially interesting when working on images, where graph vertices are pixels, and weights of the edges of the similarity graph are computed using a function of the gradient of the image.

[Figure: noisy_img, segmented_img]

Warning

Transforming distance to well-behaved similarities

Note that if the values of your similarity matrix are not well distributed, e.g. with negative values or with a distance matrix rather than a similarity, the spectral problem will be singular and the problem not solvable. In that case it is advised to apply a transformation to the entries of the matrix. For instance, in the case of a signed distance matrix, it is common to apply a heat kernel:

similarity = np.exp(-beta * distance / distance.std())

See the examples for such an application.


2.3.5.1. Different label assignment strategies

Different label assignment strategies can be used, corresponding to the assign_labels parameter of SpectralClustering. "kmeans" strategy can match finer details, but can be unstable. In particular, unless you control the random_state, it may not be reproducible from run-to-run, as it depends on random initialization. The alternative "discretize" strategy is 100% reproducible, but tends to create parcels of fairly even and geometrical shape. The recently added "cluster_qr" option is a deterministic alternative that tends to create the visually best partitioning on the example application below.

[Figure: coin segmentation with assign_labels="kmeans", assign_labels="discretize" and assign_labels="cluster_qr"]
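A minimal sketch of selecting a strategy (the nearest_neighbors affinity and the toy two-moons data are only illustrative):

>>> from sklearn.cluster import SpectralClustering
>>> from sklearn.datasets import make_moons
>>> X, _ = make_moons(n_samples=200, noise=0.05, random_state=0)
>>> sc = SpectralClustering(n_clusters=2, affinity="nearest_neighbors",
...                         assign_labels="cluster_qr", random_state=0)
>>> labels = sc.fit_predict(X)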


2.3.5.2. Spectral Clustering Graphs

Spectral Clustering can also be used to partition graphs via their spectral embeddings. In this case, the affinity matrix is the adjacency matrix of the graph, and SpectralClustering is initialized with affinity='precomputed':

>>> from sklearn.cluster import SpectralClustering
>>> sc = SpectralClustering(3, affinity='precomputed', n_init=100,
...                         assign_labels='discretize')
>>> sc.fit_predict(adjacency_matrix)

2.3.6. Hierarchical clustering

Hierarchical clustering is a general family of clustering algorithms that build nested clusters by merging or splitting them successively. This hierarchy of clusters is represented as a tree (or dendrogram). The root of the tree is the unique cluster that gathers all the samples, the leaves being the clusters with only one sample. See the Wikipedia page for more details.

The AgglomerativeClustering object performs a hierarchical clustering using a bottom up approach: each observation starts in its own cluster, and clusters are successively merged together. The linkage criterion determines the metric used for the merge strategy:

  • Ward minimizes the sum of squared differences within all clusters. It is a variance-minimizing approach and in this sense is similar to the k-means objective function but tackled with an agglomerative hierarchical approach.

  • Maximum or complete linkage minimizes the maximum distance between observations of pairs of clusters.

  • Average linkage minimizes the average of the distances between all observations of pairs of clusters.

  • Single linkage minimizes the distance between the closest observations of pairs of clusters.

AgglomerativeClustering can also scale to a large number of samples when it is used jointly with a connectivity matrix, but is computationally expensive when no connectivity constraints are added between samples: it considers at each step all the possible merges.
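A minimal sketch of the two ways of stopping the merges, by number of clusters or by distance threshold (the threshold value and toy data are arbitrary illustrations):

>>> from sklearn.cluster import AgglomerativeClustering
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=150, centers=3, random_state=0)
>>> # stop when 3 clusters remain (Ward linkage is the default)
>>> ward = AgglomerativeClustering(n_clusters=3).fit(X)
>>> # or cut the tree at a given merge distance instead
>>> thresholded = AgglomerativeClustering(n_clusters=None, distance_threshold=10.0,
...                                       linkage="average").fit(X)
>>> labels = ward.labels_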


2.3.6.1. Different linkage type: Ward, complete, average, and single linkage

AgglomerativeClustering supports Ward, single, average, and complete linkage strategies.

[Figure: sphx_glr_plot_linkage_comparison_001.png]

Agglomerative clustering has a “rich get richer” behavior that leads to uneven cluster sizes. In this regard, single linkage is the worst strategy, and Ward gives the most regular sizes. However, the affinity (or distance used in clustering) cannot be varied with Ward, thus for non Euclidean metrics, average linkage is a good alternative. Single linkage, while not robust to noisy data, can be computed very efficiently and can therefore be useful to provide hierarchical clustering of larger datasets. Single linkage can also perform well on non-globular data.


2.3.6.2. Visualization of cluster hierarchy

It’s possible to visualize the tree representing the hierarchical merging of clusters as a dendrogram. Visual inspection can often be useful for understanding the structure of the data, though more so in the case of small sample sizes.

[Figure: sphx_glr_plot_agglomerative_dendrogram_001.png]

2.3.6.3. Adding connectivity constraints

An interesting aspect of AgglomerativeClustering is that connectivity constraints can be added to this algorithm (only adjacent clusters can be merged together), through a connectivity matrix that defines for each sample the neighboring samples following a given structure of the data. For instance, in the swiss-roll example below, the connectivity constraints forbid the merging of points that are not adjacent on the swiss roll, and thus avoid forming clusters that extend across overlapping folds of the roll.

[Figure: unstructured vs. structured clustering of a swiss roll]

These constraints are useful to impose a certain local structure, but they also make the algorithm faster, especially when the number of samples is high.

The connectivity constraints are imposed via a connectivity matrix: a scipy sparse matrix that has elements only at the intersection of a row and a column with indices of the dataset that should be connected. This matrix can be constructed from a-priori information: for instance, you may wish to cluster web pages by only merging pages with a link pointing from one to another. It can also be learned from the data, for instance using sklearn.neighbors.kneighbors_graph to restrict merging to nearest neighbors as in this example, or using sklearn.feature_extraction.image.grid_to_graph to enable only merging of neighboring pixels on an image, as in the coin example.
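A minimal sketch using a k-nearest-neighbors connectivity matrix (the number of neighbors is an arbitrary illustration):

>>> from sklearn.cluster import AgglomerativeClustering
>>> from sklearn.datasets import make_blobs
>>> from sklearn.neighbors import kneighbors_graph
>>> X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
>>> # sparse matrix connecting each sample to its 10 nearest neighbors
>>> connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
>>> ward = AgglomerativeClustering(n_clusters=3, connectivity=connectivity).fit(X)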


Warning


Connectivity constraints with single, average and complete linkage

Connectivity constraints and single, complete or average linkage can enhance the ‘rich getting richer’ aspect of agglomerative clustering, particularly so if they are built with sklearn.neighbors.kneighbors_graph. In the limit of a small number of clusters, they tend to give a few macroscopically occupied clusters and almost empty ones (see the discussion in Agglomerative clustering with and without structure). Single linkage is the most brittle linkage option with regard to this issue.

[Figures: sphx_glr_plot_agglomerative_clustering_001.png through sphx_glr_plot_agglomerative_clustering_004.png]

2.3.6.4. Varying the metric

Single, average and complete linkage can be used with a variety of distances (or affinities), in particular Euclidean distance (l2), Manhattan distance (or Cityblock, or l1), cosine distance, or any precomputed affinity matrix.

  • l1 distance is often good for sparse features, or sparse noise: i.e. many of the features are zero, as in text mining using occurrences of rare words.

  • cosine distance is interesting because it is invariant to global scalings of the signal.

The guideline for choosing a metric is to use one that maximizes the distance between samples in different classes, and minimizes that within each class.
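A minimal sketch with a non-Euclidean metric and average linkage (in recent scikit-learn releases the parameter is called metric; older releases used affinity):

>>> from sklearn.cluster import AgglomerativeClustering
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
>>> model = AgglomerativeClustering(n_clusters=3, linkage="average",
...                                 metric="manhattan").fit(X)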

[Figures: sphx_glr_plot_agglomerative_clustering_metrics_005.png through sphx_glr_plot_agglomerative_clustering_metrics_007.png]

2.3.6.5. Bisecting K-Means

The BisectingKMeans is an iterative variant of KMeans, using divisive hierarchical clustering. Instead of creating all centroids at once, centroids are picked progressively based on a previous clustering: a cluster is split into two new clusters repeatedly until the target number of clusters is reached.

BisectingKMeans is more efficient than KMeans when the number of clusters is large since it only works on a subset of the data at each bisection while KMeans always works on the entire dataset.

Although BisectingKMeans can’t benefit from the advantages of the "k-means++" initialization by design, it will still produce results comparable to KMeans(init="k-means++") in terms of inertia at a cheaper computational cost, and will likely produce better results than KMeans with a random initialization.

This variant is more efficient than agglomerative clustering if the number of clusters is small compared to the number of data points.

This variant also does not produce empty clusters.

There exist two strategies for selecting the cluster to split:

  • bisecting_strategy="largest_cluster" selects the cluster having the most points

  • bisecting_strategy="biggest_inertia" selects the cluster with biggest inertia (cluster with biggest Sum of Squared Errors within)

Picking by largest amount of data points in most cases produces results as accurate as picking by inertia, and is faster (especially for larger amounts of data points, where calculating the error may be costly).

Picking by largest amount of data points will also likely produce clusters of similar sizes while KMeans is known to produce clusters of different sizes.

The difference between Bisecting K-Means and regular K-Means can be seen in the example Bisecting K-Means and Regular K-Means Performance Comparison. While the regular K-Means algorithm tends to create non-related clusters, clusters from Bisecting K-Means are well ordered and create quite a visible hierarchy.
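A minimal sketch of both splitting strategies (toy data for illustration):

>>> from sklearn.cluster import BisectingKMeans
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=1000, centers=4, random_state=0)
>>> bkm = BisectingKMeans(n_clusters=4, bisecting_strategy="largest_cluster",
...                       random_state=0).fit(X)
>>> # or split the cluster with the largest within-cluster sum of squared errors
>>> bkm_sse = BisectingKMeans(n_clusters=4, bisecting_strategy="biggest_inertia",
...                           random_state=0).fit(X)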


2.3.7. DBSCAN

The DBSCAN algorithm views clusters as areas of high density separated by areas of low density. Due to this rather generic view, clusters found by DBSCAN can be any shape, as opposed to k-means which assumes that clusters are convex shaped. The central component to the DBSCAN is the concept of core samples, which are samples that are in areas of high density. A cluster is therefore a set of core samples, each close to each other (measured by some distance measure) and a set of non-core samples that are close to a core sample (but are not themselves core samples). There are two parameters to the algorithm, min_samples and eps, which define formally what we mean when we say dense. Higher min_samples or lower eps indicate higher density necessary to form a cluster.

More formally, we define a core sample as being a sample in the dataset such that there exist min_samples other samples within a distance of eps, which are defined as neighbors of the core sample. This tells us that the core sample is in a dense area of the vector space. A cluster is a set of core samples that can be built by recursively taking a core sample, finding all of its neighbors that are core samples, finding all of their neighbors that are core samples, and so on. A cluster also has a set of non-core samples, which are samples that are neighbors of a core sample in the cluster but are not themselves core samples. Intuitively, these samples are on the fringes of a cluster.

Any core sample is part of a cluster, by definition. Any sample that is not a core sample, and is at least eps in distance from any core sample, is considered an outlier by the algorithm.

While the parameter min_samples primarily controls how tolerant the algorithm is towards noise (on noisy and large data sets it may be desirable to increase this parameter), the parameter eps is crucial to choose appropriately for the data set and distance function and usually cannot be left at the default value. It controls the local neighborhood of the points. When chosen too small, most data will not be clustered at all (and labeled as -1 for “noise”). When chosen too large, it causes close clusters to be merged into one cluster, and eventually the entire data set to be returned as a single cluster. Some heuristics for choosing this parameter have been discussed in the literature, for example based on a knee in the nearest neighbor distances plot (as discussed in the references below).
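A minimal sketch (the eps and min_samples values are illustrative and must be tuned for the data at hand):

>>> from sklearn.cluster import DBSCAN
>>> from sklearn.datasets import make_moons
>>> X, _ = make_moons(n_samples=500, noise=0.05, random_state=0)
>>> db = DBSCAN(eps=0.2, min_samples=10).fit(X)
>>> labels = db.labels_                       # -1 marks noise samples
>>> core_indices = db.core_sample_indices_    # indices of the core samples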

In the figure below, the color indicates cluster membership, with large circles indicating core samples found by the algorithm. Smaller circles are non-core samples that are still part of a cluster. Moreover, the outliers are indicated by black points below.

[Figure: dbscan_results]

2.3.8. HDBSCAN

The HDBSCAN algorithm can be seen as an extension of DBSCAN and OPTICS. Specifically, DBSCAN assumes that the clustering criterion (i.e. density requirement) is globally homogeneous. In other words, DBSCAN may struggle to successfully capture clusters with different densities. HDBSCAN alleviates this assumption and explores all possible density scales by building an alternative representation of the clustering problem.


Note

This implementation is adapted from the original implementation of HDBSCAN, scikit-learn-contrib/hdbscan based on [LJ2017].


2.3.8.1. Mutual Reachability Graph

HDBSCAN first defines \(d_c(x_p)\), the core distance of a sample \(x_p\), as the distance to its min_samples-th nearest neighbor, counting itself. For example, if min_samples=5 and \(x_*\) is the 5th-nearest neighbor of \(x_p\) then the core distance is:

\[d_c(x_p)=d(x_p, x_*).\]

Next it defines \(d_m(x_p, x_q)\), the mutual reachability distance of two points \(x_p, x_q\), as:

\[d_m(x_p, x_q) = \max\{d_c(x_p), d_c(x_q), d(x_p, x_q)\}\]

These two notions allow us to construct the mutual reachability graph \(G_{ms}\) defined for a fixed choice of min_samples by associating each sample \(x_p\) with a vertex of the graph; edges between points \(x_p, x_q\) are then weighted by the mutual reachability distance \(d_m(x_p, x_q)\) between them. We may build subsets of this graph, denoted as \(G_{ms,\varepsilon}\), by removing any edges with value greater than \(\varepsilon\) from the original graph. Any points whose core distance is greater than \(\varepsilon\) are at this stage marked as noise. The remaining points are then clustered by finding the connected components of this trimmed graph.


Note

Taking the connected components of a trimmed graph \(G_{ms,\varepsilon}\) is equivalent to running DBSCAN* with min_samples and \(\varepsilon\). DBSCAN* is a slightly modified version of DBSCAN mentioned in [CM2013].


2.3.8.2. Hierarchical Clustering

HDBSCAN can be seen as an algorithm which performs DBSCAN* clustering across all values of \(\varepsilon\). As mentioned prior, this is equivalent to finding the connected components of the mutual reachability graphs for all values of \(\varepsilon\). To do this efficiently, HDBSCAN first extracts a minimum spanning tree (MST) from the fully-connected mutual reachability graph, then greedily cuts the edges with highest weight. An outline of the HDBSCAN algorithm is as follows:

  1. Extract the MST of \(G_{ms}\).

  2. Extend the MST by adding a “self edge” for each vertex, with weight equal to the core distance of the underlying sample.

  3. Initialize a single cluster and label for the MST.

  4. Remove the edge with the greatest weight from the MST (ties are removed simultaneously).

  5. Assign cluster labels to the connected components which contain the end points of the now-removed edge. If the component does not have at least one edge it is instead assigned a “null” label marking it as noise.

  6. Repeat 4-5 until there are no more connected components.

HDBSCAN is therefore able to obtain all possible partitions achievable by DBSCAN* for a fixed choice of min_samples in a hierarchical fashion. Indeed, this allows HDBSCAN to perform clustering across multiple densities and as such it no longer needs \(\varepsilon\) to be given as a hyperparameter. Instead it relies solely on the choice of min_samples, which tends to be a more robust hyperparameter.

[Figures: hdbscan_ground_truth, hdbscan_results]

HDBSCAN can be smoothed with an additional hyperparameter min_cluster_size which specifies that during the hierarchical clustering, components with fewer than min_cluster_size many samples are considered noise. In practice, one can set min_cluster_size = min_samples to couple the parameters and simplify the hyperparameter space.
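A minimal sketch (HDBSCAN is available in sklearn.cluster from scikit-learn 1.3 onward; the parameter values and toy data are illustrative):

>>> from sklearn.cluster import HDBSCAN
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=500, centers=3, cluster_std=[1.0, 2.5, 0.5],
...                   random_state=0)
>>> hdb = HDBSCAN(min_cluster_size=10, min_samples=10).fit(X)
>>> labels = hdb.labels_   # -1 marks noise samples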


2.3.9. OPTICS

The OPTICS algorithm shares many similarities with the DBSCAN algorithm, and can be considered a generalization of DBSCAN that relaxes the eps requirement from a single value to a value range. The key difference between DBSCAN and OPTICS is that the OPTICS algorithm builds a reachability graph, which assigns each sample both a reachability_ distance, and a spot within the cluster ordering_ attribute; these two attributes are assigned when the model is fitted, and are used to determine cluster membership. If OPTICS is run with the default value of inf set for max_eps, then DBSCAN style cluster extraction can be performed repeatedly in linear time for any given eps value using the cluster_optics_dbscan function. Setting max_eps to a lower value will result in shorter run times, and can be thought of as the maximum neighborhood radius from each point to find other potential reachable points.
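A minimal sketch: fit OPTICS once, then reuse the fitted reachability and ordering to extract DBSCAN-like labels for a chosen eps (the parameter values and toy data are illustrative):

>>> from sklearn.cluster import OPTICS, cluster_optics_dbscan
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=500, centers=3, random_state=0)
>>> clust = OPTICS(min_samples=20, xi=0.05).fit(X)
>>> labels_xi = clust.labels_   # default xi-based extraction
>>> labels_eps = cluster_optics_dbscan(reachability=clust.reachability_,
...                                    core_distances=clust.core_distances_,
...                                    ordering=clust.ordering_, eps=0.5)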

[Figure: optics_results]

The reachability distances generated by OPTICS allow for variable density extraction of clusters within a single data set. As shown in the above plot, combining reachability distances and data set ordering_ produces a reachability plot, where point density is represented on the Y-axis, and points are ordered such that nearby points are adjacent. ‘Cutting’ the reachability plot at a single value produces DBSCAN like results; all points above the ‘cut’ are classified as noise, and each time that there is a break when reading from left to right signifies a new cluster. The default cluster extraction with OPTICS looks at the steep slopes within the graph to find clusters, and the user can define what counts as a steep slope using the parameter xi. There are also other possibilities for analysis on the graph itself, such as generating hierarchical representations of the data through reachability-plot dendrograms, and the hierarchy of clusters detected by the algorithm can be accessed through the cluster_hierarchy_ attribute. The plot above has been color-coded so that cluster colors in planar space match the linear segment clusters of the reachability plot. Note that the blue and red clusters are adjacent in the reachability plot, and can be hierarchically represented as children of a larger parent cluster.


2.3.10. BIRCH

The Birch builds a tree called the Clustering Feature Tree (CFT) for the given data. The data is essentially lossy compressed to a set of Clustering Feature nodes (CF Nodes). The CF Nodes have a number of subclusters called Clustering Feature subclusters (CF Subclusters) and these CF Subclusters located in the non-terminal CF Nodes can have CF Nodes as children.

The CF Subclusters hold the necessary information for clustering which prevents the need to hold the entire input data in memory. This information includes:

  • Number of samples in a subcluster.

  • Linear Sum - an n-dimensional vector holding the sum of all samples.

  • Squared Sum - sum of the squared L2 norm of all samples.

  • Centroids - linear sum / n_samples, stored to avoid recalculation.

  • Squared norm of the centroids.

The BIRCH algorithm has two parameters, the threshold and the branching factor. The branching factor limits the number of subclusters in a node and the threshold limits the distance between the entering sample and the existing subclusters.

This algorithm can be viewed as an instance or data reduction method, since it reduces the input data to a set of subclusters which are obtained directly from the leaves of the CFT. This reduced data can be further processed by feeding it into a global clusterer. This global clusterer can be set by n_clusters. If n_clusters is set to None, the subclusters from the leaves are directly read off, otherwise a global clustering step labels these subclusters into global clusters (labels) and the samples are mapped to the global label of the nearest subcluster.


Algorithm description:

  • A new sample is inserted into the root of the CF Tree which is a CF Node. It is then merged with the subcluster of the root that has the smallest radius after merging, constrained by the threshold and branching factor conditions. If the subcluster has any child node, then this is done repeatedly till it reaches a leaf. After finding the nearest subcluster in the leaf, the properties of this subcluster and the parent subclusters are recursively updated.

  • If the radius of the subcluster obtained by merging the new sample and the nearest subcluster is greater than the square of the threshold and if the number of subclusters is greater than the branching factor, then a space is temporarily allocated to this new sample. The two farthest subclusters are taken and the subclusters are divided into two groups on the basis of the distance between these subclusters.

  • If this split node has a parent subcluster and there is room for a new subcluster, then the parent is split into two. If there is no room, then this node is again split into two and the process is continued recursively, till it reaches the root.

BIRCH or MiniBatchKMeans?

  • BIRCH does not scale very well to high dimensional data. As a rule of thumb if n_features is greater than twenty, it is generally better to use MiniBatchKMeans.

  • If the number of instances of data needs to be reduced, or if one wants a large number of subclusters either as a preprocessing step or otherwise, BIRCH is more useful than MiniBatchKMeans.

How to use partial_fit?

To avoid the computation of global clustering, for every call of partial_fit the user is advised to proceed as follows (sketched in the example after this list):

  1. To set n_clusters=None initially.

  2. Train all data by multiple calls to partial_fit.

  3. Set n_clusters to a required value using brc.set_params(n_clusters=n_clusters).

  4. Call partial_fit finally with no arguments, i.e. brc.partial_fit() which performs the global clustering.
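A minimal sketch of this workflow (the toy data and the chunking are illustrative):

>>> import numpy as np
>>> from sklearn.cluster import Birch
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=1000, centers=5, random_state=0)
>>> brc = Birch(n_clusters=None, threshold=0.5, branching_factor=50)   # step 1
>>> for chunk in np.array_split(X, 4):                                 # step 2
...     brc = brc.partial_fit(chunk)
>>> brc = brc.set_params(n_clusters=5)                                 # step 3
>>> brc = brc.partial_fit()                                            # step 4: global clustering
>>> labels = brc.predict(X)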
[Figure: sphx_glr_plot_birch_vs_minibatchkmeans_001.png]

2.3.11. Clustering performance evaluation

Evaluating the performance of a clustering algorithm is not as trivial as counting the number of errors or the precision and recall of a supervised classification algorithm. In particular any evaluation metric should not take the absolute values of the cluster labels into account but rather whether this clustering defines separations of the data similar to some ground truth set of classes, or satisfies some assumption such that members belonging to the same class are more similar than members of different classes according to some similarity metric.


2.3.11.1. Rand index

Given the knowledge of the ground truth class assignments labels_true and our clustering algorithm assignments of the same samples labels_pred, the (adjusted or unadjusted) Rand index is a function that measures the similarity of the two assignments, ignoring permutations:

>>> from sklearn import metrics
>>> labels_true = [0, 0, 0, 1, 1, 1]
>>> labels_pred = [0, 0, 1, 1, 2, 2]
>>> metrics.rand_score(labels_true, labels_pred)
0.66...

The Rand index does not ensure that a value close to 0.0 is obtained for a random labelling. The adjusted Rand index corrects for chance and will give such a baseline.

>>> metrics.adjusted_rand_score(labels_true, labels_pred)
0.24...

As with all clustering metrics, one can permute 0 and 1 in the predicted labels, rename 2 to 3, and get the same score:

>>> labels_pred = [1, 1, 0, 0, 3, 3]
>>> metrics.rand_score(labels_true, labels_pred)
0.66...
>>> metrics.adjusted_rand_score(labels_true, labels_pred)
0.24...

Furthermore, both rand_score and adjusted_rand_score are symmetric: swapping the argument does not change the scores. They can thus be used as consensus measures:

>>> metrics.rand_score(labels_pred, labels_true)
0.66...
>>> metrics.adjusted_rand_score(labels_pred, labels_true)
0.24...

Perfect labeling is scored 1.0:

>>> labels_pred = labels_true[:]
>>> metrics.rand_score(labels_true, labels_pred)
1.0
>>> metrics.adjusted_rand_score(labels_true, labels_pred)
1.0

Poorly agreeing labels (e.g. independent labelings) have lower scores, and for the adjusted Rand index the score will be negative or close to zero. However, for the unadjusted Rand index the score, while lower, will not necessarily be close to zero:

>>> labels_true = [0, 0, 0, 0, 0, 0, 1, 1]
>>> labels_pred = [0, 1, 2, 3, 4, 5, 5, 6]
>>> metrics.rand_score(labels_true, labels_pred)
0.39...
>>> metrics.adjusted_rand_score(labels_true, labels_pred)
-0.07...

2.3.11.1.1. Advantages

  • Interpretability: The unadjusted Rand index is proportional to the number of sample pairs whose labels are the same in both labels_pred and labels_true, or are different in both.

  • Random (uniform) label assignments have an adjusted Rand index score close to 0.0 for any value of n_clusters and n_samples (which is not the case for the unadjusted Rand index or the V-measure for instance).

  • Bounded range: Lower values indicate different labelings, similar clusterings have a high (adjusted or unadjusted) Rand index, 1.0 is the perfect match score. The score range is [0, 1] for the unadjusted Rand index and [-1, 1] for the adjusted Rand index.

  • No assumption is made on the cluster structure: The (adjusted or unadjusted) Rand index can be used to compare all kinds of clustering algorithms, and can be used to compare clustering algorithms such as k-means which assumes isotropic blob shapes with results of spectral clustering algorithms which can find cluster with “folded” shapes.

2.3.11.1.2. Drawbacks

  • Contrary to inertia, the (adjusted or unadjusted) Rand index requires knowledge of the ground truth classes which is almost never available in practice or requires manual assignment by human annotators (as in the supervised learning setting).

    However (adjusted or unadjusted) Rand index can also be useful in a purely unsupervised setting as a building block for a Consensus Index that can be used for clustering model selection (TODO).

  • The unadjusted Rand index is often close to 1.0 even if the clusterings themselves differ significantly. This can be understood when interpreting the Rand index as the accuracy of element pair labeling resulting from the clusterings: In practice there often is a majority of element pairs that are assigned the different pair label under both the predicted and the ground truth clustering resulting in a high proportion of pair labels that agree, which leads subsequently to a high score.

2.3.11.1.3. Mathematical formulation

If C is a ground truth class assignment and K the clustering, let us define \(a\) and \(b\) as:

  • \(a\), the number of pairs of elements that are in the same set in C and in the same set in K

  • \(b\), the number of pairs of elements that are in different sets in C and in different sets in K

The unadjusted Rand index is then given by:

\[\text{RI} = \frac{a + b}{C_2^{n_{samples}}}\]

where \(C_2^{n_{samples}}\) is the total number of possible pairs in the dataset. It does not matter if the calculation is performed on ordered pairs or unordered pairs as long as the calculation is performed consistently.
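As a concrete check on the six-sample example above (labels_true = [0, 0, 0, 1, 1, 1], labels_pred = [0, 0, 1, 1, 2, 2]), there are \(C_2^6 = 15\) pairs in total, \(a = 2\) pairs that are grouped together in both assignments and \(b = 8\) pairs that are separated in both, so:

\[\text{RI} = \frac{2 + 8}{15} \approx 0.66\]

which matches the rand_score value computed earlier.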

However, the Rand index does not guarantee that random label assignments will get a value close to zero (esp. if the number of clusters is in the same order of magnitude as the number of samples).

To counter this effect we can discount the expected RI \(E[\text{RI}]\) of random labelings by defining the adjusted Rand index as follows:

\[\text{ARI} = \frac{\text{RI} - E[\text{RI}]}{\max(\text{RI}) - E[\text{RI}]}\]

2.3.11.2. Mutual Information based scores

Given the knowledge of the ground truth class assignments labels_true and our clustering algorithm assignments of the same samples labels_pred, the Mutual Information is a function that measures the agreement of the two assignments, ignoring permutations. Two different normalized versions of this measure are available, Normalized Mutual Information (NMI) and Adjusted Mutual Information (AMI). NMI is often used in the literature, while AMI was proposed more recently and is normalized against chance:

>>> from sklearn import metrics
>>> labels_true = [0, 0, 0, 1, 1, 1]
>>> labels_pred = [0, 0, 1, 1, 2, 2]

>>> metrics.adjusted_mutual_info_score(labels_true, labels_pred)
0.22504...

One can permute 0 and 1 in the predicted labels, rename 2 to 3 and get the same score:

>>> labels_pred = [1, 1, 0, 0, 3, 3]
>>> metrics.adjusted_mutual_info_score(labels_true, labels_pred)
0.22504...

All of mutual_info_score, adjusted_mutual_info_score and normalized_mutual_info_score are symmetric: swapping the argument does not change the score. Thus they can be used as a consensus measure:

>>> metrics.adjusted_mutual_info_score(labels_pred, labels_true)
0.22504...

Perfect labeling is scored 1.0:

>>> labels_pred = labels_true[:]
>>> metrics.adjusted_mutual_info_score(labels_true, labels_pred)
1.0

>>> metrics.normalized_mutual_info_score(labels_true, labels_pred)
1.0

This is not true for mutual_info_score, which is therefore harder to judge:

>>> metrics.mutual_info_score(labels_true, labels_pred)
0.69...

Bad (e.g. independent labelings) have non-positive scores:

>>> labels_true = [0, 1, 2, 0, 3, 4, 5, 1]
>>> labels_pred = [1, 1, 0, 0, 2, 2, 2, 2]
>>> metrics.adjusted_mutual_info_score(labels_true, labels_pred)
-0.10526...

2.3.11.2.1. Advantages

  • Random (uniform) label assignments have an AMI score close to 0.0 for any value of n_clusters and n_samples (which is not the case for raw Mutual Information or the V-measure for instance).

  • Upper bound of 1: Values close to zero indicate two label assignments that are largely independent, while values close to one indicate significant agreement. Further, an AMI of exactly 1 indicates that the two label assignments are equal (with or without permutation).

2.3.11.2.2. Drawbacks

  • Contrary to inertia, MI-based measures require the knowledge of the ground truth classes, which is almost never available in practice or requires manual assignment by human annotators (as in the supervised learning setting).

    However MI-based measures can also be useful in a purely unsupervised setting as a building block for a Consensus Index that can be used for clustering model selection.

  • NMI and MI are not adjusted against chance.

2.3.11.2.3. Mathematical formulation

Assume two label assignments (of the same N objects), \(U\) and \(V\). Their entropy is the amount of uncertainty for a partition set, defined by:

\[H(U) = - \sum_{i=1}^{|U|}P(i)\log(P(i))\]

where \(P(i) = |U_i| / N\) is the probability that an object picked at random from \(U\) falls into class \(U_i\). Likewise for \(V\):

\[H(V) = - \sum_{j=1}^{|V|}P'(j)\log(P'(j))\]

With \(P'(j) = |V_j| / N\). The mutual information (MI) between \(U\) and \(V\) is calculated by:

\[\text{MI}(U, V) = \sum_{i=1}^{|U|}\sum_{j=1}^{|V|}P(i, j)\log\left(\frac{P(i,j)}{P(i)P'(j)}\right)\]

where \(P(i, j) = |U_i \cap V_j| / N\) is the probability that an object picked at random falls into both classes \(U_i\) and \(V_j\).

It also can be expressed in set cardinality formulation:

\[\text{MI}(U, V) = \sum_{i=1}^{|U|} \sum_{j=1}^{|V|} \frac{|U_i \cap V_j|}{N}\log\left(\frac{N|U_i \cap V_j|}{|U_i||V_j|}\right)\]

The normalized mutual information is defined as

\[\text{NMI}(U, V) = \frac{\text{MI}(U, V)}{\text{mean}(H(U), H(V))}\]

This value of the mutual information and also the normalized variant is not adjusted for chance and will tend to increase as the number of different labels (clusters) increases, regardless of the actual amount of “mutual information” between the label assignments.

The expected value for the mutual information can be calculated using the following equation [VEB2009]. In this equation, \(a_i = |U_i|\) (the number of elements in \(U_i\)) and \(b_j = |V_j|\) (the number of elements in \(V_j\)).

\[E[\text{MI}(U,V)] = \sum_{i=1}^{|U|} \sum_{j=1}^{|V|} \sum_{n_{ij}=(a_i+b_j-N)^+}^{\min(a_i, b_j)} \frac{n_{ij}}{N}\log \left( \frac{N \, n_{ij}}{a_i b_j}\right) \frac{a_i!b_j!(N-a_i)!(N-b_j)!}{N!n_{ij}!(a_i-n_{ij})!(b_j-n_{ij})!(N-a_i-b_j+n_{ij})!}\]

Using the expected value, the adjusted mutual information can then be calculated using a similar form to that of the adjusted Rand index:

\[\text{AMI} = \frac{\text{MI} - E[\text{MI}]}{\text{mean}(H(U), H(V)) - E[\text{MI}]}\]

For normalized mutual information and adjusted mutual information, the normalizing value is typically some generalized mean of the entropies of each clustering. Various generalized means exist, and no firm rules exist for preferring one over the others. The decision is largely made on a field-by-field basis; for instance, in community detection, the arithmetic mean is most common. Each normalizing method provides “qualitatively similar behaviours” [YAT2016]. In our implementation, this is controlled by the average_method parameter.

Vinh et al. (2010) named variants of NMI and AMI by their averaging method [VEB2010]. Their ‘sqrt’ and ‘sum’ averages are the geometric and arithmetic means; we use these more broadly common names.


2.3.11.3. Homogeneity, completeness and V-measure

Given the knowledge of the ground truth class assignments of the samples, it is possible to define some intuitive metric using conditional entropy analysis.

In particular Rosenberg and Hirschberg (2007) define the following two desirable objectives for any cluster assignment:

  • homogeneity: each cluster contains only members of a single class.

  • completeness: all members of a given class are assigned to the same cluster.

We can turn those concepts into scores homogeneity_score and completeness_score. Both are bounded below by 0.0 and above by 1.0 (higher is better):

>>> from sklearn import metrics
>>> labels_true = [0, 0, 0, 1, 1, 1]
>>> labels_pred = [0, 0, 1, 1, 2, 2]

>>> metrics.homogeneity_score(labels_true, labels_pred)
0.66...

>>> metrics.completeness_score(labels_true, labels_pred)
0.42...

Their harmonic mean called V-measure is computed by v_measure_score:

>>> metrics.v_measure_score(labels_true, labels_pred)
0.51...

This function’s formula is as follows:

\[v = \frac{(1 + \beta) \times \text{homogeneity} \times \text{completeness}}{(\beta \times \text{homogeneity} + \text{completeness})}\]

beta defaults to a value of 1.0, but for using a value less than 1 for beta:

>>> metrics.v_measure_score(labels_true, labels_pred, beta=0.6)
0.54...

more weight will be attributed to homogeneity, and using a value greater than 1:

>>> metrics.v_measure_score(labels_true, labels_pred, beta=1.8)
0.48...

more weight will be attributed to completeness.

The V-measure is actually equivalent to the mutual information (NMI) discussed above, with the aggregation function being the arithmetic mean [B2011].

Homogeneity, completeness and V-measure can be computed at once using homogeneity_completeness_v_measure as follows:

>>> metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
(0.66..., 0.42..., 0.51...)

The following clustering assignment is slightly better, since it is homogeneous but not complete:

>>> labels_pred = [0, 0, 0, 1, 2, 2]
>>> metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
(1.0, 0.68..., 0.81...)

Note

v_measure_score is symmetric: it can be used to evaluate the agreement of two independent assignments on the same dataset.

This is not the case for completeness_score and homogeneity_score: both are bound by the relationship:

homogeneity_score(a, b) == completeness_score(b, a)

2.3.11.3.1. Advantages

  • Bounded scores: 0.0 is as bad as it can be, 1.0 is a perfect score.

  • Intuitive interpretation: clustering with bad V-measure can be qualitatively analyzed in terms of homogeneity and completeness to get a better feel for what ‘kind’ of mistakes is made by the assignment.

  • No assumption is made on the cluster structure: can be used to compare clustering algorithms such as k-means which assumes isotropic blob shapes with results of spectral clustering algorithms which can find cluster with “folded” shapes.

2.3.11.3.2. Drawbacks

  • The previously introduced metrics are not normalized with regards to random labeling: this means that depending on the number of samples, clusters and ground truth classes, a completely random labeling will not always yield the same values for homogeneity, completeness and hence v-measure. In particular random labeling won’t yield zero scores especially when the number of clusters is large.

    This problem can safely be ignored when the number of samples is more than a thousand and the number of clusters is less than 10. For smaller sample sizes or larger number of clusters it is safer to use an adjusted index such as the Adjusted Rand Index (ARI).

[Figure: sphx_glr_plot_adjusted_for_chance_measures_001.png]

  • These metrics require the knowledge of the ground truth classes, which is almost never available in practice or requires manual assignment by human annotators (as in the supervised learning setting).

2.3.11.3.3. Mathematical formulation


Homogeneity and completeness scores are formally given by:

\[h = 1 - \frac{H(C|K)}{H(C)}\]

\[c = 1 - \frac{H(K|C)}{H(K)}\]

where \(H(C|K)\) is the conditional entropy of the classes given the cluster assignments and is given by:

\[H(C|K) = - \sum_{c=1}^{|C|} \sum_{k=1}^{|K|} \frac{n_{c,k}}{n} \cdot \log\left(\frac{n_{c,k}}{n_k}\right)\]

and \(H(C)\) is the entropy of the classes and is given by:

\[H(C) = - \sum_{c=1}^{|C|} \frac{n_c}{n} \cdot \log\left(\frac{n_c}{n}\right)\]

with \(n\) the total number of samples, \(n_c\) and \(n_k\) the number of samples respectively belonging to class \(c\) and cluster \(k\), and finally \(n_{c,k}\) the number of samples from class \(c\) assigned to cluster \(k\).

The conditional entropy of clusters given class \(H(K|C)\) and the entropy of clusters \(H(K)\) are defined in a symmetric manner.

Rosenberg and Hirschberg further define V-measure as the harmonic mean of homogeneity and completeness:

\[v = 2 \cdot \frac{h \cdot c}{h + c}\]

2.3.11.4. Fowlkes-Mallows scores

The Fowlkes-Mallows index (sklearn.metrics.fowlkes_mallows_score) can be used when the ground truth class assignments of the samples is known. The Fowlkes-Mallows score FMI is defined as the geometric mean of the pairwise precision and recall:

\[\text{FMI} = \frac{\text{TP}}{\sqrt{(\text{TP} + \text{FP}) (\text{TP} + \text{FN})}}\]

Where TP is the number of True Positives (i.e. the number of pairs of points that belong to the same clusters in both the true labels and the predicted labels), FP is the number of False Positives (i.e. the number of pairs of points that belong to the same clusters in the true labels and not in the predicted labels) and FN is the number of False Negatives (i.e. the number of pairs of points that belong to the same clusters in the predicted labels and not in the true labels).

+

The score ranges from 0 to 1. A high value indicates a good similarity between the two clusterings.

+
>>> from sklearn import metrics
+>>> labels_true = [0, 0, 0, 1, 1, 1]
+>>> labels_pred = [0, 0, 1, 1, 2, 2]
+
+
+
>>> metrics.fowlkes_mallows_score(labels_true, labels_pred)
+0.47140...
+
+
+

One can permute 0 and 1 in the predicted labels, rename 2 to 3 and get +the same score:

+
>>> labels_pred = [1, 1, 0, 0, 3, 3]
+
+>>> metrics.fowlkes_mallows_score(labels_true, labels_pred)
+0.47140...
+
+
+

Perfect labeling is scored 1.0:

+
>>> labels_pred = labels_true[:]
+>>> metrics.fowlkes_mallows_score(labels_true, labels_pred)
+1.0
+
+
+

Bad labelings (e.g. independent labelings) have zero scores:

+
>>> labels_true = [0, 1, 2, 0, 3, 4, 5, 1]
+>>> labels_pred = [1, 1, 0, 0, 2, 2, 2, 2]
+>>> metrics.fowlkes_mallows_score(labels_true, labels_pred)
+0.0
+
+
+
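The pairwise counts behind this score can also be read off the pair confusion matrix described later in this section. A small sketch recovering the first score above from those counts (TP, FP, FN are the illustrative names used in the formula):

>>> import numpy as np
>>> from sklearn.metrics.cluster import pair_confusion_matrix
>>> labels_true = [0, 0, 0, 1, 1, 1]
>>> labels_pred = [0, 0, 1, 1, 2, 2]
>>> # the matrix counts ordered pairs, i.e. each unordered pair twice
>>> (TN, FP), (FN, TP) = pair_confusion_matrix(labels_true, labels_pred) / 2
>>> float(TP / np.sqrt((TP + FP) * (TP + FN)))
0.4714...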
+

2.3.11.4.1. Advantages

+
    +
  • Random (uniform) label assignments have an FMI score close to 0.0 for any value of n_clusters and n_samples (which is not the case for raw Mutual Information or the V-measure, for instance).

  • +
  • Upper-bounded at 1: values close to zero indicate two label assignments that are largely independent, while values close to one indicate significant agreement. Further, a value of exactly 0 indicates purely independent label assignments and an FMI of exactly 1 indicates that the two label assignments are equal (with or without permutation).

  • +
  • No assumption is made on the cluster structure: the measure can be used to compare the results of clustering algorithms such as k-means, which assumes isotropic blob shapes, with those of spectral clustering algorithms, which can find clusters with “folded” shapes.

  • +
+
+
+

2.3.11.4.2. Drawbacks

+
    +
  • Contrary to inertia, FMI-based measures require knowledge of the ground truth classes, which is almost never available in practice or requires manual assignment by human annotators (as in the supervised learning setting).

  • +
+ +
+
+
+

2.3.11.5. Silhouette Coefficient

+

If the ground truth labels are not known, evaluation must be performed using +the model itself. The Silhouette Coefficient +(sklearn.metrics.silhouette_score) +is an example of such an evaluation, where a +higher Silhouette Coefficient score relates to a model with better defined +clusters. The Silhouette Coefficient is defined for each sample and is composed +of two scores:

+
    +
  • a: The mean distance between a sample and all other points in the same cluster.

  • +
  • b: The mean distance between a sample and all other points in the next +nearest cluster.

  • +
+

The Silhouette Coefficient s for a single sample is then given as:

+
+\[s = \frac{b - a}{\max(a, b)}\]
+

The Silhouette Coefficient for a set of samples is given as the mean of the +Silhouette Coefficient for each sample.

+
>>> from sklearn import metrics
+>>> from sklearn.metrics import pairwise_distances
+>>> from sklearn import datasets
+>>> X, y = datasets.load_iris(return_X_y=True)
+
+
+

In normal usage, the Silhouette Coefficient is applied to the results of a +cluster analysis.

+
>>> import numpy as np
+>>> from sklearn.cluster import KMeans
+>>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
+>>> labels = kmeans_model.labels_
+>>> metrics.silhouette_score(X, labels, metric='euclidean')
+0.55...
+
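Per-sample values are also available through sklearn.metrics.silhouette_samples. Reusing X and labels from the example above (per_sample is just an illustrative variable name), their mean recovers the score:

>>> from sklearn.metrics import silhouette_samples
>>> per_sample = silhouette_samples(X, labels, metric='euclidean')
>>> per_sample.shape
(150,)
>>> float(per_sample.mean())
0.55...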
+
+ +
+

2.3.11.5.1. Advantages

+
    +
  • The score is bounded between -1 for incorrect clustering and +1 for highly +dense clustering. Scores around zero indicate overlapping clusters.

  • +
  • The score is higher when clusters are dense and well separated, which relates +to a standard concept of a cluster.

  • +
+
+
+

2.3.11.5.2. Drawbacks

+
    +
  • The Silhouette Coefficient is generally higher for convex clusters than for other concepts of clusters, such as density-based clusters like those obtained through DBSCAN.

  • +
+ +
+
+
+

2.3.11.6. Calinski-Harabasz Index

+

If the ground truth labels are not known, the Calinski-Harabasz index +(sklearn.metrics.calinski_harabasz_score) - also known as the Variance +Ratio Criterion - can be used to evaluate the model, where a higher +Calinski-Harabasz score relates to a model with better defined clusters.

+

The index is the ratio of the sum of between-cluster dispersion to the sum of within-cluster dispersion over all clusters (where dispersion is defined as the sum of squared distances):

+
>>> from sklearn import metrics
+>>> from sklearn.metrics import pairwise_distances
+>>> from sklearn import datasets
+>>> X, y = datasets.load_iris(return_X_y=True)
+
+
+

In normal usage, the Calinski-Harabasz index is applied to the results of a +cluster analysis:

+
>>> import numpy as np
+>>> from sklearn.cluster import KMeans
+>>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
+>>> labels = kmeans_model.labels_
+>>> metrics.calinski_harabasz_score(X, labels)
+561.59...
+
+
+
+

2.3.11.6.1. Advantages

+
    +
  • The score is higher when clusters are dense and well separated, which relates +to a standard concept of a cluster.

  • +
  • The score is fast to compute.

  • +
+
+
+

2.3.11.6.2. Drawbacks

+
    +
  • The Calinski-Harabasz index is generally higher for convex clusters than for other concepts of clusters, such as density-based clusters like those obtained through DBSCAN.

  • +
+
+
+

2.3.11.6.3. Mathematical formulation

+

For a set of data \(E\) of size \(n_E\) which has been clustered into \(k\) clusters, the Calinski-Harabasz score \(s\) is defined as the ratio of the mean between-cluster dispersion to the mean within-cluster dispersion:

+
+\[s = \frac{\mathrm{tr}(B_k)}{\mathrm{tr}(W_k)} \times \frac{n_E - k}{k - 1}\]
+

where \(\mathrm{tr}(B_k)\) is the trace of the between-group dispersion matrix and \(\mathrm{tr}(W_k)\) is the trace of the within-cluster dispersion matrix, defined by:

+
+\[W_k = \sum_{q=1}^k \sum_{x \in C_q} (x - c_q) (x - c_q)^T\]
+
+\[B_k = \sum_{q=1}^k n_q (c_q - c_E) (c_q - c_E)^T\]
+

with \(C_q\) the set of points in cluster \(q\), \(c_q\) the center +of cluster \(q\), \(c_E\) the center of \(E\), and \(n_q\) the +number of points in cluster \(q\).
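As a sanity check, the formulation above can be evaluated directly with numpy and compared against calinski_harabasz_score. This is only an illustrative sketch (the variable names tr_W, tr_B, c_E are not part of the library) reusing the iris clustering from the example earlier in this section:

>>> import numpy as np
>>> from sklearn import datasets
>>> from sklearn.cluster import KMeans
>>> from sklearn.metrics import calinski_harabasz_score
>>> X, _ = datasets.load_iris(return_X_y=True)
>>> labels = KMeans(n_clusters=3, random_state=1).fit(X).labels_
>>> n_E, k = len(X), 3
>>> c_E = X.mean(axis=0)                       # center of the whole dataset
>>> tr_W = sum(((X[labels == q] - X[labels == q].mean(axis=0)) ** 2).sum()
...            for q in range(k))              # within-cluster dispersion
>>> tr_B = sum((labels == q).sum() * ((X[labels == q].mean(axis=0) - c_E) ** 2).sum()
...            for q in range(k))              # between-cluster dispersion
>>> s = tr_B / tr_W * (n_E - k) / (k - 1)
>>> bool(np.isclose(s, calinski_harabasz_score(X, labels)))
True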

+ +
+
+
+

2.3.11.7. Davies-Bouldin Index

+

If the ground truth labels are not known, the Davies-Bouldin index +(sklearn.metrics.davies_bouldin_score) can be used to evaluate the +model, where a lower Davies-Bouldin index relates to a model with better +separation between the clusters.

+

This index signifies the average ‘similarity’ between clusters, where the +similarity is a measure that compares the distance between clusters with the +size of the clusters themselves.

+

Zero is the lowest possible score. Values closer to zero indicate a better +partition.

+

In normal usage, the Davies-Bouldin index is applied to the results of a +cluster analysis as follows:

+
>>> from sklearn import datasets
+>>> iris = datasets.load_iris()
+>>> X = iris.data
+>>> from sklearn.cluster import KMeans
+>>> from sklearn.metrics import davies_bouldin_score
+>>> kmeans = KMeans(n_clusters=3, random_state=1).fit(X)
+>>> labels = kmeans.labels_
+>>> davies_bouldin_score(X, labels)
+0.666...
+
+
+
+

2.3.11.7.1. Advantages

+
    +
  • The computation of Davies-Bouldin is simpler than that of Silhouette scores.

  • +
  • The index is solely based on quantities and features inherent to the dataset +as its computation only uses point-wise distances.

  • +
+
+
+

2.3.11.7.2. Drawbacks

+
    +
  • The Davies-Bouldin index is generally higher for convex clusters than for other concepts of clusters, such as density-based clusters like those obtained from DBSCAN.

  • +
  • The usage of centroid distance limits the distance metric to Euclidean space.

  • +
+
+
+

2.3.11.7.3. Mathematical formulation

+

The index is defined as the average similarity between each cluster \(C_i\) +for \(i=1, ..., k\) and its most similar one \(C_j\). In the context of +this index, similarity is defined as a measure \(R_{ij}\) that trades off:

+
    +
  • \(s_i\), the average distance between each point of cluster \(i\) and the centroid of that cluster – also known as the cluster diameter.

  • +
  • \(d_{ij}\), the distance between cluster centroids \(i\) and \(j\).

  • +
+

A simple choice to construct \(R_{ij}\) so that it is nonnegative and +symmetric is:

+
+\[R_{ij} = \frac{s_i + s_j}{d_{ij}}\]
+

Then the Davies-Bouldin index is defined as:

+
+\[DB = \frac{1}{k} \sum_{i=1}^k \max_{i \neq j} R_{ij}\]
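The definition can likewise be reproduced with a short numpy sketch and compared against davies_bouldin_score. Again this is only illustrative code (centroids, s, d, R and DB are ad hoc names), reusing the iris clustering from the example above:

>>> import numpy as np
>>> from sklearn import datasets
>>> from sklearn.cluster import KMeans
>>> from sklearn.metrics import davies_bouldin_score
>>> X, _ = datasets.load_iris(return_X_y=True)
>>> labels = KMeans(n_clusters=3, random_state=1).fit(X).labels_
>>> k = 3
>>> centroids = np.array([X[labels == i].mean(axis=0) for i in range(k)])
>>> # s_i: mean distance of the points of cluster i to its centroid
>>> s = np.array([np.linalg.norm(X[labels == i] - centroids[i], axis=1).mean()
...               for i in range(k)])
>>> # d_ij: distance between the centroids of clusters i and j
>>> d = np.linalg.norm(centroids[:, None, :] - centroids[None, :, :], axis=2)
>>> R = (s[:, None] + s[None, :]) / np.where(d > 0, d, np.inf)   # R_ii -> 0
>>> DB = R.max(axis=1).mean()
>>> bool(np.isclose(DB, davies_bouldin_score(X, labels)))
True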
+ +
+
+
+

2.3.11.8. Contingency Matrix

+

Contingency matrix (sklearn.metrics.cluster.contingency_matrix) +reports the intersection cardinality for every true/predicted cluster pair. +The contingency matrix provides sufficient statistics for all clustering +metrics where the samples are independent and identically distributed and +one doesn’t need to account for some instances not being clustered.

+

Here is an example:

+
>>> from sklearn.metrics.cluster import contingency_matrix
+>>> x = ["a", "a", "a", "b", "b", "b"]
+>>> y = [0, 0, 1, 1, 2, 2]
+>>> contingency_matrix(x, y)
+array([[2, 1, 0],
+       [0, 1, 2]])
+
+
+

The first row of the output array indicates that there are three samples whose true cluster is “a”. Of them, two are in predicted cluster 0, one is in 1, and none is in 2. The second row indicates that there are three samples whose true cluster is “b”. Of them, none is in predicted cluster 0, one is in 1 and two are in 2.

+

A confusion matrix for classification is a square +contingency matrix where the order of rows and columns correspond to a list +of classes.

+
+

2.3.11.8.1. Advantages

+
    +
  • Allows examining the spread of each true cluster across predicted clusters, and vice versa.

  • +
  • The contingency table computed here is typically used to calculate a similarity statistic (like the others listed in this document) between the two clusterings.

  • +
+
+
+

2.3.11.8.2. Drawbacks

+
    +
  • The contingency matrix is easy to interpret for a small number of clusters, but becomes very hard to interpret for a large number of clusters.

  • +
  • It doesn’t give a single metric to use as an objective for clustering +optimisation.

  • +
+ +
+
+
+

2.3.11.9. Pair Confusion Matrix

+

The pair confusion matrix +(sklearn.metrics.cluster.pair_confusion_matrix) is a 2x2 +similarity matrix

+
+\[\begin{split}C = \left[\begin{matrix} +C_{00} & C_{01} \\ +C_{10} & C_{11} +\end{matrix}\right]\end{split}\]
+

between two clusterings computed by considering all pairs of samples and +counting pairs that are assigned into the same or into different clusters +under the true and predicted clusterings.

+

It has the following entries:

+
+

\(C_{00}\) : number of pairs with both clusterings having the samples +not clustered together

+

\(C_{10}\) : number of pairs with the true label clustering having the +samples clustered together but the other clustering not having the samples +clustered together

+

\(C_{01}\) : number of pairs with the true label clustering not having +the samples clustered together but the other clustering having the samples +clustered together

+

\(C_{11}\) : number of pairs with both clusterings having the samples +clustered together

+
+

Considering a pair of samples that is clustered together as a positive pair, then, as in binary classification, the count of true negatives is \(C_{00}\), false negatives is \(C_{10}\), true positives is \(C_{11}\) and false positives is \(C_{01}\).

+

Perfectly matching labelings have all non-zero entries on the +diagonal regardless of actual label values:

+
>>> from sklearn.metrics.cluster import pair_confusion_matrix
+>>> pair_confusion_matrix([0, 0, 1, 1], [0, 0, 1, 1])
+array([[8, 0],
+       [0, 4]])
+
+
+
>>> pair_confusion_matrix([0, 0, 1, 1], [1, 1, 0, 0])
+array([[8, 0],
+       [0, 4]])
+
+
+

Labelings that assign all class members to the same clusters are complete but may not always be pure, hence are penalized and have some off-diagonal non-zero entries:

+
>>> pair_confusion_matrix([0, 0, 1, 2], [0, 0, 1, 1])
+array([[8, 2],
+       [0, 2]])
+
+
+

The matrix is not symmetric:

+
>>> pair_confusion_matrix([0, 0, 1, 1], [0, 0, 1, 2])
+array([[8, 0],
+       [2, 2]])
+
+
+

If class members are completely split across different clusters, the assignment is totally incomplete; hence the matrix has all-zero diagonal entries:

+
>>> pair_confusion_matrix([0, 0, 0, 0], [0, 1, 2, 3])
+array([[ 0,  0],
+       [12,  0]])
+
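Because the entries play the role of pair-level TN/FN/TP/FP counts, other pair-counting scores can be recovered from this matrix. For instance, a small sketch recovering the (unadjusted) Rand index, which is simply the fraction of agreeing pairs:

>>> from sklearn.metrics import rand_score
>>> from sklearn.metrics.cluster import pair_confusion_matrix
>>> C = pair_confusion_matrix([0, 0, 1, 2], [0, 0, 1, 1])
>>> float((C[0, 0] + C[1, 1]) / C.sum())   # agreeing pairs / all pairs
0.8333...
>>> float(rand_score([0, 0, 1, 2], [0, 0, 1, 1]))
0.8333...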
+
+ +
+
+
+ + +