Merge branch 'main' into directed_spheres

JackyZzZz · Jul 14, 2023 · daf9b91 · daf9b91
2 parents d4de433 + 0b9e088
commit daf9b91
Show file tree

Hide file tree

Showing 7 changed files with 277 additions and 200 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -1,5 +1,6 @@
 [run]
 omit =
+    DiverseSelector/*/tests/*
     DiverseSelector/test/*
     DiverseSelector/__init__.py
     DiverseSelector/_version.py

diff --git a/.github/workflows/ci_codecov.yml b/.github/workflows/ci_codecov.yml
@@ -30,7 +30,7 @@ jobs:
         python-version: [3.7]
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - name: Additional info about the build
       shell: bash
       run: |

diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py
@@ -21,220 +21,206 @@
 #
 # --
 
-"""Metric calculation module."""
+"""Similarity Module."""
 
 
 import numpy as np
-from scipy.spatial.distance import squareform
+from itertools import combinations_with_replacement
+from scipy.spatial import distance_matrix
 
-__all__ = [
-    "compute_distance_matrix",
-    "pairwise_similarity_bit",
-    "tanimoto",
-    "modified_tanimoto",
-    "nearest_average_tanimoto"
-]
 
+__all__ = ["pairwise_similarity_bit", "tanimoto", "modified_tanimoto", "nearest_average_tanimoto"]
 
-def compute_distance_matrix(
-    features: np.ndarray,
-    metric: str
-):
-    """Compute pairwise distance given a feature matrix.
 
-    Parameters
-    ----------
-    features : np.ndarray
-        Molecule feature matrix.
-    metric : str
-        Distance metric.
-
-    Returns
-    -------
-    dist : ndarray
-        Symmetric distance array.
-    """
-    # todo: add more metrics implemented here
-    built_in_metrics = {
-        "tanimoto": tanimoto,
-        "modified_tanimoto": modified_tanimoto,
-    }
-
-    # Check if specified metric is supported
-    if metric in built_in_metrics:
-        distances = []
-        size = len(features)
-        for i in range(0, size):
-            for j in range(i + 1, size):
-                # use the metric to compute distance between all molecule pairs
-                distances.append(1 - built_in_metrics[metric](features[i], features[j]))
-        # shape into symmetric matrix
-        dist = squareform(distances)
-
-    else:  # raise error if unsupported
-        raise ValueError(f"Metric {metric} is not supported by the library.")
-
-    return dist
-
-
-def pairwise_similarity_bit(features: np.array, metric: str) -> np.ndarray:
-    """Compute the pairwise similarity coefficients and returns them in
-        a square symmetric matrix.
+def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray:
+    """Compute pairwise similarity coefficient matrix.
 
     Parameters
     ----------
-    features : ndarray
-        Feature matrix.
+    X : ndarray of shape (n_samples, n_features)
+        Feature matrix of `n_samples` samples in `n_features` dimensional space.
     metric : str
-        Method of calculation.
+        The metric used when calculating similarity coefficients between samples in a feature array.
+        Method for calculating similarity coefficient. Options: `"tanimoto"`, `"modified_tanimoto"`.
 
     Returns
     -------
-    pair_coeff : ndarray
-        Similarity coefficients for all molecule pairs in feature matrix.
+    s : ndarray of shape (n_samples, n_samples)
+        A symmetric similarity matrix between each pair of samples in the feature matrix.
+        The diagonal elements are directly computed instead of assuming that they are 1.
     """
 
-    function_dict = {
+    available_methods = {
         "tanimoto": tanimoto,
         "modified_tanimoto": modified_tanimoto,
     }
-
-    pair_simi = []
-    size = len(features)
-    for i in range(0, size):
-        for j in range(i + 1, size):
-            # use the specified metric to compute similarity between all distinct molecule pairs
-            pair_simi.append(function_dict[metric](features[i], features[j]))
-    # shape into symmetric matrix
-    pair_coeff = squareform(pair_simi) + np.identity(size)
-    return pair_coeff
+    if metric not in available_methods:
+        raise ValueError(
+            f"Argument metric={metric} is not recognized! Choose from {available_methods.keys()}"
+        )
+    if X.ndim != 2:
+        raise ValueError(f"Argument features should be a 2D array, got {X.ndim}")
+
+    # make pairwise m-by-m similarity matrix
+    n_samples = len(X)
+    s = np.zeros((n_samples, n_samples))
+    # compute similarity between all pairs of points (including the diagonal elements)
+    for i, j in combinations_with_replacement(range(n_samples), 2):
+        s[i, j] = s[j, i] = available_methods[metric](X[i], X[j])
+    return s
 
 
 def tanimoto(a: np.array, b: np.array) -> float:
-    r"""Compute Tanimoto coefficient.
+    r"""Compute Tanimoto coefficient or index (a.k.a. Jaccard similarity coefficient).
+
+    For two binary or non-binary arrays :math:`A` and :math:`B`, Tanimoto coefficient
+    is defined as the size of their intersection divided by the size of their union:
 
     ..math::
-        T(A,B) = A \cap B / A \cup B
+        T(A, B) = \frac{| A \cap B|}{| A \cup B |} =
+        \frac{| A \cap B|}{|A| + |B| - | A \cap B|} =
+        \frac{A \cdot B}{\|A\|^2 + \|B\|^2 - A \cdot B}
+
+    where :math:`A \cdot B = \sum_i{A_i B_i}` and :math:`\|A\|^2 = \sum_i{A_i^2}`.
 
     Parameters
     ----------
-    a : array_like
-        Molecule A's features.
-    b : array_like
-        Molecules B's features.
+    a : ndarray of shape (n_features,)
+        The 1D feature array of sample :math:`A` in an `n_features` dimensional space.
+    b : ndarray of shape (n_features,)
+        The 1D feature array of sample :math:`B` in an `n_features` dimensional space.
 
     Returns
     -------
     coeff : float
-        Tanimoto coefficient for molecules A and B.
-
-    Notes
-    -----
-    The Tanimoto coefficient computes similarity by taking the intersection of A and B over their union.
+        Tanimoto coefficient between feature arrays :math:`A` and :math:`B`.
 
     Bajusz, D., Rácz, A., and Héberger, K.. (2015)
     Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?.
     Journal of Cheminformatics 7.
     """
-    coeff = (sum(a * b)) / ((sum(a ** 2)) + (sum(b ** 2)) - (sum(a * b)))
+    if a.ndim != 1 or b.ndim != 1:
+        raise ValueError(f"Arguments a and b should be 1D arrays, got {a.ndim} and {b.ndim}")
+    if a.shape != b.shape:
+        raise ValueError(
+            f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}"
+        )
+    coeff = sum(a * b) / (sum(a**2) + sum(b**2) - sum(a * b))
     return coeff
 
 
 def modified_tanimoto(a: np.array, b: np.array) -> float:
-    r"""Compute the modified tanimoto coefficient from bitstrings of molecules A and B.
+    r"""Compute the modified tanimoto coefficient from bitstring vectors of data points A and B.
 
     Adjusts calculation of the Tanimoto coefficient to counter its natural bias towards
-    smaller molecules using a Bernoulli probability model.
+    shorter vectors using a Bernoulli probability model.
 
     ..math::
-    mt = \frac{2-p}{3}t_1 + \frac{1+p}{3}t_0$
-    where
-    p = success probability of independent trials
-    $t_1 = | A \cap B |$
-    $t_0 =  |(1-A) \cap (1-B)|$
+    MT = \frac{2-p}{3}T_1 + \frac{1+p}{3}T_0
+
+    where :math:`p` is success probability of independent trials,
+    :math:`T_1` is the number of common '1' bits between data points
+    (:math:`T_1 = | A \cap B |`), and :math:`T_0` is the number of common '0'
+    bits between data points (:math:`T_0 = |(1-A) \cap (1-B)|`).
+
 
     Parameters
     ----------
-    a : array_like
-        Molecule A's features in bitstring.
-    b : array_like
-        Molecules B's features in bitstring.
+    a : ndarray of shape (n_features,)
+        The 1D bitstring feature array of sample :math:`A` in an `n_features` dimensional space.
+    b : ndarray of shape (n_features,)
+        The 1D bitstring feature array of sample :math:`B` in an `n_features` dimensional space.
 
     Returns
     -------
     mt : float
-        Modified tanimoto coefficient for molecule A and B.
+        Modified tanimoto coefficient between bitstring feature arrays :math:`A` and :math:`B`.
 
     Notes
     -----
+    The equation above has been derived from
+
+    ..math::
+    MT_\alpha= {\alpha}T_1 + (1-\alpha)T_0
+
+    where :math:`\alpha = \frac{2-p}{3}`. This is done so that the expected value
+    of the modified tanimoto, :math:`E(MT)`, remains constant even as the number of
+    trials :math:`p` grows larger.
 
     Fligner, M. A., Verducci, J. S., and Blower, P. E.. (2002)
     A Modification of the Jaccard-Tanimoto Similarity Index for
     Diverse Selection of Chemical Compounds Using Binary Strings.
     Technometrics 44, 110-119.
     """
-    n = len(a)
-    # intersection of '1' bits
+    if a.ndim != 1:
+        raise ValueError(f"Argument `a` should have dimension 1 rather than {a.ndim}.")
+    if b.ndim != 1:
+        raise ValueError(f"Argument `b` should have dimension 1 rather than {b.ndim}.")
+    if a.shape != b.shape:
+        raise ValueError(
+            f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}"
+        )
+
+    n_features = len(a)
+    # number of common '1' bits between points A and B
     n_11 = sum(a * b)
-    # intersection of '0' bits
+    # number of common '0' bits between points A and B
     n_00 = sum((1 - a) * (1 - b))
 
-    # calculate in terms of '1' bits
-    if n_00 == n:
-        t_1 = 1
-    else:
-        t_1 = n_11 / (n - n_00)
-    # calculate in terms of '0' bits
-    if n_11 == n:
-        t_0 = 1
-    else:
-        t_0 = n_00 / (n - n_11)
+    # calculate Tanimoto coefficient based on '0' bits
+    t_1 = 1
+    if n_00 != n_features:
+        # bit strings are not all '0's
+        t_1 = n_11 / (n_features - n_00)
+    # calculate Tanimoto coefficient based on '1' bits
+    t_0 = 1
+    if n_11 != n_features:
+        # bit strings are not all '1's
+        t_0 = n_00 / (n_features - n_11)
+
     # combine into modified tanimoto using Bernoulli Model
-    p = ((n - n_00) + n_11) / (2 * n)
+    # p = independent success trials
+    #       evaluated as total number of '1' bits
+    #       divided by 2x the fingerprint length
+    p = (n_features - n_00 + n_11) / (2 * n_features)
+    # mt = x * T_1 + (1-x) * T_0
+    #       x = (2-p)/3 so that E(mt) = 1/3, no matter the value of p
     mt = (((2 - p) / 3) * t_1) + (((1 + p) / 3) * t_0)
     return mt
 
 
-def nearest_average_tanimoto(x: np.ndarray) -> float:
-    """Computes the average tanimoto for nearest molecules.
+def nearest_average_tanimoto(X: np.ndarray) -> float:
+    """Compute the average tanimoto for nearest data points measured by Minkowski 2-norm.
+
+    For each sample, the closest neighbor is identified by computing its Minkowski 2-norm
+    (i.e., Euclidean) distance with all other samples, and identifying neighboring sample
+    with the shortest distance.
 
     Parameters
     ----------
-    x : ndarray
-        Feature matrix.
+    X : ndarray of shape (n_samples, n_features)
+        Feature matrix of `n_samples` samples in `n_features` dimensional space.
 
     Returns
     -------
-    nat : float
-        Average tanimoto of closest pairs.
-
-    Notes
-    -----
-    This computes the tanimoto coefficient of pairs with the shortest
-    distances, then returns the average of them.
-    This calculation is explictly for the explicit diversity index.
+    float :
+        Average of the Tanimoto coefficients for each sample and its closest neighbor.
 
     Papp, Á., Gulyás-Forró, A., Gulyás, Z., Dormán, G., Ürge, L.,
     and Darvas, F.. (2006) Explicit Diversity Index (EDI):
     A Novel Measure for Assessing the Diversity of Compound Databases.
     Journal of Chemical Information and Modeling 46, 1898-1904.
     """
-    tani = []
-    for idx, _ in enumerate(x):
-        # arbitrary distance for comparison:
-        short = 100
-        a = 0
-        b = 0
-        # search for shortest distance point from idx
-        for jdx, _ in enumerate(x):
-            dist = np.linalg.norm(x[idx]-x[jdx])
-            if dist < short and idx != jdx:
-                short = dist
-                a = idx
-                b = jdx
-        # calculate tanimoto for each shortest dist pair
-        tani.append(tanimoto(x[a], x[b]))
-    # compute average of all shortest tanimoto coeffs
-    nat = np.average(tani)
-    return nat
-
+    # compute euclidean distance between all samples
+    dist = distance_matrix(X, X, p=2)
+    # replace zero self-distance with infinity, before computing nearest neighbors
+    np.fill_diagonal(dist, np.inf)
+    # find index of closest neighbor for each sample
+    nearest_neighbors = np.argmin(dist, axis=0)
+    assert nearest_neighbors.shape == (X.shape[0],)
+    # compute the tanimoto coefficient for each sample and its closest neighbor
+    coeffs = []
+    for idx_sample, idx_neighbor in enumerate(nearest_neighbors):
+        coeffs.append(tanimoto(X[idx_sample], X[idx_neighbor]))
+    # return average of all coefficients
+    return np.average(coeffs)
diff --git a/DiverseSelector/diversity.py b/DiverseSelector/diversity.py
@@ -54,6 +54,7 @@ def compute_diversity(
         Method of calculation diversity for a given molecule set, which
         includes "entropy", "logdet", "shannon_entropy", "wdud",
         gini_coefficient" and "hypersphere_overlap_of_subset".
+        Default is "hypersphere_overlap_of_subset".
     mols : List[rdkit.Chem.rdchem.Mol], optional
         List of RDKit molecule objects. This is only needed when using the
         "explicit_diversity_index" method. Default=None.