From 84b6d6c6d07d6745da89f15c50c543211b20fe6a Mon Sep 17 00:00:00 2001
From: awb9691 <abby.broscius@gmail.com>
Date: Tue, 20 Jun 2023 12:40:01 -0400
Subject: [PATCH 01/11] remove compute_distance_matrix, add tanimoto tests

---
 DiverseSelector/distance.py           | 46 ++-------------------------
 DiverseSelector/test/test_distance.py | 46 ++++++++++++---------------
 2 files changed, 23 insertions(+), 69 deletions(-)

diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py
index ed76cc46..ef3a0701 100644
--- a/DiverseSelector/distance.py
+++ b/DiverseSelector/distance.py
@@ -28,7 +28,6 @@
 from scipy.spatial.distance import squareform
 
 __all__ = [
-    "compute_distance_matrix",
     "pairwise_similarity_bit",
     "tanimoto",
     "modified_tanimoto",
@@ -36,47 +35,6 @@
 ]
 
 
-def compute_distance_matrix(
-    features: np.ndarray,
-    metric: str
-):
-    """Compute pairwise distance given a feature matrix.
-
-    Parameters
-    ----------
-    features : np.ndarray
-        Molecule feature matrix.
-    metric : str
-        Distance metric.
-
-    Returns
-    -------
-    dist : ndarray
-        Symmetric distance array.
-    """
-    # todo: add more metrics implemented here
-    built_in_metrics = {
-        "tanimoto": tanimoto,
-        "modified_tanimoto": modified_tanimoto,
-    }
-
-    # Check if specified metric is supported
-    if metric in built_in_metrics:
-        distances = []
-        size = len(features)
-        for i in range(0, size):
-            for j in range(i + 1, size):
-                # use the metric to compute distance between all molecule pairs
-                distances.append(1 - built_in_metrics[metric](features[i], features[j]))
-        # shape into symmetric matrix
-        dist = squareform(distances)
-
-    else:  # raise error if unsupported
-        raise ValueError(f"Metric {metric} is not supported by the library.")
-
-    return dist
-
-
 def pairwise_similarity_bit(features: np.array, metric: str) -> np.ndarray:
     """Compute the pairwise similarity coefficients and returns them in
         a square symmetric matrix.
@@ -130,7 +88,8 @@ def tanimoto(a: np.array, b: np.array) -> float:
 
     Notes
     -----
-    The Tanimoto coefficient computes similarity by taking the intersection of A and B over their union.
+    The Tanimoto coefficient computes similarity by taking the intersection
+    of A and B over their union.
 
     Bajusz, D., Rácz, A., and Héberger, K.. (2015)
     Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?.
@@ -237,4 +196,3 @@ def nearest_average_tanimoto(x: np.ndarray) -> float:
     # compute average of all shortest tanimoto coeffs
     nat = np.average(tani)
     return nat
-
diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py
index 42320165..10033230 100644
--- a/DiverseSelector/test/test_distance.py
+++ b/DiverseSelector/test/test_distance.py
@@ -23,9 +23,10 @@
 
 """Testing for the distance and similarity algorithms in the distance.py module."""
 
-from DiverseSelector.distance import (compute_distance_matrix,
-                                      pairwise_similarity_bit,
-                                      nearest_average_tanimoto
+from DiverseSelector.distance import (pairwise_similarity_bit,
+                                      nearest_average_tanimoto,
+                                      tanimoto,
+                                      modified_tanimoto
                                       )
 
 import numpy as np
@@ -51,22 +52,16 @@
                     [0, 1, 1]])
 
 
-def test_compute_distance_matrix_builtin():
-    """Testing the compute distance matrix with a built in metric."""
-    sci_dist = compute_distance_matrix(sample2, "tanimoto")
-    expected = np.array([[0, 0.6666667, 1, 1],
-                         [0.6666667, 0, 1, 1],
-                         [1, 1, 0, 1],
-                         [1, 1, 1, 0]])
-    assert_almost_equal(expected, sci_dist)
-
-
-def test_compute_distance_matrix_invalid_metric():
-    """Testing the compute distance matrix with an invalid metric."""
-    assert_raises(ValueError, compute_distance_matrix, sample1, "fake_distance")
+def test_tanimoto():
+    """Test the tanimoto function on one pair of points."""
+    a = np.array([2, 0, 1])
+    b = np.array([2, 0, 0])
+    expected = 4 / (5 + 4 - 4)
+    tani = tanimoto(a, b)
+    assert_equal(tani, expected)
 
 
-def test_tanimoto():
+def test_tanimoto_matrix():
     """Testing the tanimoto function with predefined feature matrix."""
     tani = pairwise_similarity_bit(sample3, "tanimoto")
     expected = np.array([[1, (11 / 19)],
@@ -74,7 +69,15 @@ def test_tanimoto():
     assert_equal(expected, tani)
 
 
-def test_modifed_tanimoto():
+def test_modified_tanimoto():
+    a = np.array([1, 1, 0, 0, 1])
+    b = np.array([0, 0, 0, 0, 1])
+    expected = (1.6 / 9) + (1.4/6)
+    mod_tani = modified_tanimoto(a, b)
+    assert_equal(mod_tani, expected)
+
+
+def test_modified_tanimoto_matrix():
     """Testing the modified tanimoto function with predefined feature matrix."""
     mod_tani = pairwise_similarity_bit(sample4, "modified_tanimoto")
     expceted = np.array([[1, (4 / 27)],
@@ -96,10 +99,3 @@ def test_nearest_average_tanimoto():
     shortest_tani = [(11/19), (11/19)]
     average = np.average(shortest_tani)
     assert_equal(nat, average)
-
-
-
-
-
-
-

From 6d4b4bc3e768d0f49bd78f26701fcca638999ca5 Mon Sep 17 00:00:00 2001
From: awb9691 <abby.broscius@gmail.com>
Date: Tue, 20 Jun 2023 15:13:17 -0400
Subject: [PATCH 02/11] add testing, docs to modified_tanimoto

---
 DiverseSelector/distance.py           | 29 ++++++++++++++++++++++-----
 DiverseSelector/test/test_distance.py | 24 ++++++++++++++++++++++
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py
index ef3a0701..48496054 100644
--- a/DiverseSelector/distance.py
+++ b/DiverseSelector/distance.py
@@ -106,11 +106,13 @@ def modified_tanimoto(a: np.array, b: np.array) -> float:
     smaller molecules using a Bernoulli probability model.
 
     ..math::
-    mt = \frac{2-p}{3}t_1 + \frac{1+p}{3}t_0$
-    where
-    p = success probability of independent trials
-    $t_1 = | A \cap B |$
-    $t_0 =  |(1-A) \cap (1-B)|$
+    MT = \frac{2-p}{3}T_1 + \frac{1+p}{3}T_0
+
+    where :math:`p` is success probability of independent trials,
+    :math:`T_1` is the number of common '1' bits between molecules
+    (:math:`T_1 = | A \cap B |`), and :math:`T_0` is the number of common '0'
+    bits between molecules (:math:`T_0 = |(1-A) \cap (1-B)|`).
+
 
     Parameters
     ----------
@@ -126,12 +128,29 @@ def modified_tanimoto(a: np.array, b: np.array) -> float:
 
     Notes
     -----
+    The equation above has been derived from
+
+    ..math::
+    MT_\alpha= {\alpha}T_1 + (1-\alpha)T_0
+
+    where :math:`\alpha = \frac{2-p}{3}`. This is done so that the expected value
+    of the modified tanimoto, :math:`E(MT)`, remains constant regardless of the
+    value of the success probability :math:`p`.
 
     Fligner, M. A., Verducci, J. S., and Blower, P. E.. (2002)
     A Modification of the Jaccard-Tanimoto Similarity Index for
     Diverse Selection of Chemical Compounds Using Binary Strings.
     Technometrics 44, 110-119.
     """
+    if a.ndim != 1:
+        raise ValueError(
+            f"Argument `a` should have dimension 1 rather than {a.ndim}."
+        )
+    if b.ndim != 1:
+        raise ValueError(
+            f"Argument `b` should have dimension 1 rather than {b.ndim}."
+        )
+
     n = len(a)
     # intersection of '1' bits
     n_11 = sum(a * b)
diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py
index 10033230..2f61f497 100644
--- a/DiverseSelector/test/test_distance.py
+++ b/DiverseSelector/test/test_distance.py
@@ -77,6 +77,30 @@ def test_modified_tanimoto():
     assert_equal(mod_tani, expected)
 
 
+def test_modified_tanimoto_all_ones():
+    """Test the modified tanimoto function when input is all '1' bits"""
+    a = np.array([1, 1, 1, 1, 1])
+    expected = 1
+    mod_tani = modified_tanimoto(a,a)
+    assert_equal(mod_tani, expected)
+
+
+def test_modified_tanimoto_all_zeroes():
+    """Test the modified tanimoto function when input is all '0' bits"""
+    a = np.zeros(5)
+    expected = 1
+    mod_tani = modified_tanimoto(a, a)
+    assert_equal(mod_tani, expected)
+
+
+def test_modified_tanimoto_dimension_error():
+    """Test modified tanimoto raises error when input has incorrect dimension."""
+    a = np.zeros([7,5])
+    b = np.zeros(5)
+    assert_raises(ValueError, modified_tanimoto, a, b)
+    assert_raises(ValueError, modified_tanimoto, b, a)
+
+
 def test_modified_tanimoto_matrix():
     """Testing the modified tanimoto function with predefined feature matrix."""
     mod_tani = pairwise_similarity_bit(sample4, "modified_tanimoto")

From 28a857ab16daf2a0c83a5d098efbebc3c6bf1c65 Mon Sep 17 00:00:00 2001
From: awb9691 <abby.broscius@gmail.com>
Date: Tue, 20 Jun 2023 15:57:59 -0400
Subject: [PATCH 03/11] optimize nearest_average_tanimoto, add comments to
 modified_tanimoto

---
 DiverseSelector/distance.py           | 45 ++++++++++++++-------------
 DiverseSelector/test/test_distance.py |  3 +-
 2 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py
index 48496054..2f47f87b 100644
--- a/DiverseSelector/distance.py
+++ b/DiverseSelector/distance.py
@@ -25,6 +25,7 @@
 
 
 import numpy as np
+from scipy.spatial import distance_matrix
 from scipy.spatial.distance import squareform
 
 __all__ = [
@@ -152,23 +153,30 @@ def modified_tanimoto(a: np.array, b: np.array) -> float:
         )
 
     n = len(a)
-    # intersection of '1' bits
+    # number of common '1' bits between molecules A and B
     n_11 = sum(a * b)
-    # intersection of '0' bits
+    # number of common '0' bits between molecules A and B
     n_00 = sum((1 - a) * (1 - b))
 
-    # calculate in terms of '1' bits
+    # calculate Tanimoto coeff based on '1' bits
     if n_00 == n:
+        # bit string is all '0's
         t_1 = 1
     else:
         t_1 = n_11 / (n - n_00)
-    # calculate in terms of '0' bits
+    # calculate Tanimoto coeff based on '1' bits
     if n_11 == n:
+        # bit string is all '1's
         t_0 = 1
     else:
         t_0 = n_00 / (n - n_11)
     # combine into modified tanimoto using Bernoulli Model
-    p = ((n - n_00) + n_11) / (2 * n)
+    # p = independent success trials
+    #       evaluated as total number of '1' bits
+    #       divided by 2x the fingerprint length
+    p = (n - n_00 + n_11) / (2 * n)
+    # mt = x * T_1 + (1-x) * T_0
+    #       x = (2-p)/3 so that E(mt) = 1/3, no matter the value of p
     mt = (((2 - p) / 3) * t_1) + (((1 + p) / 3) * t_0)
     return mt
 
@@ -183,7 +191,7 @@ def nearest_average_tanimoto(x: np.ndarray) -> float:
 
     Returns
     -------
-    nat : float
+    float :
         Average tanimoto of closest pairs.
 
     Notes
@@ -198,20 +206,15 @@ def nearest_average_tanimoto(x: np.ndarray) -> float:
     Journal of Chemical Information and Modeling 46, 1898-1904.
     """
     tani = []
-    for idx, _ in enumerate(x):
-        # arbitrary distance for comparison:
-        short = 100
-        a = 0
-        b = 0
-        # search for shortest distance point from idx
-        for jdx, _ in enumerate(x):
-            dist = np.linalg.norm(x[idx]-x[jdx])
-            if dist < short and idx != jdx:
-                short = dist
-                a = idx
-                b = jdx
-        # calculate tanimoto for each shortest dist pair
-        tani.append(tanimoto(x[a], x[b]))
-    # compute average of all shortest tanimoto coeffs
+    # calculate euclidean distance between all points
+    #     and adjust for distance to self
+    dist = distance_matrix(x, x) + 100*np.eye(x.shape[0])
+    # find closest point for each row of x
+    short_idx = np.argmin(dist, axis=0)
+    print(f"these are the shortest indices:", short_idx)
+    for idx in range(0, len(short_idx)):
+        # compute the tanimoto coeff for each pair of closest points
+        tani.append(tanimoto(x[idx], x[short_idx[idx]]))
+    # take the average of all coeffs calculated
     nat = np.average(tani)
     return nat
diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py
index 2f61f497..7aee0e18 100644
--- a/DiverseSelector/test/test_distance.py
+++ b/DiverseSelector/test/test_distance.py
@@ -120,6 +120,5 @@ def test_nearest_average_tanimoto_bit():
 def test_nearest_average_tanimoto():
     """Test the nearest_average_tanimoto function with non-binary input"""
     nat = nearest_average_tanimoto(sample3)
-    shortest_tani = [(11/19), (11/19)]
-    average = np.average(shortest_tani)
+    average = 11/19
     assert_equal(nat, average)

From cf88ce3202add81aabd2bf4df07c924bc359ad0e Mon Sep 17 00:00:00 2001
From: AWBroscius <abby.broscius@gmail.com>
Date: Wed, 5 Jul 2023 16:30:13 -0400
Subject: [PATCH 04/11] address pull request comments, clean docs

---
 DiverseSelector/distance.py | 53 +++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py
index 2f47f87b..9678d9b3 100644
--- a/DiverseSelector/distance.py
+++ b/DiverseSelector/distance.py
@@ -32,7 +32,7 @@
     "pairwise_similarity_bit",
     "tanimoto",
     "modified_tanimoto",
-    "nearest_average_tanimoto"
+    "nearest_average_tanimoto",
 ]
 
 
@@ -50,7 +50,7 @@ def pairwise_similarity_bit(features: np.array, metric: str) -> np.ndarray:
     Returns
     -------
     pair_coeff : ndarray
-        Similarity coefficients for all molecule pairs in feature matrix.
+        Similarity coefficients for all data point pairs in feature matrix.
     """
 
     function_dict = {
@@ -78,14 +78,14 @@ def tanimoto(a: np.array, b: np.array) -> float:
     Parameters
     ----------
     a : array_like
-        Molecule A's features.
+        Data point A's features.
     b : array_like
-        Molecules B's features.
+        Data point B's features.
 
     Returns
     -------
     coeff : float
-        Tanimoto coefficient for molecules A and B.
+        Tanimoto coefficient for data points A and B.
 
     Notes
     -----
@@ -96,31 +96,31 @@ def tanimoto(a: np.array, b: np.array) -> float:
     Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?.
     Journal of Cheminformatics 7.
     """
-    coeff = (sum(a * b)) / ((sum(a ** 2)) + (sum(b ** 2)) - (sum(a * b)))
+    coeff = (sum(a * b)) / ((sum(a**2)) + (sum(b**2)) - (sum(a * b)))
     return coeff
 
 
 def modified_tanimoto(a: np.array, b: np.array) -> float:
-    r"""Compute the modified tanimoto coefficient from bitstrings of molecules A and B.
+    r"""Compute the modified tanimoto coefficient from bitstring vectors of data points A and B.
 
     Adjusts calculation of the Tanimoto coefficient to counter its natural bias towards
-    smaller molecules using a Bernoulli probability model.
+    shorter vectors using a Bernoulli probability model.
 
     ..math::
     MT = \frac{2-p}{3}T_1 + \frac{1+p}{3}T_0
 
     where :math:`p` is success probability of independent trials,
-    :math:`T_1` is the number of common '1' bits between molecules
+    :math:`T_1` is the number of common '1' bits between data points
     (:math:`T_1 = | A \cap B |`), and :math:`T_0` is the number of common '0'
-    bits between molecules (:math:`T_0 = |(1-A) \cap (1-B)|`).
+    bits between data points (:math:`T_0 = |(1-A) \cap (1-B)|`).
 
 
     Parameters
     ----------
     a : array_like
-        Molecule A's features in bitstring.
+        Data point A's features in bitstring.
     b : array_like
-        Molecules B's features in bitstring.
+        Data point B's features in bitstring.
 
     Returns
     -------
@@ -144,18 +144,14 @@ def modified_tanimoto(a: np.array, b: np.array) -> float:
     Technometrics 44, 110-119.
     """
     if a.ndim != 1:
-        raise ValueError(
-            f"Argument `a` should have dimension 1 rather than {a.ndim}."
-        )
+        raise ValueError(f"Argument `a` should have dimension 1 rather than {a.ndim}.")
     if b.ndim != 1:
-        raise ValueError(
-            f"Argument `b` should have dimension 1 rather than {b.ndim}."
-        )
+        raise ValueError(f"Argument `b` should have dimension 1 rather than {b.ndim}.")
 
     n = len(a)
-    # number of common '1' bits between molecules A and B
+    # number of common '1' bits between points A and B
     n_11 = sum(a * b)
-    # number of common '0' bits between molecules A and B
+    # number of common '0' bits between points A and B
     n_00 = sum((1 - a) * (1 - b))
 
     # calculate Tanimoto coeff based on '1' bits
@@ -182,7 +178,7 @@ def modified_tanimoto(a: np.array, b: np.array) -> float:
 
 
 def nearest_average_tanimoto(x: np.ndarray) -> float:
-    """Computes the average tanimoto for nearest molecules.
+    """Computes the average tanimoto for nearest data points.
 
     Parameters
     ----------
@@ -192,13 +188,13 @@ def nearest_average_tanimoto(x: np.ndarray) -> float:
     Returns
     -------
     float :
-        Average tanimoto of closest pairs.
+        Average Tanimoto of closest pairs.
 
     Notes
     -----
-    This computes the tanimoto coefficient of pairs with the shortest
-    distances, then returns the average of them.
-    This calculation is explictly for the explicit diversity index.
+    This computes the Tanimoto coefficient of pairs of data points
+    with the shortest distances, then returns the average of them.
+    This calculation is explicitly for the explicit diversity index.
 
     Papp, Á., Gulyás-Forró, A., Gulyás, Z., Dormán, G., Ürge, L.,
     and Darvas, F.. (2006) Explicit Diversity Index (EDI):
@@ -208,13 +204,12 @@ def nearest_average_tanimoto(x: np.ndarray) -> float:
     tani = []
     # calculate euclidean distance between all points
     #     and adjust for distance to self
-    dist = distance_matrix(x, x) + 100*np.eye(x.shape[0])
+    dist = distance_matrix(x, x) + np.inf*np.eye(x.shape[0])
     # find closest point for each row of x
     short_idx = np.argmin(dist, axis=0)
-    print(f"these are the shortest indices:", short_idx)
-    for idx in range(0, len(short_idx)):
+    for idx, min_d in enumerate(short_idx):
         # compute the tanimoto coeff for each pair of closest points
-        tani.append(tanimoto(x[idx], x[short_idx[idx]]))
+        tani.append(tanimoto(x[idx], x[min_d]))
     # take the average of all coeffs calculated
     nat = np.average(tani)
     return nat

From caabee99027ee2ca607098f3b81ef819aea8d4b7 Mon Sep 17 00:00:00 2001
From: Farnaz Heidar-Zadeh <farnaz_chem@yahoo.com>
Date: Wed, 5 Jul 2023 19:50:32 -0400
Subject: [PATCH 05/11] Finalize pairwise_similarity_bit function

---
 DiverseSelector/distance.py           | 42 +++++++++++++++------------
 DiverseSelector/test/test_distance.py |  9 ++++++
 2 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py
index 9678d9b3..369dfd11 100644
--- a/DiverseSelector/distance.py
+++ b/DiverseSelector/distance.py
@@ -25,6 +25,7 @@
 
 
 import numpy as np
+from itertools import combinations_with_replacement
 from scipy.spatial import distance_matrix
 from scipy.spatial.distance import squareform
 
@@ -36,37 +37,40 @@
 ]
 
 
-def pairwise_similarity_bit(features: np.array, metric: str) -> np.ndarray:
-    """Compute the pairwise similarity coefficients and returns them in
-        a square symmetric matrix.
+def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray:
+    """Compute pairwise similarity coefficient matrix.
 
     Parameters
     ----------
-    features : ndarray
-        Feature matrix.
+    X : ndarray
+        An `m` by `n` feature array of `m` samples in an `n`-dimensional feature space.
     metric : str
-        Method of calculation.
+        Method for calculating similarity coefficient. Options: `"tanimoto"`, `"modified_tanimoto"`.
 
     Returns
     -------
-    pair_coeff : ndarray
-        Similarity coefficients for all data point pairs in feature matrix.
+    pair_simi : ndarray
+        Returns a symmetric `m` by `m` array containing the similarity coefficient between
+        each pair of samples in the feature matrix. The diagonal elements are directly
+        computed instead of assuming that they are 1.
     """
 
-    function_dict = {
+    available_methods = {
         "tanimoto": tanimoto,
         "modified_tanimoto": modified_tanimoto,
     }
-
-    pair_simi = []
-    size = len(features)
-    for i in range(0, size):
-        for j in range(i + 1, size):
-            # use the specified metric to compute similarity between all distinct molecule pairs
-            pair_simi.append(function_dict[metric](features[i], features[j]))
-    # shape into symmetric matrix
-    pair_coeff = squareform(pair_simi) + np.identity(size)
-    return pair_coeff
+    if metric not in available_methods:
+        raise ValueError(f"Argument metric={metric} is not recognized! Choose from {available_methods.keys()}")
+    if X.ndim != 2:
+        raise ValueError(f"Argument features should be a 2D array, got {X.ndim}")
+
+    # make pairwise m-by-m similarity matrix
+    m = len(X)
+    pair_simi = np.zeros((m, m))
+    # compute similarity between all pairs of points (including the diagonal elements)
+    for i, j in combinations_with_replacement(range(m), 2):
+        pair_simi[i, j] = pair_simi[j, i] = available_methods[metric](X[i], X[j])
+    return pair_simi
 
 
 def tanimoto(a: np.array, b: np.array) -> float:
diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py
index 7aee0e18..38da82bc 100644
--- a/DiverseSelector/test/test_distance.py
+++ b/DiverseSelector/test/test_distance.py
@@ -52,6 +52,15 @@
                     [0, 1, 1]])
 
 
+def test_pairwise_similarity_bit_raises():
+    # check raised error for input feature matrix that is not 2D
+    assert_raises(ValueError, pairwise_similarity_bit, np.random.random(5), "tanimoto")
+    assert_raises(ValueError, pairwise_similarity_bit, np.random.random((2, 3, 4)), "tanimoto")
+    # check raised error for not-available method
+    assert_raises(ValueError, pairwise_similarity_bit, np.random.random((5, 1)), "tan")
+    assert_raises(ValueError, pairwise_similarity_bit, np.random.random((5, 1)), tanimoto)
+
+
 def test_tanimoto():
     """Test the tanimoto function on one pair of points."""
     a = np.array([2, 0, 1])

From 8278d6b3a0d90eebd67c5a23bab9dbbee6456fa0 Mon Sep 17 00:00:00 2001
From: Farnaz Heidar-Zadeh <farnaz_chem@yahoo.com>
Date: Wed, 5 Jul 2023 20:09:42 -0400
Subject: [PATCH 06/11] Finalize tanimoto function

---
 DiverseSelector/distance.py           | 29 +++++++++++++++++----------
 DiverseSelector/test/test_distance.py | 20 +++++++++++++-----
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py
index 369dfd11..fec406a6 100644
--- a/DiverseSelector/distance.py
+++ b/DiverseSelector/distance.py
@@ -74,32 +74,39 @@ def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray:
 
 
 def tanimoto(a: np.array, b: np.array) -> float:
-    r"""Compute Tanimoto coefficient.
+    r"""Compute Tanimoto coefficient or index (a.k.a. Jaccard similarity coefficient).
+
+    For two binary or non-binary arrays :math:`A` and :math:`B`, Tanimoto coefficient
+    is defined as the size of their intersection divided by the size of their union:
 
     ..math::
-        T(A,B) = A \cap B / A \cup B
+        T(A, B) = \frac{| A \cap B|}{| A \cup B |} =
+        \frac{| A \cap B|}{|A| + |B| - | A \cap B|} =
+        \frac{A \cdot B}{\|A\|^2 + \|B\|^2 - A \cdot B}
+
+    where :math:`A \cdot B = \sum_i{A_i B_i}` and :math:`\|A\|^2 = \sum_i{A_i^2}`.
 
     Parameters
     ----------
-    a : array_like
-        Data point A's features.
-    b : array_like
-        Data point B's features.
+    a : ndarray
+        The 1D feature array of sample :math:`A` in an `n`-dimensional space.
+    b : ndarray
+        The 1D feature array of sample :math:`B` in an `n`-dimensional space.
 
     Returns
     -------
     coeff : float
-        Tanimoto coefficient for data points A and B.
+        Tanimoto coefficient between feature arrays :math:`A` and :math:`B`.
 
-    Notes
-    -----
-    The Tanimoto coefficient computes similarity by taking the intersection
-    of A and B over their union.
 
     Bajusz, D., Rácz, A., and Héberger, K.. (2015)
     Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?.
     Journal of Cheminformatics 7.
     """
+    if a.ndim != 1 or b.ndim != 1:
+        raise ValueError(f"Arguments a and b should be 1D arrays, got {a.ndim} and {b.ndim}")
+    if a.shape != b.shape:
+        raise ValueError(f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}")
     coeff = (sum(a * b)) / ((sum(a**2)) + (sum(b**2)) - (sum(a * b)))
     return coeff
 
diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py
index 38da82bc..678b883d 100644
--- a/DiverseSelector/test/test_distance.py
+++ b/DiverseSelector/test/test_distance.py
@@ -61,20 +61,30 @@ def test_pairwise_similarity_bit_raises():
     assert_raises(ValueError, pairwise_similarity_bit, np.random.random((5, 1)), tanimoto)
 
 
+def test_tanimoto_raises():
+    # check raised error when a or b is not 1D
+    assert_raises(ValueError, tanimoto, np.random.random((1, 5)), np.random.random(5))
+    assert_raises(ValueError, tanimoto, np.random.random(3), np.random.random((1, 4)))
+    assert_raises(ValueError, tanimoto, np.random.random(4), np.random.random((3, 4)))
+    assert_raises(ValueError, tanimoto, np.random.random((3, 3)), np.random.random((2, 3)))
+    # check raised error when a and b don't have the same length
+    assert_raises(ValueError, tanimoto, np.random.random(3), np.random.random(5))
+    assert_raises(ValueError, tanimoto, np.random.random(20), np.random.random(10))
+
+
 def test_tanimoto():
     """Test the tanimoto function on one pair of points."""
     a = np.array([2, 0, 1])
     b = np.array([2, 0, 0])
     expected = 4 / (5 + 4 - 4)
-    tani = tanimoto(a, b)
-    assert_equal(tani, expected)
+    assert_equal(tanimoto(a, b), expected)
 
 
 def test_tanimoto_matrix():
     """Testing the tanimoto function with predefined feature matrix."""
-    tani = pairwise_similarity_bit(sample3, "tanimoto")
-    expected = np.array([[1, (11 / 19)],
-                         [(11 / 19), 1]])
+    x = np.array([[1, 4], [3, 2]])
+    tani = pairwise_similarity_bit(x, "tanimoto")
+    expected = np.array([[1, (11 / 19)], [(11 / 19), 1]])
     assert_equal(expected, tani)
 
 

From f403882b9fa8ba553421adcb577559bd009519b9 Mon Sep 17 00:00:00 2001
From: Farnaz Heidar-Zadeh <farnaz_chem@yahoo.com>
Date: Thu, 6 Jul 2023 20:47:52 -0400
Subject: [PATCH 07/11] Finalize nearest_average_tanimoto function

---
 DiverseSelector/distance.py           | 45 +++++++++++++--------------
 DiverseSelector/test/test_distance.py | 18 +++++++++--
 2 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py
index fec406a6..66a1bb6d 100644
--- a/DiverseSelector/distance.py
+++ b/DiverseSelector/distance.py
@@ -188,39 +188,38 @@ def modified_tanimoto(a: np.array, b: np.array) -> float:
     return mt
 
 
-def nearest_average_tanimoto(x: np.ndarray) -> float:
-    """Computes the average tanimoto for nearest data points.
+def nearest_average_tanimoto(X: np.ndarray) -> float:
+    """Compute the average tanimoto for nearest data points measured by Minkowski 2-norm.
+
+    For each sample, the closest neighbor is identified by computing its Minkowski 2-norm
+    (i.e., Euclidean) distance with all other samples, and identifying neighboring sample
+    with the shortest distance.
 
     Parameters
     ----------
-    x : ndarray
-        Feature matrix.
+    X : (M, K) array_like
+        Matrix of `M` samples in an `K` dimensional feature space.
 
     Returns
     -------
     float :
-        Average Tanimoto of closest pairs.
-
-    Notes
-    -----
-    This computes the Tanimoto coefficient of pairs of data points
-    with the shortest distances, then returns the average of them.
-    This calculation is explicitly for the explicit diversity index.
+        Average of the Tanimoto coefficients for each sample and its closest neighbor.
 
     Papp, Á., Gulyás-Forró, A., Gulyás, Z., Dormán, G., Ürge, L.,
     and Darvas, F.. (2006) Explicit Diversity Index (EDI):
     A Novel Measure for Assessing the Diversity of Compound Databases.
     Journal of Chemical Information and Modeling 46, 1898-1904.
     """
-    tani = []
-    # calculate euclidean distance between all points
-    #     and adjust for distance to self
-    dist = distance_matrix(x, x) + np.inf*np.eye(x.shape[0])
-    # find closest point for each row of x
-    short_idx = np.argmin(dist, axis=0)
-    for idx, min_d in enumerate(short_idx):
-        # compute the tanimoto coeff for each pair of closest points
-        tani.append(tanimoto(x[idx], x[min_d]))
-    # take the average of all coeffs calculated
-    nat = np.average(tani)
-    return nat
+    # compute euclidean distance between all samples
+    dist = distance_matrix(X, X, p=2)
+    # replace zero self-distance with infinity, before computing nearest neighbors
+    np.fill_diagonal(dist, np.inf)
+    # find index of closest neighbor for each sample
+    nearest_neighbors = np.argmin(dist, axis=0)
+    assert nearest_neighbors.shape == (X.shape[0],)
+    # compute the tanimoto coeff for each sample and its closest neighbor
+    coeffs = []
+    for idx_sample, idx_neighbor in enumerate(nearest_neighbors):
+        coeffs.append(tanimoto(X[idx_sample], X[idx_neighbor]))
+    # return average of all coefficients
+    return np.average(coeffs)
diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py
index 678b883d..a0f90a83 100644
--- a/DiverseSelector/test/test_distance.py
+++ b/DiverseSelector/test/test_distance.py
@@ -138,6 +138,18 @@ def test_nearest_average_tanimoto_bit():
 
 def test_nearest_average_tanimoto():
     """Test the nearest_average_tanimoto function with non-binary input"""
-    nat = nearest_average_tanimoto(sample3)
-    average = 11/19
-    assert_equal(nat, average)
+    x = np.array([[1, 4], [3, 2]])
+    # expected: (1x3 + 4x2) / (1^2 + 4^2 + 3^2 + 2^2 - 1x3 - 4x2)
+    assert_equal(nearest_average_tanimoto(x), 11/19)
+
+
+def test_nearest_average_tanimoto_nonsquare():
+    """Test the nearest_average_tanimoto function with non-binary input"""
+    x = np.array([[3.5, 4.0, 10.5, 0.5], [1.25, 4.0, 7.0, 0.1], [0.0, 0.0, 0.0, 0.0]])
+    # nearest neighbor of sample 0, 1, and 2 are sample 1, 0, and 1, respectively.
+    expected = np.average([
+        np.sum(x[0] * x[1]) / (np.sum(x[0]**2) + np.sum(x[1]**2) - np.sum(x[0] * x[1])),
+        np.sum(x[1] * x[0]) / (np.sum(x[1]**2) + np.sum(x[0]**2) - np.sum(x[1] * x[0])),
+        np.sum(x[2] * x[1]) / (np.sum(x[2]**2) + np.sum(x[1]**2) - np.sum(x[2] * x[1])),
+        ])
+    assert_equal(nearest_average_tanimoto(x), expected)

From 0007c26aa3ba88da7ea68db7f2ca6761d7dcd063 Mon Sep 17 00:00:00 2001
From: Farnaz Heidar-Zadeh <farnaz_chem@yahoo.com>
Date: Thu, 6 Jul 2023 20:59:30 -0400
Subject: [PATCH 08/11] Make Parameters docstring match sklearn

---
 DiverseSelector/distance.py | 40 ++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py
index 66a1bb6d..21f94a51 100644
--- a/DiverseSelector/distance.py
+++ b/DiverseSelector/distance.py
@@ -42,17 +42,17 @@ def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray:
 
     Parameters
     ----------
-    X : ndarray
-        An `m` by `n` feature array of `m` samples in an `n`-dimensional feature space.
+    X : ndarray of shape (n_samples, n_features)
+        Feature matrix of `n_samples` samples in `n_features` dimensional space.
     metric : str
+        The metric used when calculating similarity coefficients between samples in a feature array.
         Method for calculating similarity coefficient. Options: `"tanimoto"`, `"modified_tanimoto"`.
 
     Returns
     -------
-    pair_simi : ndarray
-        Returns a symmetric `m` by `m` array containing the similarity coefficient between
-        each pair of samples in the feature matrix. The diagonal elements are directly
-        computed instead of assuming that they are 1.
+    s : ndarray of shape (n_samples, n_samples)
+        A symmetric similarity matrix between each pair of samples in the feature matrix.
+        The diagonal elements are directly computed instead of assuming that they are 1.
     """
 
     available_methods = {
@@ -65,12 +65,12 @@ def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray:
         raise ValueError(f"Argument features should be a 2D array, got {X.ndim}")
 
     # make pairwise m-by-m similarity matrix
-    m = len(X)
-    pair_simi = np.zeros((m, m))
+    n_samples = len(X)
+    s = np.zeros((n_samples, n_samples))
     # compute similarity between all pairs of points (including the diagonal elements)
-    for i, j in combinations_with_replacement(range(m), 2):
-        pair_simi[i, j] = pair_simi[j, i] = available_methods[metric](X[i], X[j])
-    return pair_simi
+    for i, j in combinations_with_replacement(range(n_samples), 2):
+        s[i, j] = s[j, i] = available_methods[metric](X[i], X[j])
+    return s
 
 
 def tanimoto(a: np.array, b: np.array) -> float:
@@ -88,17 +88,16 @@ def tanimoto(a: np.array, b: np.array) -> float:
 
     Parameters
     ----------
-    a : ndarray
-        The 1D feature array of sample :math:`A` in an `n`-dimensional space.
-    b : ndarray
-        The 1D feature array of sample :math:`B` in an `n`-dimensional space.
+    a : ndarray of shape (n_features,)
+        The 1D feature array of sample :math:`A` in an `n_features` dimensional space.
+    b : ndarray of shape (n_features,)
+        The 1D feature array of sample :math:`B` in an `n_features` dimensional space.
 
     Returns
     -------
     coeff : float
         Tanimoto coefficient between feature arrays :math:`A` and :math:`B`.
 
-
     Bajusz, D., Rácz, A., and Héberger, K.. (2015)
     Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?.
     Journal of Cheminformatics 7.
@@ -128,6 +127,11 @@ def modified_tanimoto(a: np.array, b: np.array) -> float:
 
     Parameters
     ----------
+    a : ndarray of shape (n_features,)
+        The 1D feature array of sample :math:`A` in an `n_features` dimensional space.
+    b : ndarray of shape (n_features,)
+        The 1D feature array of sample :math:`B` in an `n_features` dimensional space.
+
     a : array_like
         Data point A's features in bitstring.
     b : array_like
@@ -197,8 +201,8 @@ def nearest_average_tanimoto(X: np.ndarray) -> float:
 
     Parameters
     ----------
-    X : (M, K) array_like
-        Matrix of `M` samples in an `K` dimensional feature space.
+    X : ndarray of shape (n_samples, n_features)
+        Feature matrix of `n_samples` samples in `n_features` dimensional space.
 
     Returns
     -------

From 1af0bd4fdae1a6900189f45709d47ccd79959c2c Mon Sep 17 00:00:00 2001
From: Farnaz Heidar-Zadeh <farnaz_chem@yahoo.com>
Date: Thu, 6 Jul 2023 21:19:18 -0400
Subject: [PATCH 09/11] Finalize test_distance.py and apply black

---
 DiverseSelector/test/test_distance.py | 87 ++++++++++++---------------
 1 file changed, 37 insertions(+), 50 deletions(-)

diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py
index a0f90a83..e13b7aff 100644
--- a/DiverseSelector/test/test_distance.py
+++ b/DiverseSelector/test/test_distance.py
@@ -21,36 +21,18 @@
 #
 # --
 
-"""Testing for the distance and similarity algorithms in the distance.py module."""
+"""Test distance.py Module."""
 
-from DiverseSelector.distance import (pairwise_similarity_bit,
-                                      nearest_average_tanimoto,
-                                      tanimoto,
-                                      modified_tanimoto
-                                      )
+from DiverseSelector.distance import (
+    pairwise_similarity_bit,
+    nearest_average_tanimoto,
+    tanimoto,
+    modified_tanimoto,
+)
 
 import numpy as np
 from numpy.testing import assert_almost_equal, assert_equal, assert_raises
 
-# each row is a feature and each column is a molecule
-sample1 = np.array([[4, 2, 6],
-                    [4, 9, 6],
-                    [2, 5, 0],
-                    [2, 0, 9],
-                    [5, 3, 0]])
-
-# each row is a molecule and each column is a feature (scipy)
-sample2 = np.array([[1, 1, 0, 0, 0],
-                    [0, 1, 1, 0, 0],
-                    [0, 0, 0, 1, 0],
-                    [0, 0, 0, 0, 1]])
-
-sample3 = np.array([[1, 4],
-                    [3, 2]])
-
-sample4 = np.array([[1, 0, 1],
-                    [0, 1, 1]])
-
 
 def test_pairwise_similarity_bit_raises():
     # check raised error for input feature matrix that is not 2D
@@ -76,45 +58,47 @@ def test_tanimoto():
     """Test the tanimoto function on one pair of points."""
     a = np.array([2, 0, 1])
     b = np.array([2, 0, 0])
-    expected = 4 / (5 + 4 - 4)
-    assert_equal(tanimoto(a, b), expected)
+    # expected = (2*2 + 0*0 + 1*0) / (2**2 + 1 + 2**2 - 2*2)
+    assert_equal(tanimoto(a, b), 4 / (5 + 4 - 4))
+
+
+def test_tanimoto_bitstring():
+    """Test the tanimoto function on one pair of bitstring points."""
+    a = np.array([0, 0, 0, 1, 0, 1, 1])
+    b = np.array([1, 1, 0, 0, 0, 1, 1])
+    assert_equal(tanimoto(a, b), 2 / 5)
 
 
 def test_tanimoto_matrix():
     """Testing the tanimoto function with predefined feature matrix."""
     x = np.array([[1, 4], [3, 2]])
-    tani = pairwise_similarity_bit(x, "tanimoto")
+    s = pairwise_similarity_bit(x, "tanimoto")
     expected = np.array([[1, (11 / 19)], [(11 / 19), 1]])
-    assert_equal(expected, tani)
+    assert_equal(s, expected)
 
 
 def test_modified_tanimoto():
     a = np.array([1, 1, 0, 0, 1])
     b = np.array([0, 0, 0, 0, 1])
-    expected = (1.6 / 9) + (1.4/6)
-    mod_tani = modified_tanimoto(a, b)
-    assert_equal(mod_tani, expected)
+    expected = (1.6 / 9) + (1.4 / 6)
+    assert_equal(modified_tanimoto(a, b), expected)
 
 
 def test_modified_tanimoto_all_ones():
     """Test the modified tanimoto function when input is all '1' bits"""
     a = np.array([1, 1, 1, 1, 1])
-    expected = 1
-    mod_tani = modified_tanimoto(a,a)
-    assert_equal(mod_tani, expected)
+    assert_equal(modified_tanimoto(a, a), 1)
 
 
 def test_modified_tanimoto_all_zeroes():
     """Test the modified tanimoto function when input is all '0' bits"""
     a = np.zeros(5)
-    expected = 1
-    mod_tani = modified_tanimoto(a, a)
-    assert_equal(mod_tani, expected)
+    assert_equal(modified_tanimoto(a, a), 1)
 
 
 def test_modified_tanimoto_dimension_error():
     """Test modified tanimoto raises error when input has incorrect dimension."""
-    a = np.zeros([7,5])
+    a = np.zeros([7, 5])
     b = np.zeros(5)
     assert_raises(ValueError, modified_tanimoto, a, b)
     assert_raises(ValueError, modified_tanimoto, b, a)
@@ -122,15 +106,16 @@ def test_modified_tanimoto_dimension_error():
 
 def test_modified_tanimoto_matrix():
     """Testing the modified tanimoto function with predefined feature matrix."""
-    mod_tani = pairwise_similarity_bit(sample4, "modified_tanimoto")
-    expceted = np.array([[1, (4 / 27)],
-                         [(4 / 27), 1]])
-    assert_equal(mod_tani, expceted)
+    x = np.array([[1, 0, 1], [0, 1, 1]])
+    s = pairwise_similarity_bit(x, "modified_tanimoto")
+    expected = np.array([[1, (4 / 27)], [(4 / 27), 1]])
+    assert_equal(s, expected)
 
 
 def test_nearest_average_tanimoto_bit():
     """Test the nearest_average_tanimoto function with binary input"""
-    nat = nearest_average_tanimoto(sample2)
+    x = np.array([[1, 1, 0, 0, 0], [0, 1, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]])
+    nat = nearest_average_tanimoto(x)
     shortest_tani = [0.3333333, 0.3333333, 0, 0]
     average = np.average(shortest_tani)
     assert_almost_equal(nat, average)
@@ -140,16 +125,18 @@ def test_nearest_average_tanimoto():
     """Test the nearest_average_tanimoto function with non-binary input"""
     x = np.array([[1, 4], [3, 2]])
     # expected: (1x3 + 4x2) / (1 + 4^2 + 3^3 + 2^2 - 1x3 - 4x2)
-    assert_equal(nearest_average_tanimoto(x), 11/19)
+    assert_equal(nearest_average_tanimoto(x), 11 / 19)
 
 
 def test_nearest_average_tanimoto_nonsquare():
     """Test the nearest_average_tanimoto function with non-binary input"""
     x = np.array([[3.5, 4.0, 10.5, 0.5], [1.25, 4.0, 7.0, 0.1], [0.0, 0.0, 0.0, 0.0]])
     # nearest neighbor of sample 0, 1, and 2 are sample 1, 0, and 1, respectively.
-    expected = np.average([
-        np.sum(x[0] * x[1]) / (np.sum(x[0]**2) + np.sum(x[1]**2) - np.sum(x[0] * x[1])),
-        np.sum(x[1] * x[0]) / (np.sum(x[1]**2) + np.sum(x[0]**2) - np.sum(x[1] * x[0])),
-        np.sum(x[2] * x[1]) / (np.sum(x[2]**2) + np.sum(x[1]**2) - np.sum(x[2] * x[1])),
-        ])
+    expected = np.average(
+        [
+            np.sum(x[0] * x[1]) / (np.sum(x[0] ** 2) + np.sum(x[1] ** 2) - np.sum(x[0] * x[1])),
+            np.sum(x[1] * x[0]) / (np.sum(x[1] ** 2) + np.sum(x[0] ** 2) - np.sum(x[1] * x[0])),
+            np.sum(x[2] * x[1]) / (np.sum(x[2] ** 2) + np.sum(x[1] ** 2) - np.sum(x[2] * x[1])),
+        ]
+    )
     assert_equal(nearest_average_tanimoto(x), expected)

From d7f7e3ab278b59fd508ea6b8cbfeb7bc0355af37 Mon Sep 17 00:00:00 2001
From: Farnaz Heidar-Zadeh <farnaz_chem@yahoo.com>
Date: Thu, 6 Jul 2023 21:35:50 -0400
Subject: [PATCH 10/11] Update docstring, black fixes, & error messages

---
 DiverseSelector/distance.py           | 65 +++++++++++++--------------
 DiverseSelector/test/test_distance.py |  1 +
 2 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py
index 21f94a51..44a70327 100644
--- a/DiverseSelector/distance.py
+++ b/DiverseSelector/distance.py
@@ -21,20 +21,15 @@
 #
 # --
 
-"""Metric calculation module."""
+"""Similarity Module."""
 
 
 import numpy as np
 from itertools import combinations_with_replacement
 from scipy.spatial import distance_matrix
-from scipy.spatial.distance import squareform
 
-__all__ = [
-    "pairwise_similarity_bit",
-    "tanimoto",
-    "modified_tanimoto",
-    "nearest_average_tanimoto",
-]
+
+__all__ = ["pairwise_similarity_bit", "tanimoto", "modified_tanimoto", "nearest_average_tanimoto"]
 
 
 def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray:
@@ -60,7 +55,9 @@ def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray:
         "modified_tanimoto": modified_tanimoto,
     }
     if metric not in available_methods:
-        raise ValueError(f"Argument metric={metric} is not recognized! Choose from {available_methods.keys()}")
+        raise ValueError(
+            f"Argument metric={metric} is not recognized! Choose from {available_methods.keys()}"
+        )
     if X.ndim != 2:
         raise ValueError(f"Argument features should be a 2D array, got {X.ndim}")
 
@@ -105,8 +102,10 @@ def tanimoto(a: np.array, b: np.array) -> float:
     if a.ndim != 1 or b.ndim != 1:
         raise ValueError(f"Arguments a and b should be 1D arrays, got {a.ndim} and {b.ndim}")
     if a.shape != b.shape:
-        raise ValueError(f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}")
-    coeff = (sum(a * b)) / ((sum(a**2)) + (sum(b**2)) - (sum(a * b)))
+        raise ValueError(
+            f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}"
+        )
+    coeff = sum(a * b) / (sum(a**2) + sum(b**2) - sum(a * b))
     return coeff
 
 
@@ -128,19 +127,14 @@ def modified_tanimoto(a: np.array, b: np.array) -> float:
     Parameters
     ----------
     a : ndarray of shape (n_features,)
-        The 1D feature array of sample :math:`A` in an `n_features` dimensional space.
+        The 1D bitstring feature array of sample :math:`A` in an `n_features` dimensional space.
     b : ndarray of shape (n_features,)
-        The 1D feature array of sample :math:`B` in an `n_features` dimensional space.
-
-    a : array_like
-        Data point A's features in bitstring.
-    b : array_like
-        Data point B's features in bitstring.
+        The 1D bitstring feature array of sample :math:`B` in an `n_features` dimensional space.
 
     Returns
     -------
     mt : float
-        Modified tanimoto coefficient for molecule A and B.
+        Modified tanimoto coefficient between bitstring feature arrays :math:`A` and :math:`B`.
 
     Notes
     -----
@@ -162,30 +156,33 @@ def modified_tanimoto(a: np.array, b: np.array) -> float:
         raise ValueError(f"Argument `a` should have dimension 1 rather than {a.ndim}.")
     if b.ndim != 1:
         raise ValueError(f"Argument `b` should have dimension 1 rather than {b.ndim}.")
+    if a.shape != b.shape:
+        raise ValueError(
+            f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}"
+        )
 
-    n = len(a)
+    n_features = len(a)
     # number of common '1' bits between points A and B
     n_11 = sum(a * b)
     # number of common '0' bits between points A and B
     n_00 = sum((1 - a) * (1 - b))
 
-    # calculate Tanimoto coeff based on '1' bits
-    if n_00 == n:
-        # bit string is all '0's
-        t_1 = 1
-    else:
-        t_1 = n_11 / (n - n_00)
-    # calculate Tanimoto coeff based on '1' bits
-    if n_11 == n:
-        # bit string is all '1's
-        t_0 = 1
-    else:
-        t_0 = n_00 / (n - n_11)
+    # calculate Tanimoto coefficient based on '1' bits
+    t_1 = 1
+    if n_00 != n_features:
+        # bit strings are not all '0's
+        t_1 = n_11 / (n_features - n_00)
+    # calculate Tanimoto coefficient based on '0' bits
+    t_0 = 1
+    if n_11 != n_features:
+        # bit strings are not all '1's
+        t_0 = n_00 / (n_features - n_11)
+
     # combine into modified tanimoto using Bernoulli Model
     # p = independent success trials
     #       evaluated as total number of '1' bits
     #       divided by 2x the fingerprint length
-    p = (n - n_00 + n_11) / (2 * n)
+    p = (n_features - n_00 + n_11) / (2 * n_features)
     # mt = x * T_1 + (1-x) * T_0
     #       x = (2-p)/3 so that E(mt) = 1/3, no matter the value of p
     mt = (((2 - p) / 3) * t_1) + (((1 + p) / 3) * t_0)
@@ -221,7 +218,7 @@ def nearest_average_tanimoto(X: np.ndarray) -> float:
     # find index of closest neighbor for each sample
     nearest_neighbors = np.argmin(dist, axis=0)
     assert nearest_neighbors.shape == (X.shape[0],)
-    # compute the tanimoto coeff for each sample and its closest neighbor
+    # compute the tanimoto coefficient for each sample and its closest neighbor
     coeffs = []
     for idx_sample, idx_neighbor in enumerate(nearest_neighbors):
         coeffs.append(tanimoto(X[idx_sample], X[idx_neighbor]))
diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py
index e13b7aff..8cb0099e 100644
--- a/DiverseSelector/test/test_distance.py
+++ b/DiverseSelector/test/test_distance.py
@@ -102,6 +102,7 @@ def test_modified_tanimoto_dimension_error():
     b = np.zeros(5)
     assert_raises(ValueError, modified_tanimoto, a, b)
     assert_raises(ValueError, modified_tanimoto, b, a)
+    assert_raises(ValueError, modified_tanimoto, np.ones(3), np.ones(5))
 
 
 def test_modified_tanimoto_matrix():

From 3fc9b3e6370daea0ad76cb7c359abd5e86f634bb Mon Sep 17 00:00:00 2001
From: Fanwang Meng <fwmeng88@gmail.com>
Date: Sun, 9 Jul 2023 00:08:22 -0400
Subject: [PATCH 11/11] Fix coverage configuration

---
 .coveragerc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.coveragerc b/.coveragerc
index e956eb4c..bdb67bcb 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,6 +1,6 @@
 [run]
 omit =
-    DiverseSelector/*/test/*
+    DiverseSelector/*/tests/*
     DiverseSelector/test/*
     DiverseSelector/__init__.py
     DiverseSelector/_version.py