From 84b6d6c6d07d6745da89f15c50c543211b20fe6a Mon Sep 17 00:00:00 2001 From: awb9691 Date: Tue, 20 Jun 2023 12:40:01 -0400 Subject: [PATCH 01/11] remove compute_distance_matrix, add tanimoto tests --- DiverseSelector/distance.py | 46 ++------------------------- DiverseSelector/test/test_distance.py | 46 ++++++++++++--------------- 2 files changed, 23 insertions(+), 69 deletions(-) diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py index ed76cc46..ef3a0701 100644 --- a/DiverseSelector/distance.py +++ b/DiverseSelector/distance.py @@ -28,7 +28,6 @@ from scipy.spatial.distance import squareform __all__ = [ - "compute_distance_matrix", "pairwise_similarity_bit", "tanimoto", "modified_tanimoto", @@ -36,47 +35,6 @@ ] -def compute_distance_matrix( - features: np.ndarray, - metric: str -): - """Compute pairwise distance given a feature matrix. - - Parameters - ---------- - features : np.ndarray - Molecule feature matrix. - metric : str - Distance metric. - - Returns - ------- - dist : ndarray - Symmetric distance array. - """ - # todo: add more metrics implemented here - built_in_metrics = { - "tanimoto": tanimoto, - "modified_tanimoto": modified_tanimoto, - } - - # Check if specified metric is supported - if metric in built_in_metrics: - distances = [] - size = len(features) - for i in range(0, size): - for j in range(i + 1, size): - # use the metric to compute distance between all molecule pairs - distances.append(1 - built_in_metrics[metric](features[i], features[j])) - # shape into symmetric matrix - dist = squareform(distances) - - else: # raise error if unsupported - raise ValueError(f"Metric {metric} is not supported by the library.") - - return dist - - def pairwise_similarity_bit(features: np.array, metric: str) -> np.ndarray: """Compute the pairwise similarity coefficients and returns them in a square symmetric matrix. @@ -130,7 +88,8 @@ def tanimoto(a: np.array, b: np.array) -> float: Notes ----- - The Tanimoto coefficient computes similarity by taking the intersection of A and B over their union. + The Tanimoto coefficient computes similarity by taking the intersection + of A and B over their union. Bajusz, D., Rácz, A., and Héberger, K.. (2015) Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?. @@ -237,4 +196,3 @@ def nearest_average_tanimoto(x: np.ndarray) -> float: # compute average of all shortest tanimoto coeffs nat = np.average(tani) return nat - diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py index 42320165..10033230 100644 --- a/DiverseSelector/test/test_distance.py +++ b/DiverseSelector/test/test_distance.py @@ -23,9 +23,10 @@ """Testing for the distance and similarity algorithms in the distance.py module.""" -from DiverseSelector.distance import (compute_distance_matrix, - pairwise_similarity_bit, - nearest_average_tanimoto +from DiverseSelector.distance import (pairwise_similarity_bit, + nearest_average_tanimoto, + tanimoto, + modified_tanimoto ) import numpy as np @@ -51,22 +52,16 @@ [0, 1, 1]]) -def test_compute_distance_matrix_builtin(): - """Testing the compute distance matrix with a built in metric.""" - sci_dist = compute_distance_matrix(sample2, "tanimoto") - expected = np.array([[0, 0.6666667, 1, 1], - [0.6666667, 0, 1, 1], - [1, 1, 0, 1], - [1, 1, 1, 0]]) - assert_almost_equal(expected, sci_dist) - - -def test_compute_distance_matrix_invalid_metric(): - """Testing the compute distance matrix with an invalid metric.""" - assert_raises(ValueError, compute_distance_matrix, sample1, "fake_distance") +def test_tanimoto(): + """Test the tanimoto function on one pair of points.""" + a = np.array([2, 0, 1]) + b = np.array([2, 0, 0]) + expected = 4 / (5 + 4 - 4) + tani = tanimoto(a, b) + assert_equal(tani, expected) -def test_tanimoto(): +def test_tanimoto_matrix(): """Testing the tanimoto function with predefined feature matrix.""" tani = pairwise_similarity_bit(sample3, "tanimoto") expected = np.array([[1, (11 / 19)], @@ -74,7 +69,15 @@ def test_tanimoto(): assert_equal(expected, tani) -def test_modifed_tanimoto(): +def test_modified_tanimoto(): + a = np.array([1, 1, 0, 0, 1]) + b = np.array([0, 0, 0, 0, 1]) + expected = (1.6 / 9) + (1.4/6) + mod_tani = modified_tanimoto(a, b) + assert_equal(mod_tani, expected) + + +def test_modified_tanimoto_matrix(): """Testing the modified tanimoto function with predefined feature matrix.""" mod_tani = pairwise_similarity_bit(sample4, "modified_tanimoto") expceted = np.array([[1, (4 / 27)], @@ -96,10 +99,3 @@ def test_nearest_average_tanimoto(): shortest_tani = [(11/19), (11/19)] average = np.average(shortest_tani) assert_equal(nat, average) - - - - - - - From 6d4b4bc3e768d0f49bd78f26701fcca638999ca5 Mon Sep 17 00:00:00 2001 From: awb9691 Date: Tue, 20 Jun 2023 15:13:17 -0400 Subject: [PATCH 02/11] add testing, docs to modified_tanimoto --- DiverseSelector/distance.py | 29 ++++++++++++++++++++++----- DiverseSelector/test/test_distance.py | 24 ++++++++++++++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py index ef3a0701..48496054 100644 --- a/DiverseSelector/distance.py +++ b/DiverseSelector/distance.py @@ -106,11 +106,13 @@ def modified_tanimoto(a: np.array, b: np.array) -> float: smaller molecules using a Bernoulli probability model. ..math:: - mt = \frac{2-p}{3}t_1 + \frac{1+p}{3}t_0$ - where - p = success probability of independent trials - $t_1 = | A \cap B |$ - $t_0 = |(1-A) \cap (1-B)|$ + MT = \frac{2-p}{3}T_1 + \frac{1+p}{3}T_0 + + where :math:`p` is success probability of independent trials, + :math:`T_1` is the number of common '1' bits between molecules + (:math:`T_1 = | A \cap B |`), and :math:`T_0` is the number of common '0' + bits between molecules (:math:`T_0 = |(1-A) \cap (1-B)|`). + Parameters ---------- @@ -126,12 +128,29 @@ def modified_tanimoto(a: np.array, b: np.array) -> float: Notes ----- + The equation above has been derived from + + ..math:: + MT_\alpha= {\alpha}T_1 + (1-\alpha)T_0 + + where :math:`\alpha = \frac{2-p}{3}`. This is done so that the expected value + of the modified tanimoto, :math:`E(MT)`, remains constant even as the number of + trials :math:`p` grows larger. Fligner, M. A., Verducci, J. S., and Blower, P. E.. (2002) A Modification of the Jaccard-Tanimoto Similarity Index for Diverse Selection of Chemical Compounds Using Binary Strings. Technometrics 44, 110-119. """ + if a.ndim != 1: + raise ValueError( + f"Argument `a` should have dimension 1 rather than {a.ndim}." + ) + if b.ndim != 1: + raise ValueError( + f"Argument `b` should have dimension 1 rather than {b.ndim}." + ) + n = len(a) # intersection of '1' bits n_11 = sum(a * b) diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py index 10033230..2f61f497 100644 --- a/DiverseSelector/test/test_distance.py +++ b/DiverseSelector/test/test_distance.py @@ -77,6 +77,30 @@ def test_modified_tanimoto(): assert_equal(mod_tani, expected) +def test_modified_tanimoto_all_ones(): + """Test the modified tanimoto function when input is all '1' bits""" + a = np.array([1, 1, 1, 1, 1]) + expected = 1 + mod_tani = modified_tanimoto(a,a) + assert_equal(mod_tani, expected) + + +def test_modified_tanimoto_all_zeroes(): + """Test the modified tanimoto function when input is all '0' bits""" + a = np.zeros(5) + expected = 1 + mod_tani = modified_tanimoto(a, a) + assert_equal(mod_tani, expected) + + +def test_modified_tanimoto_dimension_error(): + """Test modified tanimoto raises error when input has incorrect dimension.""" + a = np.zeros([7,5]) + b = np.zeros(5) + assert_raises(ValueError, modified_tanimoto, a, b) + assert_raises(ValueError, modified_tanimoto, b, a) + + def test_modified_tanimoto_matrix(): """Testing the modified tanimoto function with predefined feature matrix.""" mod_tani = pairwise_similarity_bit(sample4, "modified_tanimoto") From 28a857ab16daf2a0c83a5d098efbebc3c6bf1c65 Mon Sep 17 00:00:00 2001 From: awb9691 Date: Tue, 20 Jun 2023 15:57:59 -0400 Subject: [PATCH 03/11] optimize nearest_average_tanimoto, add comments to modified_tanimoto --- DiverseSelector/distance.py | 45 ++++++++++++++------------- DiverseSelector/test/test_distance.py | 3 +- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py index 48496054..2f47f87b 100644 --- a/DiverseSelector/distance.py +++ b/DiverseSelector/distance.py @@ -25,6 +25,7 @@ import numpy as np +from scipy.spatial import distance_matrix from scipy.spatial.distance import squareform __all__ = [ @@ -152,23 +153,30 @@ def modified_tanimoto(a: np.array, b: np.array) -> float: ) n = len(a) - # intersection of '1' bits + # number of common '1' bits between molecules A and B n_11 = sum(a * b) - # intersection of '0' bits + # number of common '0' bits between molecules A and B n_00 = sum((1 - a) * (1 - b)) - # calculate in terms of '1' bits + # calculate Tanimoto coeff based on '1' bits if n_00 == n: + # bit string is all '0's t_1 = 1 else: t_1 = n_11 / (n - n_00) - # calculate in terms of '0' bits + # calculate Tanimoto coeff based on '1' bits if n_11 == n: + # bit string is all '1's t_0 = 1 else: t_0 = n_00 / (n - n_11) # combine into modified tanimoto using Bernoulli Model - p = ((n - n_00) + n_11) / (2 * n) + # p = independent success trials + # evaluated as total number of '1' bits + # divided by 2x the fingerprint length + p = (n - n_00 + n_11) / (2 * n) + # mt = x * T_1 + (1-x) * T_0 + # x = (2-p)/3 so that E(mt) = 1/3, no matter the value of p mt = (((2 - p) / 3) * t_1) + (((1 + p) / 3) * t_0) return mt @@ -183,7 +191,7 @@ def nearest_average_tanimoto(x: np.ndarray) -> float: Returns ------- - nat : float + float : Average tanimoto of closest pairs. Notes @@ -198,20 +206,15 @@ def nearest_average_tanimoto(x: np.ndarray) -> float: Journal of Chemical Information and Modeling 46, 1898-1904. """ tani = [] - for idx, _ in enumerate(x): - # arbitrary distance for comparison: - short = 100 - a = 0 - b = 0 - # search for shortest distance point from idx - for jdx, _ in enumerate(x): - dist = np.linalg.norm(x[idx]-x[jdx]) - if dist < short and idx != jdx: - short = dist - a = idx - b = jdx - # calculate tanimoto for each shortest dist pair - tani.append(tanimoto(x[a], x[b])) - # compute average of all shortest tanimoto coeffs + # calculate euclidean distance between all points + # and adjust for distance to self + dist = distance_matrix(x, x) + 100*np.eye(x.shape[0]) + # find closest point for each row of x + short_idx = np.argmin(dist, axis=0) + print(f"these are the shortest indices:", short_idx) + for idx in range(0, len(short_idx)): + # compute the tanimoto coeff for each pair of closest points + tani.append(tanimoto(x[idx], x[short_idx[idx]])) + # take the average of all coeffs calculated nat = np.average(tani) return nat diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py index 2f61f497..7aee0e18 100644 --- a/DiverseSelector/test/test_distance.py +++ b/DiverseSelector/test/test_distance.py @@ -120,6 +120,5 @@ def test_nearest_average_tanimoto_bit(): def test_nearest_average_tanimoto(): """Test the nearest_average_tanimoto function with non-binary input""" nat = nearest_average_tanimoto(sample3) - shortest_tani = [(11/19), (11/19)] - average = np.average(shortest_tani) + average = 11/19 assert_equal(nat, average) From cf88ce3202add81aabd2bf4df07c924bc359ad0e Mon Sep 17 00:00:00 2001 From: AWBroscius Date: Wed, 5 Jul 2023 16:30:13 -0400 Subject: [PATCH 04/11] address pull request comments, clean docs --- DiverseSelector/distance.py | 53 +++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py index 2f47f87b..9678d9b3 100644 --- a/DiverseSelector/distance.py +++ b/DiverseSelector/distance.py @@ -32,7 +32,7 @@ "pairwise_similarity_bit", "tanimoto", "modified_tanimoto", - "nearest_average_tanimoto" + "nearest_average_tanimoto", ] @@ -50,7 +50,7 @@ def pairwise_similarity_bit(features: np.array, metric: str) -> np.ndarray: Returns ------- pair_coeff : ndarray - Similarity coefficients for all molecule pairs in feature matrix. + Similarity coefficients for all data point pairs in feature matrix. """ function_dict = { @@ -78,14 +78,14 @@ def tanimoto(a: np.array, b: np.array) -> float: Parameters ---------- a : array_like - Molecule A's features. + Data point A's features. b : array_like - Molecules B's features. + Data point B's features. Returns ------- coeff : float - Tanimoto coefficient for molecules A and B. + Tanimoto coefficient for data points A and B. Notes ----- @@ -96,31 +96,31 @@ def tanimoto(a: np.array, b: np.array) -> float: Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?. Journal of Cheminformatics 7. """ - coeff = (sum(a * b)) / ((sum(a ** 2)) + (sum(b ** 2)) - (sum(a * b))) + coeff = (sum(a * b)) / ((sum(a**2)) + (sum(b**2)) - (sum(a * b))) return coeff def modified_tanimoto(a: np.array, b: np.array) -> float: - r"""Compute the modified tanimoto coefficient from bitstrings of molecules A and B. + r"""Compute the modified tanimoto coefficient from bitstring vectors of data points A and B. Adjusts calculation of the Tanimoto coefficient to counter its natural bias towards - smaller molecules using a Bernoulli probability model. + shorter vectors using a Bernoulli probability model. ..math:: MT = \frac{2-p}{3}T_1 + \frac{1+p}{3}T_0 where :math:`p` is success probability of independent trials, - :math:`T_1` is the number of common '1' bits between molecules + :math:`T_1` is the number of common '1' bits between data points (:math:`T_1 = | A \cap B |`), and :math:`T_0` is the number of common '0' - bits between molecules (:math:`T_0 = |(1-A) \cap (1-B)|`). + bits between data points (:math:`T_0 = |(1-A) \cap (1-B)|`). Parameters ---------- a : array_like - Molecule A's features in bitstring. + Data point A's features in bitstring. b : array_like - Molecules B's features in bitstring. + Data point B's features in bitstring. Returns ------- @@ -144,18 +144,14 @@ def modified_tanimoto(a: np.array, b: np.array) -> float: Technometrics 44, 110-119. """ if a.ndim != 1: - raise ValueError( - f"Argument `a` should have dimension 1 rather than {a.ndim}." - ) + raise ValueError(f"Argument `a` should have dimension 1 rather than {a.ndim}.") if b.ndim != 1: - raise ValueError( - f"Argument `b` should have dimension 1 rather than {b.ndim}." - ) + raise ValueError(f"Argument `b` should have dimension 1 rather than {b.ndim}.") n = len(a) - # number of common '1' bits between molecules A and B + # number of common '1' bits between points A and B n_11 = sum(a * b) - # number of common '0' bits between molecules A and B + # number of common '0' bits between points A and B n_00 = sum((1 - a) * (1 - b)) # calculate Tanimoto coeff based on '1' bits @@ -182,7 +178,7 @@ def modified_tanimoto(a: np.array, b: np.array) -> float: def nearest_average_tanimoto(x: np.ndarray) -> float: - """Computes the average tanimoto for nearest molecules. + """Computes the average tanimoto for nearest data points. Parameters ---------- @@ -192,13 +188,13 @@ def nearest_average_tanimoto(x: np.ndarray) -> float: Returns ------- float : - Average tanimoto of closest pairs. + Average Tanimoto of closest pairs. Notes ----- - This computes the tanimoto coefficient of pairs with the shortest - distances, then returns the average of them. - This calculation is explictly for the explicit diversity index. + This computes the Tanimoto coefficient of pairs of data points + with the shortest distances, then returns the average of them. + This calculation is explicitly for the explicit diversity index. Papp, Á., Gulyás-Forró, A., Gulyás, Z., Dormán, G., Ürge, L., and Darvas, F.. (2006) Explicit Diversity Index (EDI): @@ -208,13 +204,12 @@ def nearest_average_tanimoto(x: np.ndarray) -> float: tani = [] # calculate euclidean distance between all points # and adjust for distance to self - dist = distance_matrix(x, x) + 100*np.eye(x.shape[0]) + dist = distance_matrix(x, x) + np.inf*np.eye(x.shape[0]) # find closest point for each row of x short_idx = np.argmin(dist, axis=0) - print(f"these are the shortest indices:", short_idx) - for idx in range(0, len(short_idx)): + for idx, min_d in enumerate(short_idx): # compute the tanimoto coeff for each pair of closest points - tani.append(tanimoto(x[idx], x[short_idx[idx]])) + tani.append(tanimoto(x[idx], x[min_d])) # take the average of all coeffs calculated nat = np.average(tani) return nat From caabee99027ee2ca607098f3b81ef819aea8d4b7 Mon Sep 17 00:00:00 2001 From: Farnaz Heidar-Zadeh Date: Wed, 5 Jul 2023 19:50:32 -0400 Subject: [PATCH 05/11] Finalize pairwise_similarity_bit function --- DiverseSelector/distance.py | 42 +++++++++++++++------------ DiverseSelector/test/test_distance.py | 9 ++++++ 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py index 9678d9b3..369dfd11 100644 --- a/DiverseSelector/distance.py +++ b/DiverseSelector/distance.py @@ -25,6 +25,7 @@ import numpy as np +from itertools import combinations_with_replacement from scipy.spatial import distance_matrix from scipy.spatial.distance import squareform @@ -36,37 +37,40 @@ ] -def pairwise_similarity_bit(features: np.array, metric: str) -> np.ndarray: - """Compute the pairwise similarity coefficients and returns them in - a square symmetric matrix. +def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray: + """Compute pairwise similarity coefficient matrix. Parameters ---------- - features : ndarray - Feature matrix. + X : ndarray + An `m` by `n` feature array of `m` samples in an `n`-dimensional feature space. metric : str - Method of calculation. + Method for calculating similarity coefficient. Options: `"tanimoto"`, `"modified_tanimoto"`. Returns ------- - pair_coeff : ndarray - Similarity coefficients for all data point pairs in feature matrix. + pair_simi : ndarray + Returns a symmetric `m` by `m` array containing the similarity coefficient between + each pair of samples in the feature matrix. The diagonal elements are directly + computed instead of assuming that they are 1. """ - function_dict = { + available_methods = { "tanimoto": tanimoto, "modified_tanimoto": modified_tanimoto, } - - pair_simi = [] - size = len(features) - for i in range(0, size): - for j in range(i + 1, size): - # use the specified metric to compute similarity between all distinct molecule pairs - pair_simi.append(function_dict[metric](features[i], features[j])) - # shape into symmetric matrix - pair_coeff = squareform(pair_simi) + np.identity(size) - return pair_coeff + if metric not in available_methods: + raise ValueError(f"Argument metric={metric} is not recognized! Choose from {available_methods.keys()}") + if X.ndim != 2: + raise ValueError(f"Argument features should be a 2D array, got {X.ndim}") + + # make pairwise m-by-m similarity matrix + m = len(X) + pair_simi = np.zeros((m, m)) + # compute similarity between all pairs of points (including the diagonal elements) + for i, j in combinations_with_replacement(range(m), 2): + pair_simi[i, j] = pair_simi[j, i] = available_methods[metric](X[i], X[j]) + return pair_simi def tanimoto(a: np.array, b: np.array) -> float: diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py index 7aee0e18..38da82bc 100644 --- a/DiverseSelector/test/test_distance.py +++ b/DiverseSelector/test/test_distance.py @@ -52,6 +52,15 @@ [0, 1, 1]]) +def test_pairwise_similarity_bit_raises(): + # check raised error for input feature matrix that is not 2D + assert_raises(ValueError, pairwise_similarity_bit, np.random.random(5), "tanimoto") + assert_raises(ValueError, pairwise_similarity_bit, np.random.random((2, 3, 4)), "tanimoto") + # check raised error for not-available method + assert_raises(ValueError, pairwise_similarity_bit, np.random.random((5, 1)), "tan") + assert_raises(ValueError, pairwise_similarity_bit, np.random.random((5, 1)), tanimoto) + + def test_tanimoto(): """Test the tanimoto function on one pair of points.""" a = np.array([2, 0, 1]) From 8278d6b3a0d90eebd67c5a23bab9dbbee6456fa0 Mon Sep 17 00:00:00 2001 From: Farnaz Heidar-Zadeh Date: Wed, 5 Jul 2023 20:09:42 -0400 Subject: [PATCH 06/11] Finalize tanimoto function --- DiverseSelector/distance.py | 29 +++++++++++++++++---------- DiverseSelector/test/test_distance.py | 20 +++++++++++++----- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py index 369dfd11..fec406a6 100644 --- a/DiverseSelector/distance.py +++ b/DiverseSelector/distance.py @@ -74,32 +74,39 @@ def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray: def tanimoto(a: np.array, b: np.array) -> float: - r"""Compute Tanimoto coefficient. + r"""Compute Tanimoto coefficient or index (a.k.a. Jaccard similarity coefficient). + + For two binary or non-binary arrays :math:`A` and :math:`B`, Tanimoto coefficient + is defined as the size of their intersection divided by the size of their union: ..math:: - T(A,B) = A \cap B / A \cup B + T(A, B) = \frac{| A \cap B|}{| A \cup B |} = + \frac{| A \cap B|}{|A| + |B| - | A \cap B|} = + \frac{A \cdot B}{\|A\|^2 + \|B\|^2 - A \cdot B} + + where :math:`A \cdot B = \sum_i{A_i B_i}` and :math:`\|A\|^2 = \sum_i{A_i^2}`. Parameters ---------- - a : array_like - Data point A's features. - b : array_like - Data point B's features. + a : ndarray + The 1D feature array of sample :math:`A` in an `n`-dimensional space. + b : ndarray + The 1D feature array of sample :math:`B` in an `n`-dimensional space. Returns ------- coeff : float - Tanimoto coefficient for data points A and B. + Tanimoto coefficient between feature arrays :math:`A` and :math:`B`. - Notes - ----- - The Tanimoto coefficient computes similarity by taking the intersection - of A and B over their union. Bajusz, D., Rácz, A., and Héberger, K.. (2015) Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?. Journal of Cheminformatics 7. """ + if a.ndim != 1 or b.ndim != 1: + raise ValueError(f"Arguments a and b should be 1D arrays, got {a.ndim} and {b.ndim}") + if a.shape != b.shape: + raise ValueError(f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}") coeff = (sum(a * b)) / ((sum(a**2)) + (sum(b**2)) - (sum(a * b))) return coeff diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py index 38da82bc..678b883d 100644 --- a/DiverseSelector/test/test_distance.py +++ b/DiverseSelector/test/test_distance.py @@ -61,20 +61,30 @@ def test_pairwise_similarity_bit_raises(): assert_raises(ValueError, pairwise_similarity_bit, np.random.random((5, 1)), tanimoto) +def test_tanimoto_raises(): + # check raised error when a or b is not 1D + assert_raises(ValueError, tanimoto, np.random.random((1, 5)), np.random.random(5)) + assert_raises(ValueError, tanimoto, np.random.random(3), np.random.random((1, 4))) + assert_raises(ValueError, tanimoto, np.random.random(4), np.random.random((3, 4))) + assert_raises(ValueError, tanimoto, np.random.random((3, 3)), np.random.random((2, 3))) + # check raised error when a and b don't have the same length + assert_raises(ValueError, tanimoto, np.random.random(3), np.random.random(5)) + assert_raises(ValueError, tanimoto, np.random.random(20), np.random.random(10)) + + def test_tanimoto(): """Test the tanimoto function on one pair of points.""" a = np.array([2, 0, 1]) b = np.array([2, 0, 0]) expected = 4 / (5 + 4 - 4) - tani = tanimoto(a, b) - assert_equal(tani, expected) + assert_equal(tanimoto(a, b), expected) def test_tanimoto_matrix(): """Testing the tanimoto function with predefined feature matrix.""" - tani = pairwise_similarity_bit(sample3, "tanimoto") - expected = np.array([[1, (11 / 19)], - [(11 / 19), 1]]) + x = np.array([[1, 4], [3, 2]]) + tani = pairwise_similarity_bit(x, "tanimoto") + expected = np.array([[1, (11 / 19)], [(11 / 19), 1]]) assert_equal(expected, tani) From f403882b9fa8ba553421adcb577559bd009519b9 Mon Sep 17 00:00:00 2001 From: Farnaz Heidar-Zadeh Date: Thu, 6 Jul 2023 20:47:52 -0400 Subject: [PATCH 07/11] Finalize nearest_average_tanimoto function --- DiverseSelector/distance.py | 45 +++++++++++++-------------- DiverseSelector/test/test_distance.py | 18 +++++++++-- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py index fec406a6..66a1bb6d 100644 --- a/DiverseSelector/distance.py +++ b/DiverseSelector/distance.py @@ -188,39 +188,38 @@ def modified_tanimoto(a: np.array, b: np.array) -> float: return mt -def nearest_average_tanimoto(x: np.ndarray) -> float: - """Computes the average tanimoto for nearest data points. +def nearest_average_tanimoto(X: np.ndarray) -> float: + """Compute the average tanimoto for nearest data points measured by Minkowski 2-norm. + + For each sample, the closest neighbor is identified by computing its Minkowski 2-norm + (i.e., Euclidean) distance with all other samples, and identifying neighboring sample + with the shortest distance. Parameters ---------- - x : ndarray - Feature matrix. + X : (M, K) array_like + Matrix of `M` samples in an `K` dimensional feature space. Returns ------- float : - Average Tanimoto of closest pairs. - - Notes - ----- - This computes the Tanimoto coefficient of pairs of data points - with the shortest distances, then returns the average of them. - This calculation is explicitly for the explicit diversity index. + Average of the Tanimoto coefficients for each sample and its closest neighbor. Papp, Á., Gulyás-Forró, A., Gulyás, Z., Dormán, G., Ürge, L., and Darvas, F.. (2006) Explicit Diversity Index (EDI): A Novel Measure for Assessing the Diversity of Compound Databases. Journal of Chemical Information and Modeling 46, 1898-1904. """ - tani = [] - # calculate euclidean distance between all points - # and adjust for distance to self - dist = distance_matrix(x, x) + np.inf*np.eye(x.shape[0]) - # find closest point for each row of x - short_idx = np.argmin(dist, axis=0) - for idx, min_d in enumerate(short_idx): - # compute the tanimoto coeff for each pair of closest points - tani.append(tanimoto(x[idx], x[min_d])) - # take the average of all coeffs calculated - nat = np.average(tani) - return nat + # compute euclidean distance between all samples + dist = distance_matrix(X, X, p=2) + # replace zero self-distance with infinity, before computing nearest neighbors + np.fill_diagonal(dist, np.inf) + # find index of closest neighbor for each sample + nearest_neighbors = np.argmin(dist, axis=0) + assert nearest_neighbors.shape == (X.shape[0],) + # compute the tanimoto coeff for each sample and its closest neighbor + coeffs = [] + for idx_sample, idx_neighbor in enumerate(nearest_neighbors): + coeffs.append(tanimoto(X[idx_sample], X[idx_neighbor])) + # return average of all coefficients + return np.average(coeffs) diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py index 678b883d..a0f90a83 100644 --- a/DiverseSelector/test/test_distance.py +++ b/DiverseSelector/test/test_distance.py @@ -138,6 +138,18 @@ def test_nearest_average_tanimoto_bit(): def test_nearest_average_tanimoto(): """Test the nearest_average_tanimoto function with non-binary input""" - nat = nearest_average_tanimoto(sample3) - average = 11/19 - assert_equal(nat, average) + x = np.array([[1, 4], [3, 2]]) + # expected: (1x3 + 4x2) / (1 + 4^2 + 3^3 + 2^2 - 1x3 - 4x2) + assert_equal(nearest_average_tanimoto(x), 11/19) + + +def test_nearest_average_tanimoto_nonsquare(): + """Test the nearest_average_tanimoto function with non-binary input""" + x = np.array([[3.5, 4.0, 10.5, 0.5], [1.25, 4.0, 7.0, 0.1], [0.0, 0.0, 0.0, 0.0]]) + # nearest neighbor of sample 0, 1, and 2 are sample 1, 0, and 1, respectively. + expected = np.average([ + np.sum(x[0] * x[1]) / (np.sum(x[0]**2) + np.sum(x[1]**2) - np.sum(x[0] * x[1])), + np.sum(x[1] * x[0]) / (np.sum(x[1]**2) + np.sum(x[0]**2) - np.sum(x[1] * x[0])), + np.sum(x[2] * x[1]) / (np.sum(x[2]**2) + np.sum(x[1]**2) - np.sum(x[2] * x[1])), + ]) + assert_equal(nearest_average_tanimoto(x), expected) From 0007c26aa3ba88da7ea68db7f2ca6761d7dcd063 Mon Sep 17 00:00:00 2001 From: Farnaz Heidar-Zadeh Date: Thu, 6 Jul 2023 20:59:30 -0400 Subject: [PATCH 08/11] Make Parameters docstring match sklearn --- DiverseSelector/distance.py | 40 ++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py index 66a1bb6d..21f94a51 100644 --- a/DiverseSelector/distance.py +++ b/DiverseSelector/distance.py @@ -42,17 +42,17 @@ def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray: Parameters ---------- - X : ndarray - An `m` by `n` feature array of `m` samples in an `n`-dimensional feature space. + X : ndarray of shape (n_samples, n_features) + Feature matrix of `n_samples` samples in `n_features` dimensional space. metric : str + The metric used when calculating similarity coefficients between samples in a feature array. Method for calculating similarity coefficient. Options: `"tanimoto"`, `"modified_tanimoto"`. Returns ------- - pair_simi : ndarray - Returns a symmetric `m` by `m` array containing the similarity coefficient between - each pair of samples in the feature matrix. The diagonal elements are directly - computed instead of assuming that they are 1. + s : ndarray of shape (n_samples, n_samples) + A symmetric similarity matrix between each pair of samples in the feature matrix. + The diagonal elements are directly computed instead of assuming that they are 1. """ available_methods = { @@ -65,12 +65,12 @@ def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray: raise ValueError(f"Argument features should be a 2D array, got {X.ndim}") # make pairwise m-by-m similarity matrix - m = len(X) - pair_simi = np.zeros((m, m)) + n_samples = len(X) + s = np.zeros((n_samples, n_samples)) # compute similarity between all pairs of points (including the diagonal elements) - for i, j in combinations_with_replacement(range(m), 2): - pair_simi[i, j] = pair_simi[j, i] = available_methods[metric](X[i], X[j]) - return pair_simi + for i, j in combinations_with_replacement(range(n_samples), 2): + s[i, j] = s[j, i] = available_methods[metric](X[i], X[j]) + return s def tanimoto(a: np.array, b: np.array) -> float: @@ -88,17 +88,16 @@ def tanimoto(a: np.array, b: np.array) -> float: Parameters ---------- - a : ndarray - The 1D feature array of sample :math:`A` in an `n`-dimensional space. - b : ndarray - The 1D feature array of sample :math:`B` in an `n`-dimensional space. + a : ndarray of shape (n_features,) + The 1D feature array of sample :math:`A` in an `n_features` dimensional space. + b : ndarray of shape (n_features,) + The 1D feature array of sample :math:`B` in an `n_features` dimensional space. Returns ------- coeff : float Tanimoto coefficient between feature arrays :math:`A` and :math:`B`. - Bajusz, D., Rácz, A., and Héberger, K.. (2015) Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?. Journal of Cheminformatics 7. @@ -128,6 +127,11 @@ def modified_tanimoto(a: np.array, b: np.array) -> float: Parameters ---------- + a : ndarray of shape (n_features,) + The 1D feature array of sample :math:`A` in an `n_features` dimensional space. + b : ndarray of shape (n_features,) + The 1D feature array of sample :math:`B` in an `n_features` dimensional space. + a : array_like Data point A's features in bitstring. b : array_like @@ -197,8 +201,8 @@ def nearest_average_tanimoto(X: np.ndarray) -> float: Parameters ---------- - X : (M, K) array_like - Matrix of `M` samples in an `K` dimensional feature space. + X : ndarray of shape (n_samples, n_features) + Feature matrix of `n_samples` samples in `n_features` dimensional space. Returns ------- From 1af0bd4fdae1a6900189f45709d47ccd79959c2c Mon Sep 17 00:00:00 2001 From: Farnaz Heidar-Zadeh Date: Thu, 6 Jul 2023 21:19:18 -0400 Subject: [PATCH 09/11] Finalize test_distance.py and apply black --- DiverseSelector/test/test_distance.py | 87 ++++++++++++--------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py index a0f90a83..e13b7aff 100644 --- a/DiverseSelector/test/test_distance.py +++ b/DiverseSelector/test/test_distance.py @@ -21,36 +21,18 @@ # # -- -"""Testing for the distance and similarity algorithms in the distance.py module.""" +"""Test distance.py Module.""" -from DiverseSelector.distance import (pairwise_similarity_bit, - nearest_average_tanimoto, - tanimoto, - modified_tanimoto - ) +from DiverseSelector.distance import ( + pairwise_similarity_bit, + nearest_average_tanimoto, + tanimoto, + modified_tanimoto, +) import numpy as np from numpy.testing import assert_almost_equal, assert_equal, assert_raises -# each row is a feature and each column is a molecule -sample1 = np.array([[4, 2, 6], - [4, 9, 6], - [2, 5, 0], - [2, 0, 9], - [5, 3, 0]]) - -# each row is a molecule and each column is a feature (scipy) -sample2 = np.array([[1, 1, 0, 0, 0], - [0, 1, 1, 0, 0], - [0, 0, 0, 1, 0], - [0, 0, 0, 0, 1]]) - -sample3 = np.array([[1, 4], - [3, 2]]) - -sample4 = np.array([[1, 0, 1], - [0, 1, 1]]) - def test_pairwise_similarity_bit_raises(): # check raised error for input feature matrix that is not 2D @@ -76,45 +58,47 @@ def test_tanimoto(): """Test the tanimoto function on one pair of points.""" a = np.array([2, 0, 1]) b = np.array([2, 0, 0]) - expected = 4 / (5 + 4 - 4) - assert_equal(tanimoto(a, b), expected) + # expected = (2*2 + 0*0 + 1*0) / (2**2 + 1 + 2**2 - 2*2) + assert_equal(tanimoto(a, b), 4 / (5 + 4 - 4)) + + +def test_tanimoto_bitstring(): + """Test the tanimoto function on one pair of points.""" + a = np.array([0, 0, 0, 1, 0, 1, 1]) + b = np.array([1, 1, 0, 0, 0, 1, 1]) + assert_equal(tanimoto(a, b), 2 / 5) def test_tanimoto_matrix(): """Testing the tanimoto function with predefined feature matrix.""" x = np.array([[1, 4], [3, 2]]) - tani = pairwise_similarity_bit(x, "tanimoto") + s = pairwise_similarity_bit(x, "tanimoto") expected = np.array([[1, (11 / 19)], [(11 / 19), 1]]) - assert_equal(expected, tani) + assert_equal(s, expected) def test_modified_tanimoto(): a = np.array([1, 1, 0, 0, 1]) b = np.array([0, 0, 0, 0, 1]) - expected = (1.6 / 9) + (1.4/6) - mod_tani = modified_tanimoto(a, b) - assert_equal(mod_tani, expected) + expected = (1.6 / 9) + (1.4 / 6) + assert_equal(modified_tanimoto(a, b), expected) def test_modified_tanimoto_all_ones(): """Test the modified tanimoto function when input is all '1' bits""" a = np.array([1, 1, 1, 1, 1]) - expected = 1 - mod_tani = modified_tanimoto(a,a) - assert_equal(mod_tani, expected) + assert_equal(modified_tanimoto(a, a), 1) def test_modified_tanimoto_all_zeroes(): """Test the modified tanimoto function when input is all '0' bits""" a = np.zeros(5) - expected = 1 - mod_tani = modified_tanimoto(a, a) - assert_equal(mod_tani, expected) + assert_equal(modified_tanimoto(a, a), 1) def test_modified_tanimoto_dimension_error(): """Test modified tanimoto raises error when input has incorrect dimension.""" - a = np.zeros([7,5]) + a = np.zeros([7, 5]) b = np.zeros(5) assert_raises(ValueError, modified_tanimoto, a, b) assert_raises(ValueError, modified_tanimoto, b, a) @@ -122,15 +106,16 @@ def test_modified_tanimoto_dimension_error(): def test_modified_tanimoto_matrix(): """Testing the modified tanimoto function with predefined feature matrix.""" - mod_tani = pairwise_similarity_bit(sample4, "modified_tanimoto") - expceted = np.array([[1, (4 / 27)], - [(4 / 27), 1]]) - assert_equal(mod_tani, expceted) + x = np.array([[1, 0, 1], [0, 1, 1]]) + s = pairwise_similarity_bit(x, "modified_tanimoto") + expceted = np.array([[1, (4 / 27)], [(4 / 27), 1]]) + assert_equal(s, expceted) def test_nearest_average_tanimoto_bit(): """Test the nearest_average_tanimoto function with binary input""" - nat = nearest_average_tanimoto(sample2) + x = np.array([[1, 1, 0, 0, 0], [0, 1, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]]) + nat = nearest_average_tanimoto(x) shortest_tani = [0.3333333, 0.3333333, 0, 0] average = np.average(shortest_tani) assert_almost_equal(nat, average) @@ -140,16 +125,18 @@ def test_nearest_average_tanimoto(): """Test the nearest_average_tanimoto function with non-binary input""" x = np.array([[1, 4], [3, 2]]) # expected: (1x3 + 4x2) / (1 + 4^2 + 3^3 + 2^2 - 1x3 - 4x2) - assert_equal(nearest_average_tanimoto(x), 11/19) + assert_equal(nearest_average_tanimoto(x), 11 / 19) def test_nearest_average_tanimoto_nonsquare(): """Test the nearest_average_tanimoto function with non-binary input""" x = np.array([[3.5, 4.0, 10.5, 0.5], [1.25, 4.0, 7.0, 0.1], [0.0, 0.0, 0.0, 0.0]]) # nearest neighbor of sample 0, 1, and 2 are sample 1, 0, and 1, respectively. - expected = np.average([ - np.sum(x[0] * x[1]) / (np.sum(x[0]**2) + np.sum(x[1]**2) - np.sum(x[0] * x[1])), - np.sum(x[1] * x[0]) / (np.sum(x[1]**2) + np.sum(x[0]**2) - np.sum(x[1] * x[0])), - np.sum(x[2] * x[1]) / (np.sum(x[2]**2) + np.sum(x[1]**2) - np.sum(x[2] * x[1])), - ]) + expected = np.average( + [ + np.sum(x[0] * x[1]) / (np.sum(x[0] ** 2) + np.sum(x[1] ** 2) - np.sum(x[0] * x[1])), + np.sum(x[1] * x[0]) / (np.sum(x[1] ** 2) + np.sum(x[0] ** 2) - np.sum(x[1] * x[0])), + np.sum(x[2] * x[1]) / (np.sum(x[2] ** 2) + np.sum(x[1] ** 2) - np.sum(x[2] * x[1])), + ] + ) assert_equal(nearest_average_tanimoto(x), expected) From d7f7e3ab278b59fd508ea6b8cbfeb7bc0355af37 Mon Sep 17 00:00:00 2001 From: Farnaz Heidar-Zadeh Date: Thu, 6 Jul 2023 21:35:50 -0400 Subject: [PATCH 10/11] Update docstring, black fixes, & error messages --- DiverseSelector/distance.py | 65 +++++++++++++-------------- DiverseSelector/test/test_distance.py | 1 + 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py index 21f94a51..44a70327 100644 --- a/DiverseSelector/distance.py +++ b/DiverseSelector/distance.py @@ -21,20 +21,15 @@ # # -- -"""Metric calculation module.""" +"""Similarity Module.""" import numpy as np from itertools import combinations_with_replacement from scipy.spatial import distance_matrix -from scipy.spatial.distance import squareform -__all__ = [ - "pairwise_similarity_bit", - "tanimoto", - "modified_tanimoto", - "nearest_average_tanimoto", -] + +__all__ = ["pairwise_similarity_bit", "tanimoto", "modified_tanimoto", "nearest_average_tanimoto"] def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray: @@ -60,7 +55,9 @@ def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray: "modified_tanimoto": modified_tanimoto, } if metric not in available_methods: - raise ValueError(f"Argument metric={metric} is not recognized! Choose from {available_methods.keys()}") + raise ValueError( + f"Argument metric={metric} is not recognized! Choose from {available_methods.keys()}" + ) if X.ndim != 2: raise ValueError(f"Argument features should be a 2D array, got {X.ndim}") @@ -105,8 +102,10 @@ def tanimoto(a: np.array, b: np.array) -> float: if a.ndim != 1 or b.ndim != 1: raise ValueError(f"Arguments a and b should be 1D arrays, got {a.ndim} and {b.ndim}") if a.shape != b.shape: - raise ValueError(f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}") - coeff = (sum(a * b)) / ((sum(a**2)) + (sum(b**2)) - (sum(a * b))) + raise ValueError( + f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}" + ) + coeff = sum(a * b) / (sum(a**2) + sum(b**2) - sum(a * b)) return coeff @@ -128,19 +127,14 @@ def modified_tanimoto(a: np.array, b: np.array) -> float: Parameters ---------- a : ndarray of shape (n_features,) - The 1D feature array of sample :math:`A` in an `n_features` dimensional space. + The 1D bitstring feature array of sample :math:`A` in an `n_features` dimensional space. b : ndarray of shape (n_features,) - The 1D feature array of sample :math:`B` in an `n_features` dimensional space. - - a : array_like - Data point A's features in bitstring. - b : array_like - Data point B's features in bitstring. + The 1D bitstring feature array of sample :math:`B` in an `n_features` dimensional space. Returns ------- mt : float - Modified tanimoto coefficient for molecule A and B. + Modified tanimoto coefficient between bitstring feature arrays :math:`A` and :math:`B`. Notes ----- @@ -162,30 +156,33 @@ def modified_tanimoto(a: np.array, b: np.array) -> float: raise ValueError(f"Argument `a` should have dimension 1 rather than {a.ndim}.") if b.ndim != 1: raise ValueError(f"Argument `b` should have dimension 1 rather than {b.ndim}.") + if a.shape != b.shape: + raise ValueError( + f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}" + ) - n = len(a) + n_features = len(a) # number of common '1' bits between points A and B n_11 = sum(a * b) # number of common '0' bits between points A and B n_00 = sum((1 - a) * (1 - b)) - # calculate Tanimoto coeff based on '1' bits - if n_00 == n: - # bit string is all '0's - t_1 = 1 - else: - t_1 = n_11 / (n - n_00) - # calculate Tanimoto coeff based on '1' bits - if n_11 == n: - # bit string is all '1's - t_0 = 1 - else: - t_0 = n_00 / (n - n_11) + # calculate Tanimoto coefficient based on '0' bits + t_1 = 1 + if n_00 != n_features: + # bit strings are not all '0's + t_1 = n_11 / (n_features - n_00) + # calculate Tanimoto coefficient based on '1' bits + t_0 = 1 + if n_11 != n_features: + # bit strings are not all '1's + t_0 = n_00 / (n_features - n_11) + # combine into modified tanimoto using Bernoulli Model # p = independent success trials # evaluated as total number of '1' bits # divided by 2x the fingerprint length - p = (n - n_00 + n_11) / (2 * n) + p = (n_features - n_00 + n_11) / (2 * n_features) # mt = x * T_1 + (1-x) * T_0 # x = (2-p)/3 so that E(mt) = 1/3, no matter the value of p mt = (((2 - p) / 3) * t_1) + (((1 + p) / 3) * t_0) @@ -221,7 +218,7 @@ def nearest_average_tanimoto(X: np.ndarray) -> float: # find index of closest neighbor for each sample nearest_neighbors = np.argmin(dist, axis=0) assert nearest_neighbors.shape == (X.shape[0],) - # compute the tanimoto coeff for each sample and its closest neighbor + # compute the tanimoto coefficient for each sample and its closest neighbor coeffs = [] for idx_sample, idx_neighbor in enumerate(nearest_neighbors): coeffs.append(tanimoto(X[idx_sample], X[idx_neighbor])) diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py index e13b7aff..8cb0099e 100644 --- a/DiverseSelector/test/test_distance.py +++ b/DiverseSelector/test/test_distance.py @@ -102,6 +102,7 @@ def test_modified_tanimoto_dimension_error(): b = np.zeros(5) assert_raises(ValueError, modified_tanimoto, a, b) assert_raises(ValueError, modified_tanimoto, b, a) + assert_raises(ValueError, modified_tanimoto, np.ones(3), np.ones(5)) def test_modified_tanimoto_matrix(): From 3fc9b3e6370daea0ad76cb7c359abd5e86f634bb Mon Sep 17 00:00:00 2001 From: Fanwang Meng Date: Sun, 9 Jul 2023 00:08:22 -0400 Subject: [PATCH 11/11] Fix coverage configuration --- .coveragerc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index e956eb4c..bdb67bcb 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,6 +1,6 @@ [run] omit = - DiverseSelector/*/test/* + DiverseSelector/*/tests/* DiverseSelector/test/* DiverseSelector/__init__.py DiverseSelector/_version.py