diff --git a/.coveragerc b/.coveragerc index e956eb4c..bdb67bcb 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,6 +1,6 @@ [run] omit = - DiverseSelector/*/test/* + DiverseSelector/*/tests/* DiverseSelector/test/* DiverseSelector/__init__.py DiverseSelector/_version.py diff --git a/DiverseSelector/distance.py b/DiverseSelector/distance.py index ed76cc46..44a70327 100644 --- a/DiverseSelector/distance.py +++ b/DiverseSelector/distance.py @@ -21,220 +21,206 @@ # # -- -"""Metric calculation module.""" +"""Similarity Module.""" import numpy as np -from scipy.spatial.distance import squareform +from itertools import combinations_with_replacement +from scipy.spatial import distance_matrix -__all__ = [ - "compute_distance_matrix", - "pairwise_similarity_bit", - "tanimoto", - "modified_tanimoto", - "nearest_average_tanimoto" -] +__all__ = ["pairwise_similarity_bit", "tanimoto", "modified_tanimoto", "nearest_average_tanimoto"] -def compute_distance_matrix( - features: np.ndarray, - metric: str -): - """Compute pairwise distance given a feature matrix. - Parameters - ---------- - features : np.ndarray - Molecule feature matrix. - metric : str - Distance metric. - - Returns - ------- - dist : ndarray - Symmetric distance array. - """ - # todo: add more metrics implemented here - built_in_metrics = { - "tanimoto": tanimoto, - "modified_tanimoto": modified_tanimoto, - } - - # Check if specified metric is supported - if metric in built_in_metrics: - distances = [] - size = len(features) - for i in range(0, size): - for j in range(i + 1, size): - # use the metric to compute distance between all molecule pairs - distances.append(1 - built_in_metrics[metric](features[i], features[j])) - # shape into symmetric matrix - dist = squareform(distances) - - else: # raise error if unsupported - raise ValueError(f"Metric {metric} is not supported by the library.") - - return dist - - -def pairwise_similarity_bit(features: np.array, metric: str) -> np.ndarray: - """Compute the pairwise similarity coefficients and returns them in - a square symmetric matrix. +def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray: + """Compute pairwise similarity coefficient matrix. Parameters ---------- - features : ndarray - Feature matrix. + X : ndarray of shape (n_samples, n_features) + Feature matrix of `n_samples` samples in `n_features` dimensional space. metric : str - Method of calculation. + The metric used when calculating similarity coefficients between samples in a feature array. + Method for calculating similarity coefficient. Options: `"tanimoto"`, `"modified_tanimoto"`. Returns ------- - pair_coeff : ndarray - Similarity coefficients for all molecule pairs in feature matrix. + s : ndarray of shape (n_samples, n_samples) + A symmetric similarity matrix between each pair of samples in the feature matrix. + The diagonal elements are directly computed instead of assuming that they are 1. """ - function_dict = { + available_methods = { "tanimoto": tanimoto, "modified_tanimoto": modified_tanimoto, } - - pair_simi = [] - size = len(features) - for i in range(0, size): - for j in range(i + 1, size): - # use the specified metric to compute similarity between all distinct molecule pairs - pair_simi.append(function_dict[metric](features[i], features[j])) - # shape into symmetric matrix - pair_coeff = squareform(pair_simi) + np.identity(size) - return pair_coeff + if metric not in available_methods: + raise ValueError( + f"Argument metric={metric} is not recognized! Choose from {available_methods.keys()}" + ) + if X.ndim != 2: + raise ValueError(f"Argument features should be a 2D array, got {X.ndim}") + + # make pairwise m-by-m similarity matrix + n_samples = len(X) + s = np.zeros((n_samples, n_samples)) + # compute similarity between all pairs of points (including the diagonal elements) + for i, j in combinations_with_replacement(range(n_samples), 2): + s[i, j] = s[j, i] = available_methods[metric](X[i], X[j]) + return s def tanimoto(a: np.array, b: np.array) -> float: - r"""Compute Tanimoto coefficient. + r"""Compute Tanimoto coefficient or index (a.k.a. Jaccard similarity coefficient). + + For two binary or non-binary arrays :math:`A` and :math:`B`, Tanimoto coefficient + is defined as the size of their intersection divided by the size of their union: ..math:: - T(A,B) = A \cap B / A \cup B + T(A, B) = \frac{| A \cap B|}{| A \cup B |} = + \frac{| A \cap B|}{|A| + |B| - | A \cap B|} = + \frac{A \cdot B}{\|A\|^2 + \|B\|^2 - A \cdot B} + + where :math:`A \cdot B = \sum_i{A_i B_i}` and :math:`\|A\|^2 = \sum_i{A_i^2}`. Parameters ---------- - a : array_like - Molecule A's features. - b : array_like - Molecules B's features. + a : ndarray of shape (n_features,) + The 1D feature array of sample :math:`A` in an `n_features` dimensional space. + b : ndarray of shape (n_features,) + The 1D feature array of sample :math:`B` in an `n_features` dimensional space. Returns ------- coeff : float - Tanimoto coefficient for molecules A and B. - - Notes - ----- - The Tanimoto coefficient computes similarity by taking the intersection of A and B over their union. + Tanimoto coefficient between feature arrays :math:`A` and :math:`B`. Bajusz, D., Rácz, A., and Héberger, K.. (2015) Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?. Journal of Cheminformatics 7. """ - coeff = (sum(a * b)) / ((sum(a ** 2)) + (sum(b ** 2)) - (sum(a * b))) + if a.ndim != 1 or b.ndim != 1: + raise ValueError(f"Arguments a and b should be 1D arrays, got {a.ndim} and {b.ndim}") + if a.shape != b.shape: + raise ValueError( + f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}" + ) + coeff = sum(a * b) / (sum(a**2) + sum(b**2) - sum(a * b)) return coeff def modified_tanimoto(a: np.array, b: np.array) -> float: - r"""Compute the modified tanimoto coefficient from bitstrings of molecules A and B. + r"""Compute the modified tanimoto coefficient from bitstring vectors of data points A and B. Adjusts calculation of the Tanimoto coefficient to counter its natural bias towards - smaller molecules using a Bernoulli probability model. + shorter vectors using a Bernoulli probability model. ..math:: - mt = \frac{2-p}{3}t_1 + \frac{1+p}{3}t_0$ - where - p = success probability of independent trials - $t_1 = | A \cap B |$ - $t_0 = |(1-A) \cap (1-B)|$ + MT = \frac{2-p}{3}T_1 + \frac{1+p}{3}T_0 + + where :math:`p` is success probability of independent trials, + :math:`T_1` is the number of common '1' bits between data points + (:math:`T_1 = | A \cap B |`), and :math:`T_0` is the number of common '0' + bits between data points (:math:`T_0 = |(1-A) \cap (1-B)|`). + Parameters ---------- - a : array_like - Molecule A's features in bitstring. - b : array_like - Molecules B's features in bitstring. + a : ndarray of shape (n_features,) + The 1D bitstring feature array of sample :math:`A` in an `n_features` dimensional space. + b : ndarray of shape (n_features,) + The 1D bitstring feature array of sample :math:`B` in an `n_features` dimensional space. Returns ------- mt : float - Modified tanimoto coefficient for molecule A and B. + Modified tanimoto coefficient between bitstring feature arrays :math:`A` and :math:`B`. Notes ----- + The equation above has been derived from + + ..math:: + MT_\alpha= {\alpha}T_1 + (1-\alpha)T_0 + + where :math:`\alpha = \frac{2-p}{3}`. This is done so that the expected value + of the modified tanimoto, :math:`E(MT)`, remains constant even as the number of + trials :math:`p` grows larger. Fligner, M. A., Verducci, J. S., and Blower, P. E.. (2002) A Modification of the Jaccard-Tanimoto Similarity Index for Diverse Selection of Chemical Compounds Using Binary Strings. Technometrics 44, 110-119. """ - n = len(a) - # intersection of '1' bits + if a.ndim != 1: + raise ValueError(f"Argument `a` should have dimension 1 rather than {a.ndim}.") + if b.ndim != 1: + raise ValueError(f"Argument `b` should have dimension 1 rather than {b.ndim}.") + if a.shape != b.shape: + raise ValueError( + f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}" + ) + + n_features = len(a) + # number of common '1' bits between points A and B n_11 = sum(a * b) - # intersection of '0' bits + # number of common '0' bits between points A and B n_00 = sum((1 - a) * (1 - b)) - # calculate in terms of '1' bits - if n_00 == n: - t_1 = 1 - else: - t_1 = n_11 / (n - n_00) - # calculate in terms of '0' bits - if n_11 == n: - t_0 = 1 - else: - t_0 = n_00 / (n - n_11) + # calculate Tanimoto coefficient based on '0' bits + t_1 = 1 + if n_00 != n_features: + # bit strings are not all '0's + t_1 = n_11 / (n_features - n_00) + # calculate Tanimoto coefficient based on '1' bits + t_0 = 1 + if n_11 != n_features: + # bit strings are not all '1's + t_0 = n_00 / (n_features - n_11) + # combine into modified tanimoto using Bernoulli Model - p = ((n - n_00) + n_11) / (2 * n) + # p = independent success trials + # evaluated as total number of '1' bits + # divided by 2x the fingerprint length + p = (n_features - n_00 + n_11) / (2 * n_features) + # mt = x * T_1 + (1-x) * T_0 + # x = (2-p)/3 so that E(mt) = 1/3, no matter the value of p mt = (((2 - p) / 3) * t_1) + (((1 + p) / 3) * t_0) return mt -def nearest_average_tanimoto(x: np.ndarray) -> float: - """Computes the average tanimoto for nearest molecules. +def nearest_average_tanimoto(X: np.ndarray) -> float: + """Compute the average tanimoto for nearest data points measured by Minkowski 2-norm. + + For each sample, the closest neighbor is identified by computing its Minkowski 2-norm + (i.e., Euclidean) distance with all other samples, and identifying neighboring sample + with the shortest distance. Parameters ---------- - x : ndarray - Feature matrix. + X : ndarray of shape (n_samples, n_features) + Feature matrix of `n_samples` samples in `n_features` dimensional space. Returns ------- - nat : float - Average tanimoto of closest pairs. - - Notes - ----- - This computes the tanimoto coefficient of pairs with the shortest - distances, then returns the average of them. - This calculation is explictly for the explicit diversity index. + float : + Average of the Tanimoto coefficients for each sample and its closest neighbor. Papp, Á., Gulyás-Forró, A., Gulyás, Z., Dormán, G., Ürge, L., and Darvas, F.. (2006) Explicit Diversity Index (EDI): A Novel Measure for Assessing the Diversity of Compound Databases. Journal of Chemical Information and Modeling 46, 1898-1904. """ - tani = [] - for idx, _ in enumerate(x): - # arbitrary distance for comparison: - short = 100 - a = 0 - b = 0 - # search for shortest distance point from idx - for jdx, _ in enumerate(x): - dist = np.linalg.norm(x[idx]-x[jdx]) - if dist < short and idx != jdx: - short = dist - a = idx - b = jdx - # calculate tanimoto for each shortest dist pair - tani.append(tanimoto(x[a], x[b])) - # compute average of all shortest tanimoto coeffs - nat = np.average(tani) - return nat - + # compute euclidean distance between all samples + dist = distance_matrix(X, X, p=2) + # replace zero self-distance with infinity, before computing nearest neighbors + np.fill_diagonal(dist, np.inf) + # find index of closest neighbor for each sample + nearest_neighbors = np.argmin(dist, axis=0) + assert nearest_neighbors.shape == (X.shape[0],) + # compute the tanimoto coefficient for each sample and its closest neighbor + coeffs = [] + for idx_sample, idx_neighbor in enumerate(nearest_neighbors): + coeffs.append(tanimoto(X[idx_sample], X[idx_neighbor])) + # return average of all coefficients + return np.average(coeffs) diff --git a/DiverseSelector/test/test_distance.py b/DiverseSelector/test/test_distance.py index 42320165..8cb0099e 100644 --- a/DiverseSelector/test/test_distance.py +++ b/DiverseSelector/test/test_distance.py @@ -21,70 +21,102 @@ # # -- -"""Testing for the distance and similarity algorithms in the distance.py module.""" +"""Test distance.py Module.""" -from DiverseSelector.distance import (compute_distance_matrix, - pairwise_similarity_bit, - nearest_average_tanimoto - ) +from DiverseSelector.distance import ( + pairwise_similarity_bit, + nearest_average_tanimoto, + tanimoto, + modified_tanimoto, +) import numpy as np from numpy.testing import assert_almost_equal, assert_equal, assert_raises -# each row is a feature and each column is a molecule -sample1 = np.array([[4, 2, 6], - [4, 9, 6], - [2, 5, 0], - [2, 0, 9], - [5, 3, 0]]) -# each row is a molecule and each column is a feature (scipy) -sample2 = np.array([[1, 1, 0, 0, 0], - [0, 1, 1, 0, 0], - [0, 0, 0, 1, 0], - [0, 0, 0, 0, 1]]) +def test_pairwise_similarity_bit_raises(): + # check raised error for input feature matrix that is not 2D + assert_raises(ValueError, pairwise_similarity_bit, np.random.random(5), "tanimoto") + assert_raises(ValueError, pairwise_similarity_bit, np.random.random((2, 3, 4)), "tanimoto") + # check raised error for not-available method + assert_raises(ValueError, pairwise_similarity_bit, np.random.random((5, 1)), "tan") + assert_raises(ValueError, pairwise_similarity_bit, np.random.random((5, 1)), tanimoto) -sample3 = np.array([[1, 4], - [3, 2]]) -sample4 = np.array([[1, 0, 1], - [0, 1, 1]]) +def test_tanimoto_raises(): + # check raised error when a or b is not 1D + assert_raises(ValueError, tanimoto, np.random.random((1, 5)), np.random.random(5)) + assert_raises(ValueError, tanimoto, np.random.random(3), np.random.random((1, 4))) + assert_raises(ValueError, tanimoto, np.random.random(4), np.random.random((3, 4))) + assert_raises(ValueError, tanimoto, np.random.random((3, 3)), np.random.random((2, 3))) + # check raised error when a and b don't have the same length + assert_raises(ValueError, tanimoto, np.random.random(3), np.random.random(5)) + assert_raises(ValueError, tanimoto, np.random.random(20), np.random.random(10)) -def test_compute_distance_matrix_builtin(): - """Testing the compute distance matrix with a built in metric.""" - sci_dist = compute_distance_matrix(sample2, "tanimoto") - expected = np.array([[0, 0.6666667, 1, 1], - [0.6666667, 0, 1, 1], - [1, 1, 0, 1], - [1, 1, 1, 0]]) - assert_almost_equal(expected, sci_dist) +def test_tanimoto(): + """Test the tanimoto function on one pair of points.""" + a = np.array([2, 0, 1]) + b = np.array([2, 0, 0]) + # expected = (2*2 + 0*0 + 1*0) / (2**2 + 1 + 2**2 - 2*2) + assert_equal(tanimoto(a, b), 4 / (5 + 4 - 4)) -def test_compute_distance_matrix_invalid_metric(): - """Testing the compute distance matrix with an invalid metric.""" - assert_raises(ValueError, compute_distance_matrix, sample1, "fake_distance") +def test_tanimoto_bitstring(): + """Test the tanimoto function on one pair of points.""" + a = np.array([0, 0, 0, 1, 0, 1, 1]) + b = np.array([1, 1, 0, 0, 0, 1, 1]) + assert_equal(tanimoto(a, b), 2 / 5) -def test_tanimoto(): +def test_tanimoto_matrix(): """Testing the tanimoto function with predefined feature matrix.""" - tani = pairwise_similarity_bit(sample3, "tanimoto") - expected = np.array([[1, (11 / 19)], - [(11 / 19), 1]]) - assert_equal(expected, tani) + x = np.array([[1, 4], [3, 2]]) + s = pairwise_similarity_bit(x, "tanimoto") + expected = np.array([[1, (11 / 19)], [(11 / 19), 1]]) + assert_equal(s, expected) + + +def test_modified_tanimoto(): + a = np.array([1, 1, 0, 0, 1]) + b = np.array([0, 0, 0, 0, 1]) + expected = (1.6 / 9) + (1.4 / 6) + assert_equal(modified_tanimoto(a, b), expected) + + +def test_modified_tanimoto_all_ones(): + """Test the modified tanimoto function when input is all '1' bits""" + a = np.array([1, 1, 1, 1, 1]) + assert_equal(modified_tanimoto(a, a), 1) + + +def test_modified_tanimoto_all_zeroes(): + """Test the modified tanimoto function when input is all '0' bits""" + a = np.zeros(5) + assert_equal(modified_tanimoto(a, a), 1) + +def test_modified_tanimoto_dimension_error(): + """Test modified tanimoto raises error when input has incorrect dimension.""" + a = np.zeros([7, 5]) + b = np.zeros(5) + assert_raises(ValueError, modified_tanimoto, a, b) + assert_raises(ValueError, modified_tanimoto, b, a) + assert_raises(ValueError, modified_tanimoto, np.ones(3), np.ones(5)) -def test_modifed_tanimoto(): + +def test_modified_tanimoto_matrix(): """Testing the modified tanimoto function with predefined feature matrix.""" - mod_tani = pairwise_similarity_bit(sample4, "modified_tanimoto") - expceted = np.array([[1, (4 / 27)], - [(4 / 27), 1]]) - assert_equal(mod_tani, expceted) + x = np.array([[1, 0, 1], [0, 1, 1]]) + s = pairwise_similarity_bit(x, "modified_tanimoto") + expceted = np.array([[1, (4 / 27)], [(4 / 27), 1]]) + assert_equal(s, expceted) def test_nearest_average_tanimoto_bit(): """Test the nearest_average_tanimoto function with binary input""" - nat = nearest_average_tanimoto(sample2) + x = np.array([[1, 1, 0, 0, 0], [0, 1, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]]) + nat = nearest_average_tanimoto(x) shortest_tani = [0.3333333, 0.3333333, 0, 0] average = np.average(shortest_tani) assert_almost_equal(nat, average) @@ -92,14 +124,20 @@ def test_nearest_average_tanimoto_bit(): def test_nearest_average_tanimoto(): """Test the nearest_average_tanimoto function with non-binary input""" - nat = nearest_average_tanimoto(sample3) - shortest_tani = [(11/19), (11/19)] - average = np.average(shortest_tani) - assert_equal(nat, average) - - - - - + x = np.array([[1, 4], [3, 2]]) + # expected: (1x3 + 4x2) / (1 + 4^2 + 3^3 + 2^2 - 1x3 - 4x2) + assert_equal(nearest_average_tanimoto(x), 11 / 19) +def test_nearest_average_tanimoto_nonsquare(): + """Test the nearest_average_tanimoto function with non-binary input""" + x = np.array([[3.5, 4.0, 10.5, 0.5], [1.25, 4.0, 7.0, 0.1], [0.0, 0.0, 0.0, 0.0]]) + # nearest neighbor of sample 0, 1, and 2 are sample 1, 0, and 1, respectively. + expected = np.average( + [ + np.sum(x[0] * x[1]) / (np.sum(x[0] ** 2) + np.sum(x[1] ** 2) - np.sum(x[0] * x[1])), + np.sum(x[1] * x[0]) / (np.sum(x[1] ** 2) + np.sum(x[0] ** 2) - np.sum(x[1] * x[0])), + np.sum(x[2] * x[1]) / (np.sum(x[2] ** 2) + np.sum(x[1] ** 2) - np.sum(x[2] * x[1])), + ] + ) + assert_equal(nearest_average_tanimoto(x), expected)