Skip to content

Commit

Permalink
Merge branch 'main' into directed_spheres
Browse files Browse the repository at this point in the history
  • Loading branch information
FanwangM authored Jul 14, 2023
2 parents d4de433 + 0b9e088 commit daf9b91
Show file tree
Hide file tree
Showing 7 changed files with 277 additions and 200 deletions.
1 change: 1 addition & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[run]
omit =
DiverseSelector/*/tests/*
DiverseSelector/test/*
DiverseSelector/__init__.py
DiverseSelector/_version.py
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
python-version: [3.7]

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Additional info about the build
shell: bash
run: |
Expand Down
260 changes: 123 additions & 137 deletions DiverseSelector/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,220 +21,206 @@
#
# --

"""Metric calculation module."""
"""Similarity Module."""


import numpy as np
from scipy.spatial.distance import squareform
from itertools import combinations_with_replacement
from scipy.spatial import distance_matrix

# Public names exported by `from <module> import *`.
# NOTE(review): `__all__` is assigned twice in this span. The second assignment
# below rebinds the name, so only the list on that line is in effect afterwards.
__all__ = [
"compute_distance_matrix",
"pairwise_similarity_bit",
"tanimoto",
"modified_tanimoto",
"nearest_average_tanimoto"
]

# Effective export list; unlike the list above it does not contain "compute_distance_matrix".
__all__ = ["pairwise_similarity_bit", "tanimoto", "modified_tanimoto", "nearest_average_tanimoto"]

def compute_distance_matrix(
    features: np.ndarray,
    metric: str
):
    """Compute pairwise distance given a feature matrix.

    The distance between two samples is defined as ``1 - similarity``, where the
    similarity coefficient is selected by `metric`.

    Parameters
    ----------
    features : np.ndarray
        Molecule feature matrix, one row per molecule.
    metric : str
        Distance metric. Options: `"tanimoto"`, `"modified_tanimoto"`.

    Returns
    -------
    dist : ndarray
        Symmetric distance array with zeros on the diagonal.

    Raises
    ------
    ValueError
        If `metric` is not one of the supported metrics.
    """
    # todo: add more metrics implemented here
    built_in_metrics = {
        "tanimoto": tanimoto,
        "modified_tanimoto": modified_tanimoto,
    }

    # Reject unsupported metrics up front, before any pairwise work is done.
    if metric not in built_in_metrics:
        raise ValueError(f"Metric {metric} is not supported by the library.")
    similarity = built_in_metrics[metric]

    # Distances for all distinct pairs (i < j), in the condensed order that
    # `squareform` expects.
    size = len(features)
    distances = [
        1 - similarity(features[i], features[j])
        for i in range(size)
        for j in range(i + 1, size)
    ]
    # shape into symmetric matrix
    return squareform(distances)


def pairwise_similarity_bit(X: np.array, metric: str) -> np.ndarray:
    """Compute pairwise similarity coefficient matrix.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix of `n_samples` samples in `n_features` dimensional space.
    metric : str
        The metric used when calculating similarity coefficients between samples in a
        feature array. Options: `"tanimoto"`, `"modified_tanimoto"`.

    Returns
    -------
    s : ndarray of shape (n_samples, n_samples)
        A symmetric similarity matrix between each pair of samples in the feature matrix.
        The diagonal elements are directly computed instead of assuming that they are 1.

    Raises
    ------
    ValueError
        If `metric` is not recognized or `X` is not two-dimensional.
    """
    available_methods = {
        "tanimoto": tanimoto,
        "modified_tanimoto": modified_tanimoto,
    }
    if metric not in available_methods:
        raise ValueError(
            f"Argument metric={metric} is not recognized! "
            f"Choose from {list(available_methods)}"
        )
    # Accept any array-like (e.g. nested lists) by converting once at the boundary.
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(f"Argument X should be a 2D array, got {X.ndim}")

    similarity = available_methods[metric]
    # make pairwise n_samples-by-n_samples similarity matrix
    n_samples = len(X)
    s = np.zeros((n_samples, n_samples))
    # compute similarity between all pairs of points (including the diagonal elements);
    # each unordered pair is evaluated once and mirrored to keep the matrix symmetric
    for i, j in combinations_with_replacement(range(n_samples), 2):
        s[i, j] = s[j, i] = similarity(X[i], X[j])
    return s


def tanimoto(a: np.array, b: np.array) -> float:
    r"""Compute Tanimoto coefficient or index (a.k.a. Jaccard similarity coefficient).

    For two binary or non-binary arrays :math:`A` and :math:`B`, Tanimoto coefficient
    is defined as the size of their intersection divided by the size of their union:

    .. math::
        T(A, B) = \frac{| A \cap B|}{| A \cup B |} =
        \frac{| A \cap B|}{|A| + |B| - | A \cap B|} =
        \frac{A \cdot B}{\|A\|^2 + \|B\|^2 - A \cdot B}

    where :math:`A \cdot B = \sum_i{A_i B_i}` and :math:`\|A\|^2 = \sum_i{A_i^2}`.

    Parameters
    ----------
    a : ndarray of shape (n_features,)
        The 1D feature array of sample :math:`A` in an `n_features` dimensional space.
    b : ndarray of shape (n_features,)
        The 1D feature array of sample :math:`B` in an `n_features` dimensional space.

    Returns
    -------
    coeff : float
        Tanimoto coefficient between feature arrays :math:`A` and :math:`B`.

    Raises
    ------
    ValueError
        If `a` or `b` is not one-dimensional, or if their shapes differ.

    Notes
    -----
    The all-zero case is not guarded: when both arrays contain only zeros the
    denominator is zero and NumPy's division yields ``nan``.

    References
    ----------
    Bajusz, D., Rácz, A., and Héberger, K.. (2015)
    Why is Tanimoto index an appropriate choice for fingerprint-based similarity calculations?.
    Journal of Cheminformatics 7.
    """
    # Cast to float so that boolean fingerprints are summed arithmetically
    # (np.dot on two bool arrays would otherwise reduce to a single bool).
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.ndim != 1 or b.ndim != 1:
        raise ValueError(f"Arguments a and b should be 1D arrays, got {a.ndim} and {b.ndim}")
    if a.shape != b.shape:
        raise ValueError(
            f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}"
        )
    # A.B is needed twice; compute it once.
    a_dot_b = np.dot(a, b)
    coeff = a_dot_b / (np.dot(a, a) + np.dot(b, b) - a_dot_b)
    return coeff


def modified_tanimoto(a: np.array, b: np.array) -> float:
    r"""Compute the modified tanimoto coefficient from bitstring vectors of data points A and B.

    Adjusts calculation of the Tanimoto coefficient to counter its natural bias towards
    shorter vectors using a Bernoulli probability model.

    .. math::
        MT = \frac{2-p}{3}T_1 + \frac{1+p}{3}T_0

    where :math:`p` is success probability of independent trials,
    :math:`T_1` is the Tanimoto coefficient computed on the common '1' bits between
    data points (based on :math:`| A \cap B |`), and :math:`T_0` is the Tanimoto
    coefficient computed on the common '0' bits between data points
    (based on :math:`|(1-A) \cap (1-B)|`).

    Parameters
    ----------
    a : ndarray of shape (n_features,)
        The 1D bitstring feature array of sample :math:`A` in an `n_features` dimensional space.
    b : ndarray of shape (n_features,)
        The 1D bitstring feature array of sample :math:`B` in an `n_features` dimensional space.

    Returns
    -------
    mt : float
        Modified tanimoto coefficient between bitstring feature arrays :math:`A` and :math:`B`.

    Raises
    ------
    ValueError
        If `a` or `b` is not one-dimensional, or if their shapes differ.

    Notes
    -----
    The equation above has been derived from

    .. math::
        MT_\alpha= {\alpha}T_1 + (1-\alpha)T_0

    where :math:`\alpha = \frac{2-p}{3}`. This is done so that the expected value
    of the modified tanimoto, :math:`E(MT)`, remains constant regardless of the
    success probability :math:`p`.

    References
    ----------
    Fligner, M. A., Verducci, J. S., and Blower, P. E.. (2002)
    A Modification of the Jaccard-Tanimoto Similarity Index for
    Diverse Selection of Chemical Compounds Using Binary Strings.
    Technometrics 44, 110-119.
    """
    # Cast to float so that boolean bitstrings are counted arithmetically.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.ndim != 1:
        raise ValueError(f"Argument `a` should have dimension 1 rather than {a.ndim}.")
    if b.ndim != 1:
        raise ValueError(f"Argument `b` should have dimension 1 rather than {b.ndim}.")
    if a.shape != b.shape:
        raise ValueError(
            f"Arguments a and b should have the same shape, got {a.shape} != {b.shape}"
        )

    n_features = len(a)
    # number of common '1' bits between points A and B
    n_11 = np.dot(a, b)
    # number of common '0' bits between points A and B
    n_00 = np.dot(1 - a, 1 - b)

    # calculate Tanimoto coefficient based on '1' bits
    t_1 = 1
    if n_00 != n_features:
        # bit strings are not all '0's
        t_1 = n_11 / (n_features - n_00)
    # calculate Tanimoto coefficient based on '0' bits
    t_0 = 1
    if n_11 != n_features:
        # bit strings are not all '1's
        t_0 = n_00 / (n_features - n_11)

    # combine into modified tanimoto using Bernoulli Model
    # p = independent success trials
    #     evaluated as total number of '1' bits
    #     divided by 2x the fingerprint length
    p = (n_features - n_00 + n_11) / (2 * n_features)
    # mt = x * T_1 + (1-x) * T_0
    #     x = (2-p)/3 so that E(mt) = 1/3, no matter the value of p
    mt = (((2 - p) / 3) * t_1) + (((1 + p) / 3) * t_0)
    return mt


def nearest_average_tanimoto(X: np.ndarray) -> float:
    """Compute the average tanimoto for nearest data points measured by Minkowski 2-norm.

    For each sample, the closest neighbor is identified by computing its Minkowski 2-norm
    (i.e., Euclidean) distance with all other samples, and identifying neighboring sample
    with the shortest distance. The Tanimoto coefficient is then evaluated for every
    sample/closest-neighbor pair and the coefficients are averaged.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix of `n_samples` samples in `n_features` dimensional space.

    Returns
    -------
    float :
        Average of the Tanimoto coefficients for each sample and its closest neighbor.

    Raises
    ------
    ValueError
        If `X` is not two-dimensional or contains fewer than two samples (a nearest
        neighbor is undefined in that case).

    References
    ----------
    Papp, Á., Gulyás-Forró, A., Gulyás, Z., Dormán, G., Ürge, L.,
    and Darvas, F.. (2006) Explicit Diversity Index (EDI):
    A Novel Measure for Assessing the Diversity of Compound Databases.
    Journal of Chemical Information and Modeling 46, 1898-1904.
    """
    X = np.asarray(X)
    # Explicit validation instead of `assert`, which is stripped under `python -O`.
    if X.ndim != 2:
        raise ValueError(f"Argument X should be a 2D array, got {X.ndim}")
    if X.shape[0] < 2:
        raise ValueError(
            f"Argument X should contain at least 2 samples, got {X.shape[0]}"
        )

    # compute euclidean distance between all samples
    dist = distance_matrix(X, X, p=2)
    # replace zero self-distance with infinity, before computing nearest neighbors,
    # so that a sample is never selected as its own neighbor
    np.fill_diagonal(dist, np.inf)
    # find index of closest neighbor for each sample (row-wise; dist is symmetric)
    nearest_neighbors = np.argmin(dist, axis=1)
    # compute the tanimoto coefficient for each sample and its closest neighbor
    coeffs = [
        tanimoto(X[idx_sample], X[idx_neighbor])
        for idx_sample, idx_neighbor in enumerate(nearest_neighbors)
    ]
    # return average of all coefficients
    return np.average(coeffs)
1 change: 1 addition & 0 deletions DiverseSelector/diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def compute_diversity(
Method of calculation diversity for a given molecule set, which
includes "entropy", "logdet", "shannon_entropy", "wdud",
"gini_coefficient" and "hypersphere_overlap_of_subset".
Default is "hypersphere_overlap_of_subset".
mols : List[rdkit.Chem.rdchem.Mol], optional
List of RDKit molecule objects. This is only needed when using the
"explicit_diversity_index" method. Default=None.
Expand Down
Loading

0 comments on commit daf9b91

Please sign in to comment.