Skip to content

Commit

Permalink
Merge pull request theochem#142 from theochem/directed_spheres
Browse files Browse the repository at this point in the history
Cleanup of Directed Spheres class
  • Loading branch information
FanwangM authored Jul 14, 2023
2 parents 0b9e088 + daf9b91 commit 9ffd940
Show file tree
Hide file tree
Showing 4 changed files with 246 additions and 170 deletions.
13 changes: 8 additions & 5 deletions DiverseSelector/methods/dissimilarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"""Module for Dissimilarity-Based Selection Methods."""

from DiverseSelector.methods.base import SelectionBase
from DiverseSelector.methods.utils import predict_radius
from DiverseSelector.methods.utils import optimize_radius
import numpy as np
from scipy import spatial

Expand Down Expand Up @@ -187,7 +187,7 @@ class OptiSim(SelectionBase):
Adapted from https://doi.org/10.1021/ci970282v
"""

def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_seed=42):
def __init__(self, r=None, k=10, tol=5.0, eps=0, p=2, start_id=0, random_seed=42, n_iter=10):
"""
Initializing class.
Expand All @@ -199,7 +199,7 @@ def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_s
k: int
Amount of points to add to subsample before selecting one of the points with the
greatest minimum distance to the previously selected points.
tolerance: float
tol: float
Percentage error of number of molecules actually selected from number of molecules
requested.
eps: float
Expand All @@ -213,14 +213,17 @@ def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_s
Index for the first point to be selected.
random_seed: int
Seed for random selection of points to be evaluated.
n_iter: int
Number of iterations to execute when optimizing the size of exclusion radius. Default is 10.
"""
self.r = r
self.k = k
self.tolerance = tolerance
self.tol = tol
self.eps = eps
self.p = p
self.start_id = start_id
self.random_seed = random_seed
self.n_iter = n_iter

def algorithm(self, arr, uplimit) -> list:
"""
Expand Down Expand Up @@ -305,6 +308,6 @@ def select_from_cluster(self, arr, num_selected, cluster_ids=None):
selected: list
List of ids of selected molecules
"""
return predict_radius(self, arr, num_selected, cluster_ids)
return optimize_radius(self, arr, num_selected, cluster_ids)


169 changes: 95 additions & 74 deletions DiverseSelector/methods/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@
import math

import bitarray
import scipy.spatial

from DiverseSelector.methods.base import SelectionBase
from DiverseSelector.diversity import compute_diversity
from DiverseSelector.methods.utils import predict_radius
from DiverseSelector.methods.utils import optimize_radius
import numpy as np
from scipy import spatial
from sklearn.decomposition import PCA
Expand All @@ -43,113 +45,132 @@


class DirectedSphereExclusion(SelectionBase):
"""Selecting points using Directed Sphere Exclusion algorithm.
Starting point is chosen as the reference point and not included in the selected molecules. The
distance of each point is calculated to the reference point and the points are then sorted based
on the ascending order of distances. The points are then evaluated in their sorted order, and
are selected if their distance to all the other selected points is at least r away. Euclidean
distance is used by default and the r value is automatically generated if not passed to satisfy
the number of molecules requested.
Adapted from https://doi.org/10.1021/ci025554v
"""Select samples using Directed Sphere Exclusion (DISE) algorithm.
In a nutshell, this algorithm iteratively excludes any sample within a given radius from
any already selected sample. The radius of the exclusion sphere is an adjustable parameter.
Compared to Sphere Exclusion algorithm, the Directed Sphere Exclusion algorithm achieves a
more evenly distributed subset selection by abandoning the random selection approach and
instead imposing a directed selection.
Reference sample is chosen based on the `ref_index`, which is excluded from the selected
subset. All samples are sorted (ascending order) based on their Minkowski p-norm distance
from the reference sample. Looping through sorted samples, the sample is selected if it is
not already excluded. If selected, all its neighboring samples within a sphere of radius r
(i.e., exclusion sphere) are excluded from being selected. When the selected number of points
is greater than the specified subset `size`, the selection process terminates. The `r0` value is
used as the initial radius of the exclusion sphere; however, it is optimized to select the desired
number of samples.
Notes
-----
Gobbi, A., and Lee, M.-L. (2002). DISE: directed sphere exclusion.
Journal of Chemical Information and Computer Sciences,
43(1), 317–323. https://doi.org/10.1021/ci025554v
"""

def __init__(self, r=None, tolerance=5.0, eps=0, p=2, start_id=0, random_seed=42):
"""
Initializing class.
def __init__(self, r0=None, ref_index=0, p=2.0, eps=0.0, tol=0.05, n_iter=10, random_seed=42):
"""Initialize class.
Parameters
----------
r: float
Initial guess of radius for directed sphere exclusion algorithm, no points within r
distance to an already selected point can be selected.
tolerance: float
Percentage error of number of molecules actually selected from number of molecules
requested.
eps: float
Approximate nearest neighbor search for eliminating close points. Branches of the tree
are not explored if their nearest points are further than r / (1 + eps), and branches
are added in bulk if their furthest points are nearer than r * (1 + eps).
p: float
Which Minkowski p-norm to use. Should be in the range [1, inf]. A finite large p may
cause a ValueError if overflow can occur.
start_id: int
Index for the first point to be selected.
random_seed: int
r0: float, optional
Initial guess for radius of the exclusion sphere.
ref_index: int, optional
Index of the reference sample to start the selection algorithm from.
This sample is not included in the selected subset.
p: float, optional
Which Minkowski p-norm to use. The values of `p` should be within [1, inf].
A finite large p may cause a ValueError if overflow can occur.
eps: float, optional
Approximate nearest neighbor search used in `KDTree.query_ball_point`.
Branches of the tree are not explored if their nearest points are further than
r/(1+eps), and branches are added in bulk if their furthest points are nearer than
r * (1+eps). eps has to be non-negative.
tol: float, optional
Percentage error of number of samples actually selected from number of samples requested.
n_iter: int, optional
Number of iterations for optimizing the radius of exclusion sphere.
random_seed: int, optional
Seed for random selection of points to be evaluated.
"""
self.r = r
self.tolerance = tolerance
self.eps = eps
self.r = r0
self.ref_index = ref_index
self.p = p
self.starting_idx = start_id
self.eps = eps
self.tol = tol
self.n_iter = n_iter
self.random_seed = random_seed

def algorithm(self, arr, uplimit):
"""
Directed sphere exclusion algorithm logic.
def algorithm(self, X, max_size):
"""Return selected samples based on directed sphere exclusion algorithm.
Parameters
----------
arr: np.ndarray
Coordinate array of points.
uplimit: int
Maximum number of points to select.
X: ndarray of shape (n_samples, n_features)
Feature matrix of `n_samples` samples in `n_features` dimensional space.
max_size: int
Maximum number of samples to select.
Returns
-------
selected: list
List of ids of selected molecules
List of indices of selected samples.
"""
selected = []
count = 0
candidates = np.delete(np.arange(0, len(arr)), self.starting_idx)
distances = []
for idx in candidates:
ref_point = arr[self.starting_idx]
data_point = arr[idx]
distance = spatial.distance.minkowski(ref_point, data_point, p=self.p)
distances.append((distance, idx))
distances.sort()
order = [idx for dist, idx in distances]

kdtree = spatial.KDTree(arr)
bv = bitarray.bitarray(len(arr))
bv[:] = 0
bv[self.starting_idx] = 1

for idx in order:
if not bv[idx]:
# calculate distance of all samples from reference sample; distance is a (n_samples,) array
distances = scipy.spatial.minkowski_distance(X[self.ref_index], X, p=self.p)
# get sorted index of samples based on their distance from reference (closest to farthest)
index_sorted = np.argsort(distances)
# construct KDTree for quick nearest-neighbor lookup
kdtree = spatial.KDTree(X)

# construct bitarray to track selected samples (1 means exclude)
bv = bitarray.bitarray(list(np.zeros(len(X), dtype=int)))
bv[self.ref_index] = 1

selected = []
for idx in index_sorted:
# select sample if it is not already excluded from consideration
# indexing a single item of a bitarray will always return an integer
if bv[idx] == 0:
selected.append(idx)
count += 1
if count > uplimit:
# return indices of selected samples, if desired number is selected
if len(selected) > max_size:
return selected
elim = kdtree.query_ball_point(arr[idx], self.r, eps=self.eps, p=self.p, workers=-1)
for index in elim:
# find index of all samples within radius of sample idx (this includes the sample index itself)
index_exclude = kdtree.query_ball_point(
X[idx], self.r, eps=self.eps, p=self.p, workers=-1
)
# exclude samples within radius r of sample idx (measure by Minkowski p-norm) from
# future consideration by setting their bitarray value to 1
for index in index_exclude:
bv[index] = 1

return selected

def select_from_cluster(self, arr, num_selected, cluster_ids=None):
"""
Algorithm that uses sphere_exclusion for selecting points from cluster.
def select_from_cluster(self, X, size, cluster_ids=None):
"""Return selected samples from a cluster based on directed sphere exclusion algorithm
Parameters
----------
arr: np.ndarray
Coordinate array of points
num_selected: int
Number of molecules that need to be selected.
X: ndarray of shape (n_samples, n_features)
Feature matrix of `n_samples` samples in `n_features` dimensional space.
size: int
Number of samples to be selected.
cluster_ids: np.ndarray
Indices of molecules that form a cluster
Indices of samples that form a cluster.
Returns
-------
selected: list
List of ids of selected molecules
List of indices of selected samples.
"""
return predict_radius(self, arr, num_selected, cluster_ids)
if X.shape[0] < size:
raise RuntimeError(
f"Number of samples is less than the requested sample size: {X.shape[0]} < {size}."
)
return optimize_radius(self, X, size, cluster_ids)


class GridPartitioning(SelectionBase):
Expand Down
Loading

0 comments on commit 9ffd940

Please sign in to comment.