Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup of Directed Spheres class #142

Merged
merged 8 commits into from
Jul 14, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions DiverseSelector/methods/dissimilarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"""Module for Dissimilarity-Based Selection Methods."""

from DiverseSelector.methods.base import SelectionBase
from DiverseSelector.methods.utils import predict_radius
from DiverseSelector.methods.utils import optimize_radius
import numpy as np
from scipy import spatial

Expand Down Expand Up @@ -171,7 +171,7 @@ class OptiSim(SelectionBase):
Adapted from https://doi.org/10.1021/ci970282v
"""

def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_seed=42):
def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_seed=42, n_iter=10):
"""
Initializing class.

Expand All @@ -197,6 +197,8 @@ def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_s
Index for the first point to be selected.
random_seed: int
Seed for random selection of points to be evaluated.
n_iter: int
Number of iterations to execute when optimizing the size of exclusion radius. Default is 10.
"""
self.r = r
self.k = k
Expand All @@ -205,6 +207,7 @@ def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_s
self.p = p
self.start_id = start_id
self.random_seed = random_seed
self.n_iter = n_iter

def algorithm(self, arr, uplimit) -> list:
"""
Expand Down Expand Up @@ -272,6 +275,6 @@ def select_from_cluster(self, arr, num_selected, cluster_ids=None):
selected: list
List of ids of selected molecules
"""
return predict_radius(self, arr, num_selected, cluster_ids)
return optimize_radius(self, arr, num_selected, cluster_ids)


100 changes: 62 additions & 38 deletions DiverseSelector/methods/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import math

import bitarray
import scipy.spatial

from DiverseSelector.methods.base import SelectionBase
from DiverseSelector.diversity import compute_diversity
from DiverseSelector.methods.utils import predict_radius
FarnazH marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -45,27 +47,32 @@
class DirectedSphereExclusion(SelectionBase):
"""Selecting points using Directed Sphere Exclusion algorithm.

Starting point is chosen as the reference point and not included in the selected molecules. The
Starting point is chosen as the reference point
and not included in the selected molecules. The
distance of each point is calculated to the reference point and the points are then sorted based
on the ascending order of distances. The points are then evaluated in their sorted order, and
are selected if their distance to all the other selected points is at least r away. Euclidian
are selected if their distance to all the other selected points is at least r away. Euclidean
distance is used by default and the r value is automatically generated if not passed to satisfy
the number of molecules requested.

Adapted from https://doi.org/10.1021/ci025554v
Notes
-----
Gobbi, A., and Lee, M.-L. (2003). DISE: directed sphere exclusion.
Journal of Chemical Information and Computer Sciences,
43(1), 317–323. https://doi.org/10.1021/ci025554v
"""

def __init__(self, r=None, tolerance=5.0, eps=0, p=2, start_id=0, random_seed=42):
def __init__(self, r_0=None, tolerance=0.05, eps=1e-8, p=2, start_id=0, random_seed=42, n_iter=10):
"""
Initializing class.

Parameters
----------
r: float
r_0: float
Initial guess of the radius for the directed sphere exclusion algorithm. No point within
distance r of an already selected point can be selected.
tolerance: float
Percentage error of number of molecules actually selected from number of molecules
Percentage error of number of points actually selected from number of points
requested.
eps: float
Approximate nearest neighbor search for eliminating close points. Branches of the tree
Expand All @@ -75,81 +82,98 @@ def __init__(self, r=None, tolerance=5.0, eps=0, p=2, start_id=0, random_seed=42
Which Minkowski p-norm to use. Should be in the range [1, inf]. A finite large p may
cause a ValueError if overflow can occur.
start_id: int
Index for the first point to be selected.
Index for the first point to be selected. Default is 0.
random_seed: int
Seed for random selection of points to be evaluated.
n_iter: int
Number of iterations to execute when optimizing the size of exclusion radius. Default is 10.
"""
self.r = r
self.r = r_0
self.tolerance = tolerance
self.eps = eps
self.p = p
self.starting_idx = start_id
self.random_seed = random_seed
self.n_iter = n_iter

def algorithm(self, arr, uplimit):
def algorithm(self, x, uplimit):
"""
Directed sphere exclusion algorithm logic.
Directed sphere exclusion algorithm.

Given a reference point, sorts all points by distance to the reference point.
Then using a KDTree, the closest points are selected and a sphere
is built around the point. All points within the sphere are excluded
from the search. This process iterates until the number of selected
points is greater than `uplimit`, or the algorithm runs out of points
to select from.

Parameters
----------
arr: np.ndarray
Coordinate array of points.
x: np.ndarray
Feature matrix.
uplimit: int
Maximum number of points to select.

Returns
-------
selected: list
List of ids of selected molecules
List of ids of selected points.
"""
selected = []
count = 0
candidates = np.delete(np.arange(0, len(arr)), self.starting_idx)
distances = []
for idx in candidates:
ref_point = arr[self.starting_idx]
data_point = arr[idx]
distance = spatial.distance.minkowski(ref_point, data_point, p=self.p)
distances.append((distance, idx))
distances.sort()
order = [idx for dist, idx in distances]

kdtree = spatial.KDTree(arr)
bv = bitarray.bitarray(len(arr))

# calculate distance from reference point to all data points
ref_point = x[self.starting_idx]
distances = scipy.spatial.minkowski_distance(ref_point, x, p=self.p)
# order points by distance from reference
order = np.argsort(distances)
# Construct KDTree to make it easier to search neighbors
kdtree = spatial.KDTree(x)
# bv tracks viable candidates
bv = bitarray.bitarray(len(x))
bv[:] = 0
bv[self.starting_idx] = 1

# select points based on closest to reference point
selected = []
for idx in order:
if not bv[idx]:
# If point isn't already part of any hyperspheres
if bv[idx] == 0:
# Then add point to selection
selected.append(idx)
count += 1
if count > uplimit:
# finished selecting # of points required, return
if len(selected) > uplimit:
return selected
elim = kdtree.query_ball_point(arr[idx], self.r, eps=self.eps, p=self.p, workers=-1)
# find all points now within radius of newly selected point
elim = kdtree.query_ball_point(x[idx], self.r, eps=self.eps, p=self.p, workers=-1)
# turn 'on' bits in bv to make for loop skip indices of eliminated points
# eliminate points from selection
for index in elim:
bv[index] = 1

return selected

def select_from_cluster(self, arr, num_selected, cluster_ids=None):
def select_from_cluster(self, x, num_selected, cluster_ids=None):
"""
Algorithm that uses sphere_exclusion for selecting points from cluster.

Parameters
----------
arr: np.ndarray
Coordinate array of points
x: np.ndarray
Feature points.
num_selected: int
Number of molecules that need to be selected.
Number of points that need to be selected.
cluster_ids: np.ndarray
Indices of molecules that form a cluster
Indices of points that form a cluster

Returns
-------
selected: list
List of ids of selected molecules
"""
return predict_radius(self, arr, num_selected, cluster_ids)
if x.shape[0] < num_selected:
raise RuntimeError(
f"The number of selected points {num_selected} is greater than the number of points"
f"provided {x.shape[0]}."
)
return predict_radius(self, x, num_selected, cluster_ids)


class GridPartitioning(SelectionBase):
Expand Down
73 changes: 50 additions & 23 deletions DiverseSelector/methods/tests/test_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,36 +22,63 @@

"""Test Partition-Based Selection Methods."""

import numpy as np
from DiverseSelector.methods.partition import DirectedSphereExclusion, GridPartitioning, Medoid
from DiverseSelector.methods.tests.common import generate_synthetic_data
from numpy.testing import assert_equal
from numpy.testing import assert_equal, assert_raises


def test_directed_sphere_num_selected_error():
    """Check that requesting more points than the dataset holds raises RuntimeError."""
    # 100 rows are supplied, while 105 points are requested below
    points = np.tile([1, 9], (100, 1))
    dise = DirectedSphereExclusion()
    with assert_raises(RuntimeError):
        dise.select(points, num_selected=105)

def test_directedsphereexclusion():
"""Testing DirectedSphereExclusion class."""
coords, _, _ = generate_synthetic_data(n_samples=100,
n_features=2,
n_clusters=1,
pairwise_dist=True,
metric="euclidean",
random_state=42)

coords_cluster, class_labels_cluster, _ = generate_synthetic_data(n_samples=100,
n_features=2,
n_clusters=3,
pairwise_dist=True,
metric="euclidean",
random_state=42)
selector = DirectedSphereExclusion()
selected_ids = selector.select(arr=coords_cluster, size=12, labels=class_labels_cluster)
# make sure all the selected indices are the same with expectation
assert_equal(selected_ids, [95, 14, 88, 84, 76, 68, 93, 50, 29, 19, 54])
def test_directed_sphere_same_number_of_pts():
    """Test DirectedSphereExclusion when all three non-reference points are requested."""
    # the first row, (0, 0), serves as the reference point
    points = np.array([[0, y] for y in range(4)])
    dise = DirectedSphereExclusion(r_0=1, tolerance=0)
    picked = dise.select(arr=points, num_selected=3)
    assert_equal(picked, [1, 2, 3])
    # radius stored on the selector once the selection has finished
    assert_equal(dise.r, 0.5)

selector = DirectedSphereExclusion()
selected_ids = selector.select(arr=coords, size=12)
# make sure all the selected indices are the same with expectation
assert_equal(selected_ids, [17, 92, 64, 6, 12, 76, 10, 87, 73, 66, 11, 57])

def test_directed_sphere_exclusion_select_more_number_of_pts():
    """Test DirectedSphereExclusion on collinear points with `num_selected` < dataset size."""
    # the first row, (0, 0), serves as the reference point
    points = np.array([[0, y] for y in range(7)])
    dise = DirectedSphereExclusion(r_0=0.5, tolerance=0)
    picked = dise.select(arr=points, num_selected=3)
    assert_equal(picked, [1, 3, 5])
    # radius stored on the selector once the selection has finished
    assert_equal(dise.r, 1.0)


def test_directed_sphere_exclusion_on_line_with_():
    """Test DirectedSphereExclusion on collinear points, some spaced closer than the radius."""
    # the first row, (0, 0), serves as the reference point
    heights = [0, 1, 1.1, 1.2, 2, 3, 3.1, 3.2, 4, 5, 6]
    points = np.array([[0, y] for y in heights])
    dise = DirectedSphereExclusion(r_0=0.5, tolerance=0)
    picked = dise.select(arr=points, num_selected=3)
    assert_equal(picked, [1, 5, 9])
    # radius stored on the selector once the selection has finished
    assert_equal(dise.r, 1.0)


def test_directed_sphere_on_line_with_larger_radius():
    """Test DirectedSphereExclusion on collinear points when the initial radius is too large."""
    # the first row, (0, 0), serves as the reference point
    heights = [0, 1, 1.1, 1.2, 2, 3, 3.1, 3.2, 4, 5]
    points = np.array([[0, y] for y in heights])
    dise = DirectedSphereExclusion(r_0=2.0, tolerance=0)
    picked = dise.select(arr=points, num_selected=3)
    assert_equal(picked, [1, 5, 9])
    # radius stored on the selector once the selection has finished
    assert_equal(dise.r, 1.0)


def test_gridpartitioning():
Expand Down
Loading