theochem · FanwangM · Jul 14, 2023 · Jun 19, 2023 · Jun 23, 2023 · Jun 23, 2023
diff --git a/DiverseSelector/methods/dissimilarity.py b/DiverseSelector/methods/dissimilarity.py
@@ -23,7 +23,7 @@
 """Module for Dissimilarity-Based Selection Methods."""
 
 from DiverseSelector.methods.base import SelectionBase
-from DiverseSelector.methods.utils import predict_radius
+from DiverseSelector.methods.utils import optimize_radius
 import numpy as np
 from scipy import spatial
 
@@ -171,7 +171,7 @@ class OptiSim(SelectionBase):
     Adapted from  https://doi.org/10.1021/ci970282v
     """
 
-    def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_seed=42):
+    def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_seed=42, n_iter=10):
         """
         Initializing class.
 
@@ -197,6 +197,8 @@ def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_s
             Index for the first point to be selected.
         random_seed: int
             Seed for random selection of points be evaluated.
+        n_iter: int
+            Number of iterations to execute when optimizing the size of exclusion radius. Default is 10.
         """
         self.r = r
         self.k = k
@@ -205,6 +207,7 @@ def __init__(self, r=None, k=10, tolerance=5.0, eps=0, p=2, start_id=0, random_s
         self.p = p
         self.start_id = start_id
         self.random_seed = random_seed
+        self.n_iter = n_iter
 
     def algorithm(self, arr, uplimit) -> list:
         """
@@ -272,6 +275,6 @@ def select_from_cluster(self, arr, num_selected, cluster_ids=None):
         selected: list
             List of ids of selected molecules
         """
-        return predict_radius(self, arr, num_selected, cluster_ids)
+        return optimize_radius(self, arr, num_selected, cluster_ids)
 
 
diff --git a/DiverseSelector/methods/partition.py b/DiverseSelector/methods/partition.py
@@ -26,6 +26,8 @@
 import math
 
 import bitarray
+import scipy.spatial
+
 from DiverseSelector.methods.base import SelectionBase
 from DiverseSelector.diversity import compute_diversity
 from DiverseSelector.methods.utils import predict_radius
@@ -45,27 +47,32 @@
 class DirectedSphereExclusion(SelectionBase):
     """Selecting points using Directed Sphere Exclusion algorithm.
 
-    Starting point is chosen as the reference point and not included in the selected molecules. The
+    Starting point is chosen as the reference point
+    and not included in the selected molecules. The
     distance of each point is calculated to the reference point and the points are then sorted based
     on the ascending order of distances. The points are then evaluated in their sorted order, and
-    are selected if their distance to all the other selected points is at least r away. Euclidian
+    are selected if their distance to all the other selected points is at least r away. Euclidean
     distance is used by default and the r value is automatically generated if not passed to satisfy
     the number of molecules requested.
 
-    Adapted from https://doi.org/10.1021/ci025554v
+    Notes
+    -----
+    Gobbi, A., and Lee, M.-L. (2002). DISE: directed sphere exclusion.
+    Journal of Chemical Information and Computer Sciences,
+    43(1), 317–323. https://doi.org/10.1021/ci025554v
     """
 
-    def __init__(self, r=None, tolerance=5.0, eps=0, p=2, start_id=0, random_seed=42):
+    def __init__(self, r_0=None, tolerance=0.05, eps=1e-8, p=2, start_id=0, random_seed=42, n_iter=10):
         """
         Initializing class.
 
         Parameters
         ----------
-        r: float
+        r_0: float
             Initial guess of radius for directed sphere exclusion algorithm, no points within r
             distance to an already selected point can be selected.
         tolerance: float
-            Percentage error of number of molecules actually selected from number of molecules
+            Percentage error of number of points actually selected from number of points
             requested.
         eps: float
             Approximate nearest neighbor search for eliminating close points. Branches of the tree
@@ -75,81 +82,98 @@ def __init__(self, r=None, tolerance=5.0, eps=0, p=2, start_id=0, random_seed=42
             Which Minkowski p-norm to use. Should be in the range [1, inf]. A finite large p may
             cause a ValueError if overflow can occur.
         start_id: int
-            Index for the first point to be selected.
+            Index for the first point to be selected. Default is 0.
         random_seed: int
             Seed for random selection of points be evaluated.
+        n_iter: int
+            Number of iterations to execute when optimizing the size of exclusion radius. Default is 10.
         """
-        self.r = r
+        self.r = r_0
         self.tolerance = tolerance
         self.eps = eps
         self.p = p
         self.starting_idx = start_id
         self.random_seed = random_seed
+        self.n_iter = n_iter
 
-    def algorithm(self, arr, uplimit):
+    def algorithm(self, x, uplimit):
         """
-        Directed sphere exclusion algorithm logic.
+        Directed sphere exclusion algorithm.
+
+        Given a reference point, sorts all points by distance to the reference point.
+        Points are then visited in that order: each point not yet excluded is
+        selected, and a KDTree query excludes every point within radius `r` of it
+        from later selection. This process iterates until the number of selected
+        points is greater than `uplimit`, or the algorithm runs out of points
+        to select from.
 
         Parameters
         ----------
-        arr: np.ndarray
-            Coordinate array of points.
+        x: np.ndarray
+            Feature matrix.
         uplimit: int
             Maximum number of points to select.
 
         Returns
         -------
         selected: list
-            List of ids of selected molecules
+            List of ids of selected points (may contain `uplimit` + 1 entries).
         """
-        selected = []
-        count = 0
-        candidates = np.delete(np.arange(0, len(arr)), self.starting_idx)
-        distances = []
-        for idx in candidates:
-            ref_point = arr[self.starting_idx]
-            data_point = arr[idx]
-            distance = spatial.distance.minkowski(ref_point, data_point, p=self.p)
-            distances.append((distance, idx))
-        distances.sort()
-        order = [idx for dist, idx in distances]
-
-        kdtree = spatial.KDTree(arr)
-        bv = bitarray.bitarray(len(arr))
+
+        # calculate distance from reference point to all data points
+        ref_point = x[self.starting_idx]
+        distances = scipy.spatial.minkowski_distance(ref_point, x, p=self.p)
+        # order points by distance from reference
+        order = np.argsort(distances)
+        # Construct KDTree to make it easier to search neighbors
+        kdtree = spatial.KDTree(x)
+        # bv flags excluded points: a set bit means the point can no longer be selected
+        bv = bitarray.bitarray(len(x))
         bv[:] = 0
         bv[self.starting_idx] = 1
-
+        # visit points from closest to farthest from the reference point
+        selected = []
         for idx in order:
-            if not bv[idx]:
+            # If point isn't inside any exclusion sphere yet
+            if bv[idx] == 0:
+                # Then add point to selection
                 selected.append(idx)
-                count += 1
-                if count > uplimit:
+                # more than `uplimit` points selected, stop early and return
+                if len(selected) > uplimit:
                     return selected
-                elim = kdtree.query_ball_point(arr[idx], self.r, eps=self.eps, p=self.p, workers=-1)
+                # find all points now within radius of newly selected point
+                elim = kdtree.query_ball_point(x[idx], self.r, eps=self.eps, p=self.p, workers=-1)
+                # set the bits of all points inside the sphere so that the loop
+                # skips them, i.e. they are eliminated from the selection
                 for index in elim:
                     bv[index] = 1
 
         return selected
 
-    def select_from_cluster(self, arr, num_selected, cluster_ids=None):
+    def select_from_cluster(self, x, num_selected, cluster_ids=None):
         """
         Algorithm that uses sphere_exclusion for selecting points from cluster.
 
         Parameters
         ----------
-        arr: np.ndarray
-            Coordinate array of points
+        x: np.ndarray
+            Feature matrix; each row is one point to select from.
         num_selected: int
-            Number of molecules that need to be selected.
+            Number of points to select; RuntimeError is raised if it exceeds len(x).
         cluster_ids: np.ndarray
-            Indices of molecules that form a cluster
+            Indices of points that form a cluster
 
         Returns
         -------
         selected: list
             List of ids of selected molecules
         """
-        return predict_radius(self, arr, num_selected, cluster_ids)
+        if x.shape[0] < num_selected:
+            raise RuntimeError(
+                f"The number of selected points {num_selected} is greater than the number of points"
+                f" provided {x.shape[0]}."
+            )
+        return predict_radius(self, x, num_selected, cluster_ids)
 
 
 class GridPartitioning(SelectionBase):

diff --git a/DiverseSelector/methods/tests/test_partition.py b/DiverseSelector/methods/tests/test_partition.py
@@ -22,36 +22,63 @@
 
 """Test Partition-Based Selection Methods."""
 
+import numpy as np
 from DiverseSelector.methods.partition import DirectedSphereExclusion, GridPartitioning, Medoid
 from DiverseSelector.methods.tests.common import generate_synthetic_data
-from numpy.testing import assert_equal
+from numpy.testing import assert_equal, assert_raises
 
 
+def test_directed_sphere_num_selected_error():
+    """Test DirectedSphereExclusion raises RuntimeError if too many points are requested."""
+    x = np.array([[1, 9]]*100)
+    selector = DirectedSphereExclusion()
+    assert_raises(RuntimeError, selector.select, x, num_selected=105)
 
-def test_directedsphereexclusion():
-    """Testing DirectedSphereExclusion class."""
-    coords, _, _ = generate_synthetic_data(n_samples=100,
-                                           n_features=2,
-                                           n_clusters=1,
-                                           pairwise_dist=True,
-                                           metric="euclidean",
-                                           random_state=42)
 
-    coords_cluster, class_labels_cluster, _ = generate_synthetic_data(n_samples=100,
-                                                                      n_features=2,
-                                                                      n_clusters=3,
-                                                                      pairwise_dist=True,
-                                                                      metric="euclidean",
-                                                                      random_state=42)
-    selector = DirectedSphereExclusion()
-    selected_ids = selector.select(arr=coords_cluster, size=12, labels=class_labels_cluster)
-    # make sure all the selected indices are the same with expectation
-    assert_equal(selected_ids, [95, 14, 88, 84, 76, 68, 93, 50, 29, 19, 54])
+def test_directed_sphere_same_number_of_pts():
+    """Test DirectedSphereExclusion with `num_selected` = number of non-reference points."""
+    # (0,0) as the reference point
+    x = np.array([[0,0],[0,1],[0,2],[0,3]])
+    selector = DirectedSphereExclusion(r_0=1, tolerance=0)
+    selected = selector.select(arr=x, num_selected=3)
+    expected = [1,2,3]
+    assert_equal(selected, expected)
+    assert_equal(selector.r, 0.5)
 
-    selector = DirectedSphereExclusion()
-    selected_ids = selector.select(arr=coords, size=12)
-    # make sure all the selected indices are the same with expectation
-    assert_equal(selected_ids, [17, 92, 64, 6, 12, 76, 10, 87, 73, 66, 11, 57])
+
+def test_directed_sphere_exclusion_select_more_number_of_pts():
+    """Test DirectedSphereExclusion on collinear points with `num_selected` < number of points."""
+    # (0,0) as the reference point
+    x = np.array([[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6]])
+    selector = DirectedSphereExclusion(r_0=0.5, tolerance=0)
+    selected = selector.select(arr=x, num_selected=3)
+    expected = [1, 3, 5]
+    assert_equal(selected, expected)
+    assert_equal(selector.r, 1.0)
+
+
+def test_directed_sphere_exclusion_on_line_with_():
+    """Test DirectedSphereExclusion on collinear points, some spaced closer than the radius."""
+    # (0,0) as the reference point
+    x = np.array([[0, 0], [0, 1], [0, 1.1], [0, 1.2], [0, 2],
+                  [0, 3], [0, 3.1], [0, 3.2], [0, 4], [0, 5], [0, 6]])
+    selector = DirectedSphereExclusion(r_0=0.5, tolerance=0)
+    selected = selector.select(arr=x, num_selected=3)
+    expected = [1, 5, 9]
+    assert_equal(selected, expected)
+    assert_equal(selector.r, 1.0)
+
+
+def test_directed_sphere_on_line_with_larger_radius():
+    """Test DirectedSphereExclusion on collinear points when the initial radius is too large."""
+    # (0,0) as the reference point
+    x = np.array([[0, 0], [0, 1], [0, 1.1], [0, 1.2], [0, 2],
+                  [0, 3], [0, 3.1], [0, 3.2], [0, 4], [0, 5]])
+    selector = DirectedSphereExclusion(r_0=2.0, tolerance=0)
+    selected = selector.select(arr=x, num_selected=3)
+    expected = [1, 5, 9]
+    assert_equal(selected, expected)
+    assert_equal(selector.r, 1.0)
 
 
 def test_gridpartitioning():