From 9f746c2c68b7ea31db078049b2bf71eb2cad9f4e Mon Sep 17 00:00:00 2001
From: Fanwang Meng <fwmeng88@gmail.com>
Date: Sat, 8 Jul 2023 23:42:08 -0400
Subject: [PATCH] Rename `total_diversity_volume` -->
 `hypersphere_overlap_of_subset`

---
 DiverseSelector/diversity.py | 39 +++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/DiverseSelector/diversity.py b/DiverseSelector/diversity.py
index e34b7b08..798c730f 100644
--- a/DiverseSelector/diversity.py
+++ b/DiverseSelector/diversity.py
@@ -23,11 +23,11 @@
 
 """Molecule dataset diversity calculation module."""
 
+import warnings
 from typing import List
 
 import numpy as np
 from scipy.spatial.distance import euclidean
-import warnings
 
 __all__ = [
     "compute_diversity",
@@ -42,7 +42,7 @@
 
 def compute_diversity(
     features: np.array,
-    div_type: str = "total_diversity_volume",
+    div_type: str = "hypersphere_overlap_of_subset",
 ) -> float:
     """Compute diversity metrics.
 
@@ -53,7 +53,8 @@ def compute_diversity(
     div_type : str, optional
         Method of calculation diversity for a given molecule set, which
         includes "entropy", "logdet", "shannon_entropy", "wdud",
-        gini_coefficient" and "total_diversity_volume". Default is "total_diversity_volume".
+        "gini_coefficient" and "hypersphere_overlap_of_subset".
+        Default is "hypersphere_overlap_of_subset".
     mols : List[rdkit.Chem.rdchem.Mol], optional
         List of RDKit molecule objects. This is only needed when using the
         "explicit_diversity_index" method. Default=None.
@@ -68,7 +69,7 @@ def compute_diversity(
         "logdet": logdet,
         "shannon_entropy": shannon_entropy,
         "wdud": wdud,
-        "total_diversity_volume": total_diversity_volume,
+        "hypersphere_overlap_of_subset": hypersphere_overlap_of_subset,
         "gini_coefficient": gini_coefficient,
     }
 
@@ -201,7 +202,10 @@ def shannon_entropy(x: np.ndarray) -> float:
         p_i = np.count_nonzero(x[:, i]) / num_mols
         # sum all non-zero terms
         if p_i == 0:
-            raise ValueError(f"Feature {i} has value 0 for all molecules. Remove extraneous feature from data set.")
+            raise ValueError(
+                f"Feature {i} has value 0 for all molecules. "
+                "Remove extraneous feature from data set."
+            )
         h_x += (-1 * p_i) * np.log10(p_i)
     return h_x
 
@@ -246,7 +250,9 @@ def wdud(x: np.ndarray) -> float:
     min_x = np.min(x, axis=0)
     # Normalization of each feature to [0, 1]
     if np.any(np.abs(max_x - min_x) < 1e-30):
-        raise ValueError(f"One of the features is redundant and causes normalization to fail.")
+        raise ValueError(
+            "One of the features is redundant and causes normalization to fail."
+        )
     x_norm = (x - min_x) / (max_x - min_x)
     ans = []  # store the Wasserstein distance for each feature
     for i in range(0, num_features):
@@ -254,7 +260,7 @@ def wdud(x: np.ndarray) -> float:
         y = np.sort(x_norm[:, i])
         # Round to the sixth decimal place and count number of unique elements
         #    to construct an accurate cumulative discrete distribution func \sum_{x <= y_{i + 1}} 1/k
-        y, counts = np.unique(np.round(x_norm[:,i], decimals=6), return_counts=True)
+        y, counts = np.unique(np.round(x_norm[:, i], decimals=6), return_counts=True)
         p = 0
         # Ignore 0 and because v_min= 0
         for j in range(1, len(counts)):
@@ -264,7 +270,7 @@ def wdud(x: np.ndarray) -> float:
             # Make a grid from yi1 to yi
             grid = np.linspace(yi1, yi, num=1000, endpoint=True)
             # Evaluate the integrand  |x - \sum_{x <= y_{i + 1}} 1/k|
-            p += counts[j-1]
+            p += counts[j - 1]
             integrand = np.abs(grid - p / num_mols)
             # Integrate using np.trapz
             wdu += np.trapz(y=integrand, x=grid)
@@ -311,23 +317,28 @@ def hypersphere_overlap_of_subset(lib: np.ndarray, x: np.array) -> float:
     min_x = np.min(lib, axis=0)
     # Normalization of each feature to [0, 1]
     if np.any(np.abs(max_x - min_x) < 1e-30):
-        raise ValueError(f"One of the features is redundant and causes normalization to fail.")
+        raise ValueError(
+            "One of the features is redundant and causes normalization to fail."
+        )
     x_norm = (x - min_x) / (max_x - min_x)
     # r_o = hypersphere radius
     r_o = d * np.sqrt(1 / k)
     if r_o > 0.5:
-        warnings.warn(f"The number of molecules should be much larger"
-                      " than the number of features.")
+        warnings.warn(
+            "The number of molecules should be much larger"
+            " than the number of features."
+        )
     g_s = 0
     edge = 0
-    lam = (d - 1.0) / d   # Lambda parameter controls edge penalty
+    # lambda parameter controls edge penalty
+    lam = (d - 1.0) / d
     # calculate overlap volume
     for i in range(0, (k - 1)):
         for j in range((i + 1), k):
             dist = np.linalg.norm(x_norm[i] - x_norm[j])
             # Overlap penalty
             if dist <= (2 * r_o):
-                with np.errstate(divide='ignore'):
+                with np.errstate(divide="ignore"):
                     # min(100) ignores the inf case with divide by zero
                     g_s += min(100, 2 * (r_o / dist) - 1)
         # Edge penalty: lambda (1 - \sum^d_j e_{ij} / (dr_0)
@@ -342,7 +353,7 @@ def hypersphere_overlap_of_subset(lib: np.ndarray, x: np.array) -> float:
             if dist > r_o:
                 dist = r_o
             edge_pen += dist
-        edge_pen /= (d * r_o)
+        edge_pen /= d * r_o
         # print("Should be positive value only", (1.0 - edge_pen))
         edge_pen = lam * (1.0 - edge_pen)
         edge += edge_pen