From 099f1d87441168ebb4bc7a5528692dec511671a2 Mon Sep 17 00:00:00 2001
From: zhuwq
Date: Sun, 3 Nov 2024 00:36:25 -0700
Subject: [PATCH] update clustering

---
 scripts/run_event_association.py | 89 +++++++++++++++++++++-----------
 1 file changed, 59 insertions(+), 30 deletions(-)

diff --git a/scripts/run_event_association.py b/scripts/run_event_association.py
index b9d2d44..bfb95af 100644
--- a/scripts/run_event_association.py
+++ b/scripts/run_event_association.py
@@ -8,6 +8,8 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import torch
+import torch.nn.functional as F
 from args import parse_args
 from pyproj import Proj
 from scipy.sparse.csgraph import minimum_spanning_tree
@@ -25,9 +27,17 @@ def associate(
     VPVS_RATIO = config["VPVS_RATIO"]
     VP = config["VP"]
+    DT = 1.0  # seconds
+    MIN_STATION = 3
 
-    proj = Proj(proj="merc", datum="WGS84", units="km")
-    stations[["x_km", "y_km"]] = stations.apply(lambda x: pd.Series(proj(x.longitude, x.latitude)), axis=1)
+    # %%
+    t0 = min(events["event_time"].min(), picks["phase_time"].min())
+    events["timestamp"] = events["event_time"].apply(lambda x: (x - t0).total_seconds())
+    events["timestamp_center"] = events["center_time"].apply(lambda x: (x - t0).total_seconds())
+    picks["timestamp"] = picks["phase_time"].apply(lambda x: (x - t0).total_seconds())
+
+    # proj = Proj(proj="merc", datum="WGS84", units="km")
+    # stations[["x_km", "y_km"]] = stations.apply(lambda x: pd.Series(proj(x.longitude, x.latitude)), axis=1)
 
     # dist_matrix = squareform(pdist(stations[["x_km", "y_km"]].values))
     # mst = minimum_spanning_tree(dist_matrix)
@@ -37,24 +47,42 @@ def associate(
     # eps_t = 6.0
     # eps_xy = eps_t * VP * 2 / (1.0 + VPVS_RATIO)
     # print(f"eps_t: {eps_t:.3f}, eps_xy: {eps_xy:.3f}")
-    eps_xy = 30.0
-    print(f"eps_xy: {eps_xy:.3f}")
+    # eps_xy = 30.0
+    # print(f"eps_xy: {eps_xy:.3f}")
+
+    # %% Using DBSCAN to cluster events
+    # events = events.merge(stations[["station_id", "x_km", "y_km"]], on="station_id", how="left")
+
+    # scaling = np.array([1.0, 1.0 / eps_xy, 1.0 / eps_xy])
+    # clustering = DBSCAN(eps=2.0, min_samples=4).fit(events[["timestamp", "x_km", "y_km"]] * scaling)
+    # # clustering = DBSCAN(eps=2.0, min_samples=4).fit(events[["timestamp"]])
+    # # clustering = DBSCAN(eps=3.0, min_samples=3).fit(events[["timestamp"]])
+    # # clustering = DBSCAN(eps=1.0, min_samples=3).fit(events[["timestamp"]])
+    # events["event_index"] = clustering.labels_
+
+    ## Using histogram to cluster events
+    events["event_index"] = -1
+    t = np.arange(events["timestamp"].min(), events["timestamp"].max(), DT)
+    hist, _ = np.histogram(events["timestamp"], bins=t)
+    # find local maxima of the detection histogram using max_pool over a 5-bin (5-second) window
+    hist = torch.from_numpy(hist).float().unsqueeze(0).unsqueeze(0)
+    hist_pool = F.max_pool1d(hist, kernel_size=5, padding=2, stride=1)
+    # a bin is a local maximum where the pooled value equals the original value
+    mask = hist_pool == hist
+    hist = hist * mask
+    K = int((t[-1] - t[0]) / 10)  # assume at most 1 event per 10 seconds on average
+    topk_score, topk_index = torch.topk(hist, k=K)
+    topk_index = topk_index[topk_score > MIN_STATION]  # keep peaks with more than MIN_STATION picks
+    topk_index = topk_index.squeeze().numpy()
+    num_events = len(topk_index)
+    # assign events to each peak within a +/- 2 * DT window; add t[0] to convert bin index to seconds
+    t0 = t[0] + (topk_index - 2) * DT
+    t1 = t[0] + (topk_index + 2) * DT
+    timestamp = events["timestamp"].values
+    for i in tqdm(range(num_events), desc="Assigning event index"):
+        mask = (timestamp >= t0[i]) & (timestamp <= t1[i])
+        events.loc[mask, "event_index"] = i
-    # %%
-    t0 = min(events["event_time"].min(), picks["phase_time"].min())
-    events["timestamp"] = events["event_time"].apply(lambda x: (x - t0).total_seconds())
-    events["timestamp_center"] = events["center_time"].apply(lambda x: (x - t0).total_seconds())
-    picks["timestamp"] = picks["phase_time"].apply(lambda x: (x - t0).total_seconds())
-
-    # %%
-    events = events.merge(stations[["station_id", "x_km", "y_km"]], on="station_id", how="left")
-
-    scaling = np.array([1.0, 1.0 / eps_xy, 1.0 / eps_xy])
-    clustering = DBSCAN(eps=2.0, min_samples=4).fit(events[["timestamp", "x_km", "y_km"]] * scaling)
-    # clustering = DBSCAN(eps=2.0, min_samples=4).fit(events[["timestamp"]])
-    # clustering = DBSCAN(eps=3.0, min_samples=3).fit(events[["timestamp"]])
-    # clustering = DBSCAN(eps=1.0, min_samples=3).fit(events[["timestamp"]])
-    events["event_index"] = clustering.labels_
 
     print(f"Number of associated events: {len(events['event_index'].unique())}")
 
     # %% link picks to events
@@ -62,18 +90,19 @@
     picks.set_index("station_id", inplace=True)
 
     for group_id, event in tqdm(events.groupby("station_id"), desc="Linking picks to events"):
-        # travel time tt = (tp + ts) / 2 = (ps_ratio + 1)/2 * tp,
-        # (ts - tp) = (ps_ratio - 1) tp = tt * (ps_ratio + 1) * 2 * (ps_ratio - 1)
-        ps_delta = event["travel_time_s"] / (VPVS_RATIO + 1) * 2 * (VPVS_RATIO - 1)
-        t1 = event["timestamp_center"] - ps_delta * 1.2
-        t2 = event["timestamp_center"] + ps_delta * 1.2
-        index = event["event_index"]
-
-        mask = (picks.loc[group_id, "timestamp"].values[None, :] >= t1.values[:, None]) & (
-            picks.loc[group_id, "timestamp"].values[None, :] <= t2.values[:, None]
-        )
+        # travel time tt = (tp + ts) / 2 = (1 + ps_ratio) / 2 * tp => tp = tt * 2 / (1 + ps_ratio)
+        # (ts - tp) = (ps_ratio - 1) * tp = tt * 2 * (ps_ratio - 1) / (ps_ratio + 1)
+        ps_delta = event["travel_time_s"].values * 2 * (VPVS_RATIO - 1) / (VPVS_RATIO + 1)
+        t1 = event["timestamp_center"].values - ps_delta * 1.2
+        t2 = event["timestamp_center"].values + ps_delta * 1.2
+
+        picks_ = picks.loc[group_id, "timestamp"].values  # (Npk,)
+        mask = (picks_[None, :] >= t1[:, None]) & (picks_[None, :] <= t2[:, None])  # (Nev, Npk)
+        # picks.loc[group_id, "event_index"] = np.where(
+        #     mask.any(axis=0), index.values[mask.argmax(axis=0)], picks.loc[group_id, "event_index"]
+        # )
         picks.loc[group_id, "event_index"] = np.where(
-            mask.any(axis=0), index.values[mask.argmax(axis=0)], picks.loc[group_id, "event_index"]
+            mask.any(axis=0), event["event_index"].values[mask.argmax(axis=0)], -1
         )
 
     picks.reset_index(inplace=True)
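
Note: the histogram clustering added above amounts to 1-D non-maximum
suppression on a detection-count histogram. Below is a minimal standalone
sketch of the same idea, using synthetic timestamps and illustrative variable
names (not part of the patch):

import numpy as np
import torch
import torch.nn.functional as F

DT = 1.0         # histogram bin size in seconds
MIN_STATION = 3  # a peak must exceed this many picks to count as an event

# synthetic detection times: two bursts plus scattered false detections
rng = np.random.default_rng(0)
timestamps = np.concatenate(
    [
        20.0 + rng.normal(0, 0.1, 8),  # 8 stations detect an event near t = 20 s
        55.0 + rng.normal(0, 0.1, 7),  # 7 stations detect an event near t = 55 s
        rng.uniform(0, 80, 5),         # random noise detections
    ]
)

# detection counts per DT-wide bin
t = np.arange(timestamps.min(), timestamps.max(), DT)
hist, _ = np.histogram(timestamps, bins=t)

# non-maximum suppression: keep only bins that equal the local max in a 5-bin window
x = torch.from_numpy(hist).float().unsqueeze(0).unsqueeze(0)  # (1, 1, Nbins)
pooled = F.max_pool1d(x, kernel_size=5, padding=2, stride=1)
peaks = x * (pooled == x)

K = max(1, int((t[-1] - t[0]) / 10))  # assume at most ~1 event per 10 s
score, index = torch.topk(peaks, k=min(K, peaks.shape[-1]))
index = index[score > MIN_STATION].numpy()

# convert surviving bin indices back to seconds (t[0] is the bin origin)
event_times = np.sort(t[0] + index * DT)
print(f"{len(event_times)} events at t = {event_times}")

The max_pool1d/equality trick marks every bin that is the largest within its
window, so a burst of picks spread over a few adjacent bins collapses to a
single peak instead of producing one cluster per bin.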
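Note: the pick-linking loop relies on NumPy broadcasting: for each station it
builds an (Nev, Npk) boolean mask of which picks fall inside each event's
S-P window, takes the first matching event per pick via argmax, and falls
back to -1 for unmatched picks. A self-contained example with made-up times
and an assumed VPVS_RATIO of 1.73:

import numpy as np

VPVS_RATIO = 1.73

event_index = np.array([0, 1])         # event ids observed at this station
center_time = np.array([12.0, 50.0])   # (tp + ts) / 2 arrival time, seconds
travel_time = np.array([4.0, 6.0])     # mean travel time tt, seconds

# S-P time: ts - tp = tt * 2 * (ps_ratio - 1) / (ps_ratio + 1)
ps_delta = travel_time * 2 * (VPVS_RATIO - 1) / (VPVS_RATIO + 1)
t1 = center_time - ps_delta * 1.2      # window start (20% margin)
t2 = center_time + ps_delta * 1.2      # window end

picks = np.array([10.9, 13.2, 30.0, 49.1, 52.0])  # pick times, shape (Npk,)

# (Nev, Npk): True where pick j falls inside event i's window
mask = (picks[None, :] >= t1[:, None]) & (picks[None, :] <= t2[:, None])

# first matching event per pick; -1 where no window contains the pick
linked = np.where(mask.any(axis=0), event_index[mask.argmax(axis=0)], -1)
print(linked)  # [ 0  0 -1  1  1]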