
Commit

update

zhuwq0 committed Dec 2, 2023
1 parent fecb1a1 commit e8ea205
Showing 5 changed files with 481 additions and 116 deletions.
datasets/NCEDC/convert_hdf5.py: 118 changes (66 additions, 52 deletions)
@@ -6,7 +6,6 @@
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 
-import fsspec
 import h5py
 import numpy as np
 import obspy
@@ -19,10 +18,10 @@
 # %%
 root_path = "dataset"
-waveform_path = f"gs/waveform_mseed"
+waveform_path = f"{root_path}/waveform"
 catalog_path = f"{root_path}/catalog"
 station_path = f"{root_path}/station"
-result_path = f"dataset/waveform_h5"
+result_path = f"dataset/waveform_ps_h5"
 if not os.path.exists(result_path):
     os.makedirs(result_path)

@@ -104,7 +103,7 @@ def convert(i, year):
     tmp = datetime.strptime(jday, "%Y.%j")
 
     events = pd.read_csv(
-        f"{catalog_path}/{tmp.year:04d}.{tmp.month:02d}.event.csv", parse_dates=["event_time"])
+        f"{catalog_path}/{tmp.year:04d}.{tmp.month:02d}.event.csv", parse_dates=["time"])
     events.set_index("event_id", inplace=True)
 
     # phases = pd.read_csv(
@@ -115,7 +114,7 @@ def convert(i, year):
     #     )
 
     phases = pd.read_csv(
-        f"{catalog_path}/{tmp.year:04d}.{tmp.month:02d}.phase.csv",
+        f"{catalog_path}/{tmp.year:04d}.{tmp.month:02d}.phase_ps.csv",
         parse_dates=["phase_time"],
         date_format="%Y-%m-%dT%H:%M:%S.%f",
         dtype={"location": str},
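Later in the file the converter tests `(event_id, station_id, "P") not in phases.index`, which implies the phase table is indexed on (event_id, station_id, phase_type) after loading. A minimal sketch of that lookup pattern, with made-up rows and column names inferred from those checks:

```python
import pandas as pd

# Hypothetical phase rows shaped like the columns this script queries.
phases = pd.DataFrame(
    {
        "event_id": ["nc001", "nc001"],
        "station_id": ["NC.ABC..HH", "NC.ABC..HH"],
        "phase_type": ["P", "S"],
        "phase_time": pd.to_datetime(["2023-01-01T00:00:01.0", "2023-01-01T00:00:03.5"]),
    }
)
phases.set_index(["event_id", "station_id", "phase_type"], inplace=True)

# Same membership test the converter uses to drop stations lacking both picks.
print(("nc001", "NC.ABC..HH", "P") in phases.index)  # True
print(("nc001", "NC.ABC..HH", "S") in phases.index)  # True
# Earliest arrival for an event, as used to anchor the waveform window.
print(phases.loc["nc001", "phase_time"].min())
```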
@@ -137,7 +136,7 @@ def convert(i, year):
         for event_id in event_ids:
             gp = fp.create_group(event_id)
             gp.attrs["event_id"] = event_id
-            gp.attrs["event_time"] = events.loc[event_id, "event_time"].isoformat()
+            gp.attrs["event_time"] = events.loc[event_id, "time"].isoformat()
             gp.attrs["latitude"] = events.loc[event_id, "latitude"]
             gp.attrs["longitude"] = events.loc[event_id, "longitude"]
             gp.attrs["depth_km"] = events.loc[event_id, "depth_km"]
@@ -159,10 +158,10 @@ def convert(i, year):
             arrival_time = phases.loc[event_id, "phase_time"].min()
             begin_time = arrival_time - pd.Timedelta(seconds=30)
             end_time = arrival_time + pd.Timedelta(seconds=90)
-            gp.attrs["begin_time"] = begin_time.isoformat()
+            gp.attrs["begin_time"] = begin_time.strftime("%Y-%m-%dT%H:%M:%S.%f")
             gp.attrs["end_time"] = end_time.isoformat()
             gp.attrs["event_time_index"] = int(
-                (events.loc[event_id, "event_time"] - begin_time).total_seconds() * 100
+                (events.loc[event_id, "time"] - begin_time).total_seconds() * 100
             )
             gp.attrs["sampling_rate"] = sampling_rate
             gp.attrs["nt"] = 12000  # default 120s
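The `event_time_index` attribute converts the offset between the origin time and the window start into a sample index; the hard-coded `* 100` matches the 100 Hz rate implied by nt = 12000 samples over 120 s. A quick worked check with made-up timestamps:

```python
import pandas as pd

sampling_rate = 100  # Hz, inferred from nt=12000 over a 120 s window

begin_time = pd.Timestamp("2023-01-01T00:00:00.000")
event_time = pd.Timestamp("2023-01-01T00:00:12.340")

# Offset of the origin time from the window start, in samples at 100 Hz.
event_time_index = int((event_time - begin_time).total_seconds() * sampling_rate)
print(event_time_index)  # 1234
```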
@@ -172,58 +171,52 @@ def convert(i, year):
             gp.attrs["delta"] = 1 / sampling_rate
 
             # for station_id in event_id.glob("*"):
-            # station_ids = fs.glob(f"{bucket}/waveform_mseed/{year}/{jday}/{event_id}/*.mseed")
-            station_ids = glob(f"{waveform_path}/{year}/{jday}/{event_id}/*.mseed")
-            station_ids = [x.split("/")[-1].replace(".mseed", "") for x in station_ids]
-            for station_id in station_ids:
-                ds = gp.create_dataset(station_id, (3, gp.attrs["nt"]), dtype=np.float32)
-                tr = st.select(id=station_id + "?")
+            # station_channel_ids = fs.glob(f"{bucket}/waveform_mseed/{year}/{jday}/{event_id}/*.mseed")
+            station_channel_ids = glob(f"{waveform_path}/{year}/{jday}/{event_id}/*.mseed")
+            station_channel_ids = [x.split("/")[-1].replace(".mseed", "") for x in station_channel_ids]
+            for station_channel_id in station_channel_ids:
+                ds = gp.create_dataset(station_channel_id, (3, gp.attrs["nt"]), dtype=np.float32)
+                tr = st.select(id=station_channel_id + "?")
                 for t in tr:
                     if t.stats.sampling_rate != sampling_rate:
                         t.resample(sampling_rate)
 
                 chn = [tr.stats.channel for tr in tr]
                 chn = sorted(chn, key=lambda x: order[x[-1]])
                 components = []
-                if len(chn) == 3:
-                    for i, t in enumerate(tr):
-                        index0 = int(
-                            round(
-                                (
-                                    t.stats.starttime.datetime.replace(tzinfo=timezone.utc) - begin_time
-                                ).total_seconds()
-                                * sampling_rate
-                            )
-                        )
-                        if index0 > 3000:
-                            del ds
-                            break
-                        ds[i, index0 : index0 + len(t.data)] = (t.data - np.mean(t.data))[
-                            : len(ds[i, index0:])
-                        ] * 1e6
-                        components.append(t.stats.channel[-1])
-                else:
-                    for t in tr:
-                        index0 = int(
-                            round(
-                                (
-                                    t.stats.starttime.datetime.replace(tzinfo=timezone.utc) - begin_time
-                                ).total_seconds()
-                                * sampling_rate
-                            )
-                        )
-                        if index0 > 3000:
-                            del ds
-                            break
-                        i = comp2idx[t.stats.channel[-1]]
-                        ds[i, index0 : index0 + len(t.data)] = (t.data - np.mean(t.data))[
-                            : len(ds[i, index0:])
-                        ] * 1e6
-                        components.append(t.stats.channel[-1])
+                for i, t in enumerate(tr):
+                    index0 = int(
+                        round(
+                            (
+                                t.stats.starttime.datetime.replace(tzinfo=timezone.utc) - begin_time
+                            ).total_seconds()
+                            * sampling_rate
+                        )
+                    )
+                    if index0 > 3000:
+                        del ds
+                        break
+                    if index0 > 0:
+                        it1 = 0
+                        it2 = index0
+                        ll = min(len(t.data), len(ds[i, it2:]))
+                    elif index0 < 0:
+                        it1 = -index0
+                        it2 = 0
+                        ll = min(len(t.data[it1:]), len(ds[i, :]))
+                    else:
+                        it1 = 0
+                        it2 = 0
+                        ll = min(len(t.data), len(ds[i, :]))
+
+                    if len(chn) != 3:
+                        i = comp2idx[t.stats.channel[-1]]
+                    # ds[i, index0 : index0 + len(t.data)] = (t.data - np.mean(t.data))[
+                    #     : len(ds[i, index0:])
+                    # ] * 1e6
+                    ds[i, it2 : it2 + ll] = (t.data - np.mean(t.data))[it1 : it1 + ll] * 1e6
+                    components.append(t.stats.channel[-1])
 
                 if index0 > 3000:
                     continue
-                network, station, location, instrument = station_id.split(".")
+                network, station, location, instrument = station_channel_id.split(".")
                 ds.attrs["network"] = network
                 ds.attrs["station"] = station
                 ds.attrs["location"] = location
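The rewritten loop merges the 3-component and incomplete-component branches and, unlike the old `ds[i, index0 : ...]` slicing, handles traces that start before the window (`index0 < 0`): `it1` is the read offset into the trace, `it2` the write offset into the window, and `ll` the overlap length. A standalone sketch of that alignment, with assumed names and a single component:

```python
import numpy as np

def place_trace(data: np.ndarray, index0: int, nt: int = 12000) -> np.ndarray:
    """Copy a demeaned trace into a fixed-length window.

    index0 is the trace start relative to the window start, in samples;
    negative values mean the trace begins before the window.
    """
    out = np.zeros(nt, dtype=np.float32)
    if index0 > 0:
        it1, it2 = 0, index0          # trace starts inside the window
        ll = min(len(data), nt - it2)
    elif index0 < 0:
        it1, it2 = -index0, 0         # drop the samples before the window
        ll = min(len(data) - it1, nt)
    else:
        it1, it2 = 0, 0
        ll = min(len(data), nt)
    # Demean, scale (the script multiplies by 1e6), and copy the overlap.
    out[it2 : it2 + ll] = (data - np.mean(data))[it1 : it1 + ll] * 1e6
    return out

# A trace starting 5 samples before the window: its first 5 samples are dropped.
trace = np.arange(20, dtype=np.float32)
print(place_trace(trace, index0=-5, nt=10))
```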
@@ -251,12 +244,15 @@ def convert(i, year):
 
                ## pick not exist
                if (event_id, station_id, "P") not in phases.index:
-                    del ds
+                    # del ds
+                    del fp[f"{event_id}/{station_channel_id}"]
                    # print(f"{jday.name}.{event_id}.{network}.{station}.{location} not in P index")
                    continue
 
                if (event_id, station_id, "S") not in phases.index:
-                    del ds
+                    # del ds
+                    del fp[f"{event_id}/{station_channel_id}"]
                    # print(f"{jday.name}.{event_id}.{network}.{station}.{location} not in S index")
                    continue
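The switch from `del ds` to `del fp[f"{event_id}/{station_channel_id}"]` is significant in h5py: `del ds` only drops the Python handle and leaves the dataset in the file, whereas deleting the key unlinks it. A minimal illustration with a hypothetical throwaway file:

```python
import h5py
import numpy as np

with h5py.File("scratch.h5", "w") as fp:  # hypothetical throwaway file
    ds = fp.create_dataset("ev1/NC.ABC..HH", data=np.zeros(8))
    del ds                       # removes only the Python reference; data stays
    print("ev1" in fp)           # True
    del fp["ev1/NC.ABC..HH"]     # actually unlinks the dataset from the file
    print(len(fp["ev1"]))        # 0
```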
@@ -271,7 +267,8 @@ def convert(i, year):
 
                if len(p_picks[p_picks["event_id"] == event_id]) == 0:
                    print(f"{jday.name}.{event_id}.{network}.{station}.{location}: no picks")
-                    del ds
+                    # del ds
+                    del fp[f"{event_id}/{station_channel_id}"]
                    continue
 
                pick = p_picks[p_picks["event_id"] == event_id].iloc[0]
@@ -287,8 +284,10 @@ def convert(i, year):
 
                if max(snr) == 0:
                    # print(f"{jday.name}.{event_id}.{network}.{station}.{location}: snr={snr}")
-                    del ds
+                    # del ds
+                    del fp[f"{event_id}/{station_channel_id}"]
                    continue
+
                ds.attrs["snr"] = snr
 
                (
@@ -320,6 +319,21 @@ def convert(i, year):
 # for x in enumerate(years):
 #     convert(*x)
 
+# # # check hdf5
+# with h5py.File("dataset/waveform_ps_h5/2022.h5", "r") as fp:
+#     for event_id in fp:
+#         print(event_id)
+#         for k in sorted(fp[event_id].attrs.keys()):
+#             print(k, fp[event_id].attrs[k])
+#         print("-------------")
+#         for station_id in fp[event_id]:
+#             print(station_id)
+#             print(fp[event_id][station_id].shape)
+#             for k in sorted(fp[event_id][station_id].attrs.keys()):
+#                 print(k, fp[event_id][station_id].attrs[k])
+#             print("=============")
+#         raise
+
 ncpu = len(years)
 ctx = mp.get_context("spawn")
 with ctx.Pool(ncpu) as pool:
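The diff is truncated inside the `with ctx.Pool(ncpu) as pool:` block, so the actual dispatch call is not shown. A minimal sketch of how a spawn-context pool typically fans `convert(i, year)` out over the years; the `starmap` over `enumerate(years)` is an assumption matching the function's signature, not the commit's code:

```python
import multiprocessing as mp

def convert(i: int, year: str) -> None:
    print(f"worker {i}: processing {year}")  # stand-in for the real converter

if __name__ == "__main__":
    years = ["2021", "2022", "2023"]
    ncpu = len(years)
    ctx = mp.get_context("spawn")  # spawn avoids forking HDF5/obspy state into workers
    with ctx.Pool(ncpu) as pool:
        # Assumed dispatch: one (index, year) tuple per worker.
        pool.starmap(convert, enumerate(years))
```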