Merge pull request #358 from GEMScienceTools/add_h5_cat_clipper
WIP: Add h5 cat clipper
kejohnso authored Nov 2, 2023
2 parents 61723cb + 27b110d commit 96d1ebc
Showing 12 changed files with 276 additions and 38 deletions.
9 changes: 5 additions & 4 deletions openquake/cat/hmg/check.py
@@ -125,8 +125,8 @@ def process(cat, sidx, delta_ll, delta_t, fname_geojson, use_kms=False):
# datetime can only cover 548 years starting in 1677
# general advice will be to exclude historic events and
# add those later
subcat = cat[cat['year'] > 1800]

subcat = cat[(cat['year'] > 1800) & (cat['value'] > 1.0)]
for index, row in tqdm(subcat.iterrows()):
# Take the index from delta_ll - this is needed
# when delta_ll varies with time.
@@ -137,7 +137,7 @@ def process(cat, sidx, delta_ll, delta_t, fname_geojson, use_kms=False):
ll_thrs = ll_d[idx_t][idx_mag]
sel_thrs = time_d[idx_t][idx_mag]
sel_thrs = sel_thrs.total_seconds()

# Find events close in time
tmp_dff = abs(subcat.loc[:, 'datetime'] - pd.to_datetime(row.datetime))
threshold = datetime.timedelta(seconds=sel_thrs)
@@ -150,7 +150,8 @@ def process(cat, sidx, delta_ll, delta_t, fname_geojson, use_kms=False):
minla = row.latitude - ll_thrs
maxlo = row.longitude + ll_thrs
maxla = row.latitude + ll_thrs
idx_dist = list(sidx.intersection((minlo, minla, maxlo, maxla)))
idx_dist_ind = list(sidx.intersection((minlo, minla, maxlo, maxla)))
idx_dist = cat.index[idx_dist_ind]

else:
tmp_dist = abs(geodetic_distance(
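The two added lines above matter because the ids returned by `sidx.intersection` are positions in whatever order the spatial index was built, not `cat`'s DataFrame labels; mapping them through `cat.index` keeps the later label-based lookups correct. A minimal sketch of the distinction, assuming `sidx` is an `rtree` index built over positional row order (the diff does not show how `sidx` is constructed):

```python
import pandas as pd
from rtree import index

# Catalogue whose labels differ from positions 0, 1, 2
cat = pd.DataFrame(
    {"longitude": [10.0, 10.5, 30.0], "latitude": [45.0, 45.2, 50.0]},
    index=[101, 205, 309],
)

# Build the spatial index over positional order
sidx = index.Index()
for pos, (lon, lat) in enumerate(zip(cat.longitude, cat.latitude)):
    sidx.insert(pos, (lon, lat, lon, lat))

# A bounding box around the first two events returns positions {0, 1},
# which must be mapped to labels before using cat.loc
idx_dist_ind = list(sidx.intersection((9.5, 44.5, 11.0, 46.0)))
idx_dist = cat.index[idx_dist_ind]  # labels [101, 205]
print(cat.loc[idx_dist])
```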
128 changes: 123 additions & 5 deletions openquake/cat/hmg/hmg.py
@@ -33,6 +33,115 @@
import pandas as pd


def apply_mag_conversion_rule_keep_all(low_mags, conv_eqs, rows, save):
"""
This function applies sequentially a set of rules to the information in
the `save` :class:`pandas.DataFrame` instance.
:param low_mags:
A list
:param conv_eqs:
A list
:param rows:
The :class:`pandas.DataFrame` instance containing the information
still to be processed
:param save:
One :class:`pandas.DataFrame` instance
:return:
One :class:`pandas.DataFrame` instances with the
homogenised magnitudes.
"""

# Formatting
fmt2 = "m[m >= {:.2f}]"

# Temporary assigning magnitude
m = np.round(rows['value'].values, 3)
tmp = np.zeros_like(m)

for mlow, conversion in zip(low_mags, conv_eqs):
m_inds = m >= mlow
if conversion == 'm':
tmp[m_inds] = m[m_inds]
else:
tmpstr = re.sub('m', fmt2.format(mlow), conversion)
cmd = "tmp[m >= {:.2f}] = {:s}".format(mlow, tmpstr)

try:
exec(cmd)
except ValueError:
fmt = 'Cannot execute the following conversion rule:\n{:s}'
print(fmt.format(conversion))

rows = rows.copy()
rows.loc[:, 'magMw'] = tmp
save = save.copy()
save = pd.concat([save, rows], ignore_index=True, sort=False)

return save

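To make the loop above concrete, here is a hand-worked sketch (values assumed, not from the commit) using the NEIC mb rule from the test settings further down in this diff: `low_mags = [5.0, 6.0]`, `conv_eqs = ["0.8 * m + 0.2", "m"]`. Each rule overwrites the previous one for the magnitudes it covers, so the equation that finally applies to a value is the one with the highest threshold not exceeding it:

```python
import numpy as np

m = np.array([4.5, 5.2, 5.7, 6.8])       # assumed mb values
tmp = np.zeros_like(m)

tmp[m >= 5.0] = 0.8 * m[m >= 5.0] + 0.2  # first rule covers m >= 5.0
tmp[m >= 6.0] = m[m >= 6.0]              # second rule overwrites for m >= 6.0

print(tmp)                               # [0.   4.36 4.76 6.8 ]
```

The 4.5 event keeps the 0.0 placeholder: this `keep_all` variant retains such rows, whereas `apply_mag_conversion_rule` below now drops them explicitly.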

def process_magnitude_keep_all(work, mag_rules):
"""
:param work:
A :class:`pandas.DataFrame` instance obtained by joining the origin
and magnitude dataframes
:param mag_rules:
A dictionary with the rules to be used for processing the catalogue
:return:
One :class:`pandas.DataFrame` instance with the homogenised
catalogue.
"""

# Add a column for destination
if "magMw" not in list(work.columns):
work["magMw"] = np.nan

# This is a new dataframe used to store the processed events
save = pd.DataFrame(columns=work.columns)

# Looping over agencies
for agency in mag_rules.keys():
print(' Agency: {:s} ('.format(agency), end="")

# Looping over magnitude-types
for mag_type in mag_rules[agency].keys():
print('{:s} '.format(mag_type), end='')

# Create the first selection condition and select rows
cond = get_mag_selection_condition(agency, mag_type)
try:
rows = work.loc[eval(cond), :]
except ValueError:
fmt = 'Cannot evaluate the following condition:\n {:s}'
print(fmt.format(cond))

# TODO
# This is an initial solution that is not ideal since it does
# not take the best information available.
# Remove duplicates. This can happen when we process a magnitude
# type without specifying the agency

flag = rows["eventID"].duplicated(keep='first')

if any(flag):
# this line is so that the larger M is taken - experimental, based on MEX issue
rows = rows.sort_values("value", ascending=False).drop_duplicates('eventID').sort_index()
#tmp = sorted_rows[~flag].copy()
#rows = tmp

# Magnitude conversion
if len(rows) > 0:
low_mags = mag_rules[agency][mag_type]['low_mags']
conv_eqs = mag_rules[agency][mag_type]['conv_eqs']
save = apply_mag_conversion_rule_keep_all(low_mags, conv_eqs,
rows, save)
print(")")

return save

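For reference, here is a sketch of the `mag_rules` mapping the two loops above walk (agency → magnitude type → rule), mirroring the TOML settings used in the new test at the bottom of this diff:

```python
mag_rules = {
    "NEIC": {                                # agency
        "mb": {                              # magnitude type
            "low_mags": [5.0, 6.0],          # rule thresholds
            "conv_eqs": ["0.8 * m + 0.2", "m"],
            "sigma": [0.283],                # carried in the settings; not read by this loop
        }
    }
}
```

The duplicate handling keeps, for each `eventID`, the row with the largest `value`: `sort_values` descending puts it first, `drop_duplicates('eventID')` retains that first occurrence, and `sort_index()` restores the original row order.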

def apply_mag_conversion_rule(low_mags, conv_eqs, rows, save, work):
"""
This function applies sequentially a set of rules to the information in
@@ -59,12 +168,13 @@ def apply_mag_conversion_rule(low_mags, conv_eqs, rows, save, work):
fmt2 = "m[m >= {:.2f}]"

# Temporary assigning magnitude
m = rows['value'].values
m = np.round(rows['value'].values, 3)
tmp = np.zeros_like(m)

for mlow, conversion in zip(low_mags, conv_eqs):
m_inds = m >= mlow
if conversion == 'm':
tmp = m
tmp[m_inds] = m[m_inds]
else:
tmpstr = re.sub('m', fmt2.format(mlow), conversion)
cmd = "tmp[m >= {:.2f}] = {:s}".format(mlow, tmpstr)
@@ -77,6 +187,7 @@ def apply_mag_conversion_rule(low_mags, conv_eqs, rows, save, work):

rows = rows.copy()
rows.loc[:, 'magMw'] = tmp
rows = rows.drop(rows[rows['magMw']==0.0].index)
save = save.copy()
save = pd.concat([save, rows], ignore_index=True, sort=False)

@@ -205,10 +316,14 @@ def process_magnitude(work, mag_rules):
# not take the best information available.
# Remove duplicates. This can happen when we process a magnitude
# type without specifying the agency

flag = rows["eventID"].duplicated(keep='first')

if any(flag):
tmp = rows[~flag].copy()
rows = tmp
# this line is so that the larger M is taken - experimental, based on MEX issue
rows = rows.sort_values("value", ascending=False).drop_duplicates('eventID').sort_index()
#tmp = sorted_rows[~flag].copy()
#rows = tmp

# Magnitude conversion
if len(rows) > 0:
@@ -262,6 +377,9 @@ def process_dfs(odf_fname, mdf_fname, settings_fname=None):
work = pd.merge(odf, mdf, on=["eventID"])
save, work = process_magnitude(work, rules['magnitude'])

work_all_m = pd.merge(odf, mdf, on=["eventID"])
save_all_m = process_magnitude_keep_all(work_all_m, rules['magnitude'])

print("Number of origins with final mag type {:d}\n".format(len(save)))

computed = len(save)
Expand All @@ -271,4 +389,4 @@ def process_dfs(odf_fname, mdf_fname, settings_fname=None):
msg = fmt.format(computed - expected)
warnings.warn(msg)

return save, work
return save, work, save_all_m
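With this change `process_dfs` returns three objects rather than two, so existing call sites that unpack two values will need updating. A hypothetical call (file names are placeholders):

```python
save, work, save_all_m = process_dfs("otab.h5", "mtab.h5", "settings.toml")
```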
7 changes: 4 additions & 3 deletions openquake/cat/hmg/plot.py
@@ -56,7 +56,7 @@ def get_hists(df, bins, agencies=None, column="magMw"):
out = []
out_agencies = []
for key in agencies:
mw = df[df['magAgency'] == key][column]
mw = df[df['magAgency'] == key][column].apply(lambda x: round(x, 5))
if len(mw):
hist, _ = np.histogram(mw, bins=bins)
out.append(hist)
@@ -212,7 +212,8 @@ def plot_histogram(df, agencies=None, wdt=0.1, column="magMw",
mpl.rcParams['axes.labelsize'] = 16

# Data
mw = df[column].values
# mw = df[column].values
mw = df[column].apply(lambda x: round(x, 5)).values

# Creating bins and total histogram
mmi = np.floor(min(mw)/wdt)*wdt-wdt
@@ -259,7 +260,7 @@
# Save figure
folder = os.path.dirname(fname)
Path(folder).mkdir(parents=True, exist_ok=True)
plt.savefig(fname)
plt.savefig(fname, bbox_inches='tight')

if "xlim" in kwargs:
ax.set_xlim(kwargs["xlim"])
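The `round(x, 5)` calls added in this file guard against floating-point noise from the magnitude conversions pushing a value across a histogram bin edge. A small self-contained illustration (assumed values, independent of the plotting code):

```python
import numpy as np

wdt = 0.1
mw = np.array([0.1 + 0.7])       # 0.7999999999999999 from float arithmetic
bins = np.arange(0.0, 1.1, wdt)

hist_raw, _ = np.histogram(mw, bins=bins)
hist_rnd, _ = np.histogram(np.round(mw, 5), bins=bins)
print(hist_raw[7], hist_raw[8])  # 1 0 -- counted in the [0.7, 0.8) bin
print(hist_rnd[7], hist_rnd[8])  # 0 1 -- counted in [0.8, 0.9) as intended
```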
31 changes: 26 additions & 5 deletions openquake/cat/hmg/utils.py
@@ -25,27 +25,48 @@
# coding: utf-8

import pandas as pd
import geopandas as gpd


def to_hmtk_catalogue(cdf: pd.DataFrame):
def to_hmtk_catalogue(cdf: pd.DataFrame, polygon=None):
"""
Converts a catalogue obtained from the homogenisation into a format
compatible with the oq-hmtk.
:param cdf:
An instance of :class:`pd.DataFrame`
:param polygon:
Path to a polygon shapefile used to clip the catalogue extent
:returns:
An instance of :class:`pd.DataFrame`
"""

# if there is a polygon, clip the catalogue to it
if polygon:

# convert df to gdf
cgdf = pd.DataFrame(cdf)
tmp = gpd.points_from_xy(cgdf.longitude.values, cgdf.latitude.values)
cgdf = gpd.GeoDataFrame(cgdf, geometry=tmp, crs="EPSG:4326")

# Reading shapefile and dissolving polygons into a single one
boundaries = gpd.read_file(polygon)
boundaries['dummy'] = 'dummy'
geom = boundaries.dissolve(by='dummy').geometry[0]

# clip the catalogue
tmpgeo = {'geometry': [geom]}
gdf = gpd.GeoDataFrame(tmpgeo, crs="EPSG:4326")
cdf = gpd.sjoin(cgdf, gdf, how="inner", op='intersects')

# Select columns
# Check if catalogue contains strike/dip/rake and retain if it does
if 'str1' in cdf.columns:
col_list = ['eventID', 'Agency', 'year', 'month', 'day', 'longitude',
'latitude', 'depth', 'magMw', 'str1', 'dip1', 'rake1', 'str2', 'dip2', 'rake2']
col_list = ['eventID', 'Agency', 'year', 'month', 'day', 'hour', 'minute', 'second', 'longitude',
'latitude', 'depth', 'magMw', 'sigma', 'str1', 'dip1', 'rake1', 'str2', 'dip2', 'rake2']
else:
col_list = ['eventID', 'Agency', 'year', 'month', 'day', 'longitude',
'latitude', 'depth', 'magMw']
col_list = ['eventID', 'Agency', 'year', 'month', 'day', 'hour', 'minute', 'second', 'longitude',
'latitude', 'depth', 'magMw', 'sigma']

cdf = cdf[col_list]

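Hypothetical usage of the new clipping option in `to_hmtk_catalogue` (both file names below are placeholders; omitting `polygon` leaves the previous behaviour unchanged):

```python
import pandas as pd
from openquake.cat.hmg.utils import to_hmtk_catalogue

cdf = pd.read_hdf("homogenised_catalogue.h5")             # placeholder catalogue
hmtk_df = to_hmtk_catalogue(cdf, polygon="boundary.shp")  # placeholder shapefile
```

Dissolving the boundary layer into a single geometry before the spatial join avoids duplicated rows when the shapefile contains several overlapping polygons. Note that `op=` is the pre-0.10 geopandas keyword for `sjoin`; later releases rename it to `predicate=`.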
Binary file not shown.
Binary file not shown.
Binary file not shown.
78 changes: 78 additions & 0 deletions openquake/cat/tests/hmg_test.py
@@ -0,0 +1,78 @@
# ------------------- The OpenQuake Model Building Toolkit --------------------
# Copyright (C) 2022 GEM Foundation
# _______ _______ __ __ _______ _______ ___ _
# | || | | |_| || _ || || | | |
# | _ || _ | ____ | || |_| ||_ _|| |_| |
# | | | || | | ||____|| || | | | | _|
# | |_| || |_| | | || _ | | | | |_
# | || | | ||_|| || |_| | | | | _ |
# |_______||____||_| |_| |_||_______| |___| |___| |_|
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------
# vim: tabstop=4 shiftwidth=4 softtabstop=4
# coding: utf-8

import os
import unittest
import tempfile
import numpy as np
import pandas as pd
import toml

from openquake.cat.hmg.hmg import process_dfs, process_magnitude

aeq = np.testing.assert_equal
aaeq = np.testing.assert_almost_equal


BASE_PATH = os.path.dirname(__file__)

SETTINGS_HOMOG = """
[magnitude.NEIC.mb]
# from Weatherill
low_mags = [5.0, 6.0]
conv_eqs = ["0.8 * m + 0.2", "m"]
sigma = [0.283]
"""



class HomogeniseNEICmbTestCase(unittest.TestCase):

def setUp(self):

self.data_path = os.path.join(BASE_PATH, 'data', 'test_hmg')

def test_case01(self):
"""
Tests that magnitudes are converted correctly for an example
with two "low_mags" values, neither of which is 0.
"""
td = toml.loads(SETTINGS_HOMOG)

# Reading otab and mtab
fname_otab = os.path.join(self.data_path, "test_hmg_otab.h5")
odf = pd.read_hdf(fname_otab)
fname_mtab = os.path.join(self.data_path, "test_hmg_mtab_clip.h5")
mdf = pd.read_hdf(fname_mtab)
work = pd.merge(odf, mdf, on=["eventID", "originID"])
save = pd.DataFrame(columns=work.columns)

rules = toml.loads(SETTINGS_HOMOG)
save, work = process_magnitude(work, rules['magnitude'])

results = save.magMw.values
expected = [4.36, 4.76, 6.8]
aaeq(results, expected, decimal=6)
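The expected values can be hand-checked against the rule in `SETTINGS_HOMOG`: magnitudes in [5.0, 6.0) are converted with `0.8 * m + 0.2`, while those at or above 6.0 pass through. The actual inputs live in the binary .h5 fixtures, so the mb values below are inferred by inverting the conversion:

```python
0.8 * 5.2 + 0.2  # = 4.36
0.8 * 5.7 + 0.2  # = 4.76
# 6.8 >= 6.0, so the identity rule "m" applies and the value stays 6.8
```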
4 changes: 4 additions & 0 deletions openquake/mbi/cat/create_csv.py
@@ -60,6 +60,8 @@ def main(cat_fname, fname_out):
create_folder(os.path.dirname(fname_out))

# Save file
# writing magnitudes more precisely
df.magMw = df.magMw.apply(lambda x: round(x, 5))
df.to_csv(fname_out, index=False)

# Create hmtk file
@@ -68,6 +70,8 @@
tmps = ofle.split('.')
ofle = f'{tmps[0]}_hmtk.csv'
odf = to_hmtk_catalogue(df)
# writing magnitudes more precisely
odf.magnitude = odf.magnitude.apply(lambda x: round(x, 2))
odf.to_csv(os.path.join(odir, ofle), index=False)


