refactor open_dataset_extra (#946)
* made open_dataset_extra() open only a single file and simplified the function significantly

* updated all code that calls the function 

* improved tests

* made open_dataset_extra() private
veenstrajelmer authored Aug 12, 2024
1 parent 26545f5 commit 3c5d797
Showing 8 changed files with 214 additions and 283 deletions.
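In short, the refactor narrows the call contract: one file pattern and one quantity per call, and the function is no longer public (it is dropped from `__all__`). A minimal before/after sketch, with illustrative file names and dates:

```python
import pandas as pd
# no longer exposed as dfmt.open_dataset_extra, hence the module import
from dfm_tools.interpolate_grid2bnd import open_dataset_extra

tstart, tstop = pd.Timestamp("2022-01-01"), pd.Timestamp("2022-02-01")

# before #946: a single call could expand one pattern for multiple quantities,
# e.g. quantity='uxuyadvectionvelocitybnd' was expanded internally to ['ux','uy']
# after #946: one pattern, one quantity per call; callers loop over components themselves
ds = open_dataset_extra("cmems_so_*.nc", quantity="salinitybnd", tstart=tstart, tstop=tstop)
```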
57 changes: 12 additions & 45 deletions dfm_tools/interpolate_grid2bnd.py
@@ -24,7 +24,6 @@
__all__ = ["get_conversion_dict",
"interpolate_tide_to_bc",
"interpolate_tide_to_plipoints",
"open_dataset_extra",
"interp_regularnc_to_plipointsDataset",
"interp_uds_to_plipoints",
"interp_hisnc_to_plipoints",
@@ -343,44 +342,24 @@ def ds_apply_conventions(data_xr):
return data_xr


-def ds_apply_conversion_dict(data_xr, conversion_dict, quantity_list):
+def ds_apply_conversion_dict(data_xr, conversion_dict, quantity):
# rename variables from conversion_dict
for k,v in conversion_dict.items():
ncvarn = v['ncvarname']
-        if ncvarn in data_xr.variables.mapping.keys() and k in quantity_list: #k in quantity_list so phyc is not always renamed to tracerbndPON1 (first in conversion_dict)
+        if ncvarn in data_xr.variables.mapping.keys() and k == quantity: #k == quantity so phyc is not always renamed to tracerbndPON1 (first in conversion_dict)
data_xr = data_xr.rename({ncvarn:k})
print(f'variable {ncvarn} renamed to {k}')

    # optional conversion of units. Multiplications or other simple operators do not affect performance (dask.array(getitem) becomes dask.array(mul)). With a more complex operation it is better to do it on the interpolated array.
-    for quan in quantity_list: #TODO: maybe do unit conversion before interp or is that computationally heavy?
-        if 'conversion' in conversion_dict[quan].keys(): #if conversion is present, unit key must also be in conversion_dict
-            print(f'> converting units from [{data_xr[quan].attrs["units"]}] to [{conversion_dict[quan]["unit"]}]')
-            #print(f'attrs are discarded:\n{data_xr_vars[quan].attrs}')
-            data_xr[quan] = data_xr[quan] * conversion_dict[quan]['conversion'] #conversion drops all attributes of which units (which are changed anyway)
-            data_xr[quan].attrs['units'] = conversion_dict[quan]['unit'] #add unit attribute with resulting unit
+    # TODO: maybe do unit conversion before interp or is that computationally heavy?
+    if 'conversion' in conversion_dict[quantity].keys(): #if conversion is present, unit key must also be in conversion_dict
+        print(f'> converting units from [{data_xr[quantity].attrs["units"]}] to [{conversion_dict[quantity]["unit"]}]')
+        #print(f'attrs are discarded:\n{data_xr_vars[quan].attrs}')
+        data_xr[quantity] = data_xr[quantity] * conversion_dict[quantity]['conversion'] #conversion drops all attributes, including units (which are changed anyway)
+        data_xr[quantity].attrs['units'] = conversion_dict[quantity]['unit'] #add unit attribute with resulting unit
return data_xr


-def get_quantity_list(quantity):
-    if quantity=='uxuyadvectionvelocitybnd': #T3Dvector
-        quantity_list = ['ux','uy']
-    elif isinstance(quantity, list):
-        quantity_list = quantity
-    else:
-        quantity_list = [quantity]
-    return quantity_list
-
-
-def get_ncvarname_list(quantity_list, conversion_dict):
-    # check existence of requested keys in conversion_dict
-    for quan in quantity_list:
-        if quan not in conversion_dict.keys():
-            raise KeyError(f"quantity '{quan}' not in conversion_dict, (case sensitive) options are: {str(list(conversion_dict.keys()))}")
-
-    ncvarname_list = [conversion_dict[quan]['ncvarname'] for quan in quantity_list]
-    return ncvarname_list


def open_dataset_extra(dir_pattern, quantity, tstart, tstop, conversion_dict=None, refdate_str=None, chunks=None):
"""
empty docstring
@@ -389,28 +368,16 @@ def open_dataset_extra(dir_pattern, quantity, tstart, tstop, conversion_dict=Non
if conversion_dict is None:
conversion_dict = get_conversion_dict()

-    quantity_list = get_quantity_list(quantity=quantity)
-    ncvarname_list = get_ncvarname_list(quantity_list=quantity_list, conversion_dict=conversion_dict)
-
-    dir_pattern = [Path(str(dir_pattern).format(ncvarname=ncvarname)) for ncvarname in ncvarname_list]
-    file_list_nc = []
-    for dir_pattern_one in dir_pattern:
-        file_list_nc = file_list_nc + glob.glob(str(dir_pattern_one))
-    list_pattern_names = [x.name for x in dir_pattern]
-    print(f'loading mfdataset of {len(file_list_nc)} files with pattern(s) {list_pattern_names}')
+    file_list_nc = glob.glob(str(dir_pattern))
+    print(f'loading mfdataset of {len(file_list_nc)} files with pattern(s) {dir_pattern}')

data_xr = xr.open_mfdataset(file_list_nc, chunks=chunks, join="exact") #TODO: does chunks argument solve "PerformanceWarning: Slicing is producing a large chunk."? {'time':1} is not a convenient chunking to use for timeseries extraction

data_xr = ds_apply_conventions(data_xr=data_xr)
-    data_xr = ds_apply_conversion_dict(data_xr=data_xr, conversion_dict=conversion_dict, quantity_list=quantity_list)
+    data_xr = ds_apply_conversion_dict(data_xr=data_xr, conversion_dict=conversion_dict, quantity=quantity)

# retrieve var(s) (after potential longitude conversion)
data_vars = list(data_xr.data_vars)
-    bool_quanavailable = pd.Series(quantity_list).isin(data_vars)
-    if not bool_quanavailable.all():
-        quantity_list_notavailable = pd.Series(quantity_list).loc[~bool_quanavailable].tolist()
-        raise KeyError(f'quantity {quantity_list_notavailable} not found in netcdf, available are: {data_vars}. Try updating conversion_dict to rename these variables.')
-    data_xr_vars = data_xr[quantity_list]
+    data_xr_vars = data_xr[[quantity]]

# slice time
check_time_extent(data_xr, tstart, tstop)
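For reference, `ds_apply_conversion_dict()` above expects entries shaped like the ones below. A toy sketch with illustrative values; treat the dict contents as an assumption about typical entries, not the authoritative `get_conversion_dict()` output:

```python
import numpy as np
import xarray as xr
from dfm_tools.interpolate_grid2bnd import ds_apply_conversion_dict

# minimal conversion_dict: 'ncvarname' maps the source variable name,
# optional 'conversion'/'unit' rescale the data and set the resulting units attribute
conversion_dict = {
    "salinitybnd": {"ncvarname": "so"},  # rename only
    "tracerbndOXY": {"ncvarname": "o2", "conversion": 0.032, "unit": "g/m3"},  # illustrative factor
}

ds = xr.Dataset({"o2": ("time", np.array([250.0, 260.0]))})
ds["o2"].attrs["units"] = "mmol/m3"

# renames 'o2' to 'tracerbndOXY', multiplies by 0.032 and sets units to 'g/m3'
ds = ds_apply_conversion_dict(data_xr=ds, conversion_dict=conversion_dict, quantity="tracerbndOXY")
```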
73 changes: 58 additions & 15 deletions dfm_tools/modelbuilder.py
@@ -7,8 +7,10 @@
from hydrolib.core.dimr.models import DIMR, FMComponent, Start
from hydrolib.core.utils import get_path_style_for_current_operating_system
from dfm_tools.hydrolib_helpers import get_ncbnd_construct
-from dfm_tools.interpolate_grid2bnd import ext_add_boundary_object_per_polyline
+from dfm_tools.interpolate_grid2bnd import (ext_add_boundary_object_per_polyline,
+                                            open_dataset_extra,
+                                            )

__all__ = [
"cmems_nc_to_bc",
"cmems_nc_to_ini",
@@ -21,24 +23,54 @@
logger = logging.getLogger(__name__)


+def get_quantity_list(quantity):
+    if quantity=='uxuyadvectionvelocitybnd': #T3Dvector
+        quantity_list = ['ux','uy']
+    elif isinstance(quantity, list):
+        quantity_list = quantity
+    else:
+        quantity_list = [quantity]
+    return quantity_list
+
+
+def get_ncvarname(quantity, conversion_dict):
+    # check existence of requested keys in conversion_dict
+    if quantity not in conversion_dict.keys():
+        raise KeyError(f"quantity '{quantity}' not in conversion_dict, (case sensitive) options are"
+                       f": {str(list(conversion_dict.keys()))}")
+
+    ncvarname = conversion_dict[quantity]['ncvarname']
+    return ncvarname


def cmems_nc_to_bc(ext_bnd, list_quantities, tstart, tstop, file_pli, dir_pattern, dir_output, conversion_dict=None, refdate_str=None):
#input examples in https://github.com/Deltares/dfm_tools/blob/main/tests/examples/preprocess_interpolate_nc_to_bc.py
# TODO: rename ext_bnd to ext_new for consistency
if conversion_dict is None:
conversion_dict = dfmt.get_conversion_dict()

-    for quantity in list_quantities:
+    for quantity in list_quantities: # loop over salinitybnd/uxuyadvectionvelocitybnd/etc
print(f'processing quantity: {quantity}')

# times in cmems API are at midnight, so round to nearest outer midnight datetime
tstart = pd.Timestamp(tstart).floor('1d')
tstop = pd.Timestamp(tstop).ceil('1d')

-        #open regulargridDataset and do some basic stuff (time selection, renaming depth/lat/lon/varname, converting units, etc)
-        data_xr_vars = dfmt.open_dataset_extra(dir_pattern=dir_pattern, quantity=quantity,
-                                               tstart=tstart, tstop=tstop,
-                                               conversion_dict=conversion_dict,
-                                               refdate_str=refdate_str)
+        quantity_list = get_quantity_list(quantity=quantity)
+
+        for quantity_key in quantity_list: # loop over ux/uy
+            ncvarname = get_ncvarname(quantity=quantity_key, conversion_dict=conversion_dict)
+            dir_pattern_one = str(dir_pattern).format(ncvarname=ncvarname)
+            #open regulargridDataset and do some basic stuff (time selection, renaming depth/lat/lon/varname, converting units, etc)
+            data_xr_onevar = open_dataset_extra(dir_pattern=dir_pattern_one, quantity=quantity_key,
+                                                tstart=tstart, tstop=tstop,
+                                                conversion_dict=conversion_dict,
+                                                refdate_str=refdate_str)
+            if quantity_key == quantity_list[0]:
+                data_xr_vars = data_xr_onevar
+            else: # only relevant in case of ux/uy, others all have only one quantity
+                data_xr_vars[quantity_key] = data_xr_onevar[quantity_key]

# interpolate regulargridDataset to plipointsDataset
polyfile_obj = hcdfm.PolyFile(file_pli)
gdf_points = dfmt.PolyFile_to_geodataframe_points(polyfile_object=polyfile_obj)
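The net effect of the loop added above: vector quantities are now fetched component by component and merged into one dataset. A standalone sketch of the same flow, assuming a CMEMS-style `dir_pattern` with the `{ncvarname}` placeholder (pattern and dates are illustrative):

```python
import pandas as pd
import dfm_tools as dfmt
from dfm_tools.interpolate_grid2bnd import open_dataset_extra
from dfm_tools.modelbuilder import get_quantity_list, get_ncvarname

conversion_dict = dfmt.get_conversion_dict()
dir_pattern = "cmems_{ncvarname}_*.nc"  # illustrative pattern
tstart, tstop = pd.Timestamp("2022-01-01"), pd.Timestamp("2022-01-02")

quantity_list = get_quantity_list(quantity="uxuyadvectionvelocitybnd")  # -> ['ux', 'uy']
for quantity_key in quantity_list:
    # look up the source variable name for this component in the conversion dict
    ncvarname = get_ncvarname(quantity=quantity_key, conversion_dict=conversion_dict)
    dir_pattern_one = str(dir_pattern).format(ncvarname=ncvarname)
    data_xr_onevar = open_dataset_extra(dir_pattern=dir_pattern_one, quantity=quantity_key,
                                        tstart=tstart, tstop=tstop,
                                        conversion_dict=conversion_dict)
    if quantity_key == quantity_list[0]:
        data_xr_vars = data_xr_onevar  # first component starts the merged dataset
    else:
        data_xr_vars[quantity_key] = data_xr_onevar[quantity_key]  # add remaining components
```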
@@ -80,22 +112,33 @@ def cmems_nc_to_ini(ext_old, dir_output, list_quantities, tstart, dir_pattern, c
if quan_bnd in ["temperaturebnd","uxuyadvectionvelocitybnd"]:
# silently skipped, temperature is handled with salinity, uxuy not supported
continue
-        elif quan_bnd=="salinitybnd":
+
+        ncvarname = get_ncvarname(quantity=quan_bnd, conversion_dict=conversion_dict)
+        dir_pattern_one = dir_pattern.format(ncvarname=ncvarname)
+
+        if quan_bnd=="salinitybnd":
# 3D initialsalinity/initialtemperature fields are silently ignored
# initial 3D conditions are only possible via nudging 1st timestep via quantity=nudge_salinity_temperature
-            data_xr = dfmt.open_dataset_extra(dir_pattern=dir_pattern, quantity=["salinitybnd","temperaturebnd"],
-                                              tstart=tstart_round, tstop=tstop_round,
-                                              conversion_dict=conversion_dict)
+            data_xr = open_dataset_extra(dir_pattern=dir_pattern_one, quantity="salinitybnd",
+                                         tstart=tstart_round, tstop=tstop_round,
+                                         conversion_dict=conversion_dict)
+            ncvarname_tem = get_ncvarname(quantity="temperaturebnd", conversion_dict=conversion_dict)
+            dir_pattern_tem = dir_pattern.format(ncvarname=ncvarname_tem)
+            data_xr_tem = open_dataset_extra(dir_pattern=dir_pattern_tem, quantity="temperaturebnd",
+                                             tstart=tstart_round, tstop=tstop_round,
+                                             conversion_dict=conversion_dict)
+            data_xr["temperaturebnd"] = data_xr_tem["temperaturebnd"]
data_xr = data_xr.rename_vars({"salinitybnd":"so", "temperaturebnd":"thetao"})
quantity = "nudge_salinity_temperature"
varname = None
else:
-            data_xr = dfmt.open_dataset_extra(dir_pattern=dir_pattern, quantity=quan_bnd,
-                                              tstart=tstart_round, tstop=tstop_round,
-                                              conversion_dict=conversion_dict)
+            data_xr = open_dataset_extra(dir_pattern=dir_pattern_one, quantity=quan_bnd,
+                                         tstart=tstart_round, tstop=tstop_round,
+                                         conversion_dict=conversion_dict)
quantity = f'initial{quan_bnd.replace("bnd","")}'
varname = quantity
data_xr = data_xr.rename_vars({quan_bnd:quantity})


# open_dataset_extra converted depths from positive down to positive up, including update of the "positive" attribute
# TODO: this correctly updated attr negatively impacts model results when using netcdf inifields, so we revert it here
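For the salinity branch above, the end state is one dataset holding both variables under their CMEMS names, ready for the nudge_salinity_temperature quantity. A condensed sketch of just the merge-and-rename step (toy arrays, not real CMEMS data):

```python
import numpy as np
import xarray as xr

# stand-ins for the two single-quantity datasets returned by open_dataset_extra()
data_xr = xr.Dataset({"salinitybnd": ("time", np.array([34.5, 34.6]))})
data_xr_tem = xr.Dataset({"temperaturebnd": ("time", np.array([10.1, 10.3]))})

# merge the temperature variable into the salinity dataset, then rename both
# to the CMEMS variable names expected for nudge_salinity_temperature
data_xr["temperaturebnd"] = data_xr_tem["temperaturebnd"]
data_xr = data_xr.rename_vars({"salinitybnd": "so", "temperaturebnd": "thetao"})
```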
8 changes: 5 additions & 3 deletions dfm_tools/xarray_helpers.py
@@ -185,16 +185,18 @@ def merge_meteofiles(file_nc:str, preprocess=None,
**kwargs)
print(f'{(dt.datetime.now()-dtstart).total_seconds():.2f} sec')

-    #rename variables
-    if 'longitude' not in data_xr.variables: #TODO: make generic, comparable rename in rename_dims_dict in dfmt.open_dataset_extra()
+    # rename variables
+    # TODO: make generic, comparable rename in rename_dims_dict in dfmt.interpolate_grid2bnd.open_dataset_extra()
+    if 'longitude' not in data_xr.variables:
if 'lon' in data_xr.variables:
data_xr = data_xr.rename({'lon':'longitude', 'lat':'latitude'})
elif 'x' in data_xr.variables:
data_xr = data_xr.rename({'x':'longitude', 'y':'latitude'})
else:
raise KeyError('no longitude/latitude, lon/lat or x/y variables found in dataset')

-    #select time and do checks #TODO: check if calendar is standard/gregorian
+    # select time and do checks
+    # TODO: check if calendar is standard/gregorian
data_xr = data_xr.sel(time=time_slice)
if data_xr.get_index('time').duplicated().any():
print('dropping duplicate timesteps')
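The rename-and-deduplicate logic in `merge_meteofiles()` is plain xarray; a self-contained toy example of the same pattern (dataset contents are made up):

```python
import numpy as np
import pandas as pd
import xarray as xr

# toy dataset with short coordinate names and a duplicate timestep
times = pd.to_datetime(["2022-01-01", "2022-01-02", "2022-01-02"])
ds = xr.Dataset({"msl": (("time", "lat", "lon"), np.zeros((3, 2, 2)))},
                coords={"time": times, "lat": [52.0, 52.5], "lon": [4.0, 4.5]})

# rename variables (same fallback order as merge_meteofiles)
if "longitude" not in ds.variables:
    if "lon" in ds.variables:
        ds = ds.rename({"lon": "longitude", "lat": "latitude"})
    elif "x" in ds.variables:
        ds = ds.rename({"x": "longitude", "y": "latitude"})
    else:
        raise KeyError("no longitude/latitude, lon/lat or x/y variables found in dataset")

# select time and drop duplicate timesteps
ds = ds.sel(time=slice("2022-01-01", "2022-01-02"))
if ds.get_index("time").duplicated().any():
    print("dropping duplicate timesteps")
    ds = ds.isel(time=~ds.get_index("time").duplicated())
```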
270 changes: 90 additions & 180 deletions docs/notebooks/modelbuilder_example.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/whats-new.md
@@ -7,6 +7,7 @@
- added station_id variable to dataset returned by `dfmt.interp_uds_to_plipoints()` in [#914](https://github.com/Deltares/dfm_tools/pull/914)
- update private functions under `dfmt.download_ERA5()` to CDS-Beta (requires ECMWF apikey instead) in [#925](https://github.com/Deltares/dfm_tools/pull/925)
- simplified prevention of int dtypes in `dfmt.preprocess_ERA5()` in [#943](https://github.com/Deltares/dfm_tools/pull/943)
+- simplified `dfmt.open_dataset_extra()` by dropping multi-quantity support in [#946](https://github.com/Deltares/dfm_tools/pull/946)

### Fix
- also apply convert_360to180 to longitude variable in `dfmt.open_dataset_curvilinear()` in [#913](https://github.com/Deltares/dfm_tools/pull/913)
6 changes: 5 additions & 1 deletion tests/examples/preprocess_interpolate_nc_to_bc.py
@@ -86,8 +86,12 @@
continue
dir_pattern = dir_pattern_waq

+    # TODO: open_dataset_extra only supports non-vector quantities after https://github.com/Deltares/dfm_tools/pull/946
+    # this means uxuyadvectionvelocitybnd has to be passed via ux/uy separately
     #open regulargridDataset and do some basic stuff (time selection, renaming depth/lat/lon/varname, converting units, etc)
-    data_xr_vars = dfmt.open_dataset_extra(dir_pattern=dir_pattern, quantity=quantity, #TODO: maybe replace renaming part with package CMCC/Lisa?
+    # open_dataset_extra() function is also not public anymore
+    from dfm_tools.interpolate_grid2bnd import open_dataset_extra
+    data_xr_vars = open_dataset_extra(dir_pattern=dir_pattern, quantity=quantity, #TODO: maybe replace renaming part with package CMCC/Lisa?
tstart=tstart, tstop=tstop,
conversion_dict=conversion_dict,
refdate_str=refdate_str)