refactor open_dataset_extra (#946)
* made open_dataset_extra() open only a single file and simplified the function significantly

* updated all code that calls the function 

* improved tests

* made open_dataset_extra() private
veenstrajelmer authored Aug 12, 2024
1 parent 26545f5 commit 3c5d797
Showing 8 changed files with 214 additions and 283 deletions.
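In short, the refactor narrows the call contract: one file pattern and one quantity per call, and the function is no longer public (it is dropped from `__all__`). A minimal before/after sketch, with illustrative file names and dates:

```python
import pandas as pd
# no longer exposed as dfmt.open_dataset_extra, hence the module import
from dfm_tools.interpolate_grid2bnd import open_dataset_extra

tstart, tstop = pd.Timestamp("2022-01-01"), pd.Timestamp("2022-02-01")

# before #946: a single call could expand one pattern for multiple quantities,
# e.g. quantity='uxuyadvectionvelocitybnd' was expanded internally to ['ux','uy']
# after #946: one pattern, one quantity per call; callers loop over components themselves
ds = open_dataset_extra("cmems_so_*.nc", quantity="salinitybnd", tstart=tstart, tstop=tstop)
```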
57 changes: 12 additions & 45 deletions dfm_tools/interpolate_grid2bnd.py
@@ -24,7 +24,6 @@
__all__ = ["get_conversion_dict",
"interpolate_tide_to_bc",
"interpolate_tide_to_plipoints",
"open_dataset_extra",
"interp_regularnc_to_plipointsDataset",
"interp_uds_to_plipoints",
"interp_hisnc_to_plipoints",
@@ -343,44 +342,24 @@ def ds_apply_conventions(data_xr):
return data_xr


-def ds_apply_conversion_dict(data_xr, conversion_dict, quantity_list):
+def ds_apply_conversion_dict(data_xr, conversion_dict, quantity):
# rename variables from conversion_dict
for k,v in conversion_dict.items():
ncvarn = v['ncvarname']
-        if ncvarn in data_xr.variables.mapping.keys() and k in quantity_list: #k in quantity_list so phyc is not always renamed to tracerbndPON1 (first in conversion_dict)
+        if ncvarn in data_xr.variables.mapping.keys() and k == quantity: #k == quantity so phyc is not always renamed to tracerbndPON1 (first in conversion_dict)
data_xr = data_xr.rename({ncvarn:k})
print(f'variable {ncvarn} renamed to {k}')

    # optional conversion of units. Multiplications or other simple operators do not affect performance (dask.array(getitem) becomes dask.array(mul)). With a more complex operation it is better to do it on the interpolated array.
-    for quan in quantity_list: #TODO: maybe do unit conversion before interp or is that computationally heavy?
-        if 'conversion' in conversion_dict[quan].keys(): #if conversion is present, unit key must also be in conversion_dict
-            print(f'> converting units from [{data_xr[quan].attrs["units"]}] to [{conversion_dict[quan]["unit"]}]')
-            #print(f'attrs are discarded:\n{data_xr_vars[quan].attrs}')
-            data_xr[quan] = data_xr[quan] * conversion_dict[quan]['conversion'] #conversion drops all attributes of which units (which are changed anyway)
-            data_xr[quan].attrs['units'] = conversion_dict[quan]['unit'] #add unit attribute with resulting unit
+    # TODO: maybe do unit conversion before interp or is that computationally heavy?
+    if 'conversion' in conversion_dict[quantity].keys(): #if conversion is present, unit key must also be in conversion_dict
+        print(f'> converting units from [{data_xr[quantity].attrs["units"]}] to [{conversion_dict[quantity]["unit"]}]')
+        #print(f'attrs are discarded:\n{data_xr_vars[quan].attrs}')
+        data_xr[quantity] = data_xr[quantity] * conversion_dict[quantity]['conversion'] #conversion drops all attributes, including units (which are changed anyway)
+        data_xr[quantity].attrs['units'] = conversion_dict[quantity]['unit'] #add unit attribute with resulting unit
return data_xr


-def get_quantity_list(quantity):
-    if quantity=='uxuyadvectionvelocitybnd': #T3Dvector
-        quantity_list = ['ux','uy']
-    elif isinstance(quantity, list):
-        quantity_list = quantity
-    else:
-        quantity_list = [quantity]
-    return quantity_list
-
-
-def get_ncvarname_list(quantity_list, conversion_dict):
-    # check existence of requested keys in conversion_dict
-    for quan in quantity_list:
-        if quan not in conversion_dict.keys():
-            raise KeyError(f"quantity '{quan}' not in conversion_dict, (case sensitive) options are: {str(list(conversion_dict.keys()))}")
-
-    ncvarname_list = [conversion_dict[quan]['ncvarname'] for quan in quantity_list]
-    return ncvarname_list


def open_dataset_extra(dir_pattern, quantity, tstart, tstop, conversion_dict=None, refdate_str=None, chunks=None):
"""
empty docstring
@@ -389,28 +368,16 @@ def open_dataset_extra(dir_pattern, quantity, tstart, tstop, conversion_dict=Non
if conversion_dict is None:
conversion_dict = get_conversion_dict()

-    quantity_list = get_quantity_list(quantity=quantity)
-    ncvarname_list = get_ncvarname_list(quantity_list=quantity_list, conversion_dict=conversion_dict)
-
-    dir_pattern = [Path(str(dir_pattern).format(ncvarname=ncvarname)) for ncvarname in ncvarname_list]
-    file_list_nc = []
-    for dir_pattern_one in dir_pattern:
-        file_list_nc = file_list_nc + glob.glob(str(dir_pattern_one))
-    list_pattern_names = [x.name for x in dir_pattern]
-    print(f'loading mfdataset of {len(file_list_nc)} files with pattern(s) {list_pattern_names}')
+    file_list_nc = glob.glob(str(dir_pattern))
+    print(f'loading mfdataset of {len(file_list_nc)} files with pattern(s) {dir_pattern}')

data_xr = xr.open_mfdataset(file_list_nc, chunks=chunks, join="exact") #TODO: does chunks argument solve "PerformanceWarning: Slicing is producing a large chunk."? {'time':1} is not a convenient chunking to use for timeseries extraction

data_xr = ds_apply_conventions(data_xr=data_xr)
-    data_xr = ds_apply_conversion_dict(data_xr=data_xr, conversion_dict=conversion_dict, quantity_list=quantity_list)
+    data_xr = ds_apply_conversion_dict(data_xr=data_xr, conversion_dict=conversion_dict, quantity=quantity)

# retrieve var(s) (after potential longitude conversion)
data_vars = list(data_xr.data_vars)
-    bool_quanavailable = pd.Series(quantity_list).isin(data_vars)
-    if not bool_quanavailable.all():
-        quantity_list_notavailable = pd.Series(quantity_list).loc[~bool_quanavailable].tolist()
-        raise KeyError(f'quantity {quantity_list_notavailable} not found in netcdf, available are: {data_vars}. Try updating conversion_dict to rename these variables.')
-    data_xr_vars = data_xr[quantity_list]
+    data_xr_vars = data_xr[[quantity]]

# slice time
check_time_extent(data_xr, tstart, tstop)
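For reference, `ds_apply_conversion_dict()` above expects entries shaped like the ones below. A toy sketch with illustrative values; treat the dict contents as an assumption about typical entries, not the authoritative `get_conversion_dict()` output:

```python
import numpy as np
import xarray as xr
from dfm_tools.interpolate_grid2bnd import ds_apply_conversion_dict

# minimal conversion_dict: 'ncvarname' maps the source variable name,
# optional 'conversion'/'unit' rescale the data and set the resulting units attribute
conversion_dict = {
    "salinitybnd": {"ncvarname": "so"},  # rename only
    "tracerbndOXY": {"ncvarname": "o2", "conversion": 0.032, "unit": "g/m3"},  # illustrative factor
}

ds = xr.Dataset({"o2": ("time", np.array([250.0, 260.0]))})
ds["o2"].attrs["units"] = "mmol/m3"

# renames 'o2' to 'tracerbndOXY', multiplies by 0.032 and sets units to 'g/m3'
ds = ds_apply_conversion_dict(data_xr=ds, conversion_dict=conversion_dict, quantity="tracerbndOXY")
```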
73 changes: 58 additions & 15 deletions dfm_tools/modelbuilder.py
@@ -7,8 +7,10 @@
from hydrolib.core.dimr.models import DIMR, FMComponent, Start
from hydrolib.core.utils import get_path_style_for_current_operating_system
from dfm_tools.hydrolib_helpers import get_ncbnd_construct
-from dfm_tools.interpolate_grid2bnd import ext_add_boundary_object_per_polyline
+from dfm_tools.interpolate_grid2bnd import (ext_add_boundary_object_per_polyline,
+                                            open_dataset_extra,
+                                            )

__all__ = [
"cmems_nc_to_bc",
"cmems_nc_to_ini",
@@ -21,24 +23,54 @@
logger = logging.getLogger(__name__)


+def get_quantity_list(quantity):
+    if quantity=='uxuyadvectionvelocitybnd': #T3Dvector
+        quantity_list = ['ux','uy']
+    elif isinstance(quantity, list):
+        quantity_list = quantity
+    else:
+        quantity_list = [quantity]
+    return quantity_list
+
+
+def get_ncvarname(quantity, conversion_dict):
+    # check existence of requested keys in conversion_dict
+    if quantity not in conversion_dict.keys():
+        raise KeyError(f"quantity '{quantity}' not in conversion_dict, (case sensitive) options are"
+                       f": {str(list(conversion_dict.keys()))}")
+
+    ncvarname = conversion_dict[quantity]['ncvarname']
+    return ncvarname


def cmems_nc_to_bc(ext_bnd, list_quantities, tstart, tstop, file_pli, dir_pattern, dir_output, conversion_dict=None, refdate_str=None):
#input examples in https://github.com/Deltares/dfm_tools/blob/main/tests/examples/preprocess_interpolate_nc_to_bc.py
# TODO: rename ext_bnd to ext_new for consistency
if conversion_dict is None:
conversion_dict = dfmt.get_conversion_dict()

-    for quantity in list_quantities:
+    for quantity in list_quantities: # loop over salinitybnd/uxuyadvectionvelocitybnd/etc
print(f'processing quantity: {quantity}')

# times in cmems API are at midnight, so round to nearest outer midnight datetime
tstart = pd.Timestamp(tstart).floor('1d')
tstop = pd.Timestamp(tstop).ceil('1d')

-        #open regulargridDataset and do some basic stuff (time selection, renaming depth/lat/lon/varname, converting units, etc)
-        data_xr_vars = dfmt.open_dataset_extra(dir_pattern=dir_pattern, quantity=quantity,
-                                               tstart=tstart, tstop=tstop,
-                                               conversion_dict=conversion_dict,
-                                               refdate_str=refdate_str)
+        quantity_list = get_quantity_list(quantity=quantity)
+
+        for quantity_key in quantity_list: # loop over ux/uy
+            ncvarname = get_ncvarname(quantity=quantity_key, conversion_dict=conversion_dict)
+            dir_pattern_one = str(dir_pattern).format(ncvarname=ncvarname)
+            #open regulargridDataset and do some basic stuff (time selection, renaming depth/lat/lon/varname, converting units, etc)
+            data_xr_onevar = open_dataset_extra(dir_pattern=dir_pattern_one, quantity=quantity_key,
+                                                tstart=tstart, tstop=tstop,
+                                                conversion_dict=conversion_dict,
+                                                refdate_str=refdate_str)
+            if quantity_key == quantity_list[0]:
+                data_xr_vars = data_xr_onevar
+            else: # only relevant in case of ux/uy, others all have only one quantity
+                data_xr_vars[quantity_key] = data_xr_onevar[quantity_key]

# interpolate regulargridDataset to plipointsDataset
polyfile_obj = hcdfm.PolyFile(file_pli)
gdf_points = dfmt.PolyFile_to_geodataframe_points(polyfile_object=polyfile_obj)
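The net effect of the loop added above: vector quantities are now fetched component by component and merged into one dataset. A standalone sketch of the same flow, assuming a CMEMS-style `dir_pattern` with the `{ncvarname}` placeholder (pattern and dates are illustrative):

```python
import pandas as pd
import dfm_tools as dfmt
from dfm_tools.interpolate_grid2bnd import open_dataset_extra
from dfm_tools.modelbuilder import get_quantity_list, get_ncvarname

conversion_dict = dfmt.get_conversion_dict()
dir_pattern = "cmems_{ncvarname}_*.nc"  # illustrative pattern
tstart, tstop = pd.Timestamp("2022-01-01"), pd.Timestamp("2022-01-02")

quantity_list = get_quantity_list(quantity="uxuyadvectionvelocitybnd")  # -> ['ux', 'uy']
for quantity_key in quantity_list:
    # look up the source variable name for this component in the conversion dict
    ncvarname = get_ncvarname(quantity=quantity_key, conversion_dict=conversion_dict)
    dir_pattern_one = str(dir_pattern).format(ncvarname=ncvarname)
    data_xr_onevar = open_dataset_extra(dir_pattern=dir_pattern_one, quantity=quantity_key,
                                        tstart=tstart, tstop=tstop,
                                        conversion_dict=conversion_dict)
    if quantity_key == quantity_list[0]:
        data_xr_vars = data_xr_onevar  # first component starts the merged dataset
    else:
        data_xr_vars[quantity_key] = data_xr_onevar[quantity_key]  # add remaining components
```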
@@ -80,22 +112,33 @@ def cmems_nc_to_ini(ext_old, dir_output, list_quantities, tstart, dir_pattern, c
if quan_bnd in ["temperaturebnd","uxuyadvectionvelocitybnd"]:
# silently skipped, temperature is handled with salinity, uxuy not supported
continue
-        elif quan_bnd=="salinitybnd":
+
+        ncvarname = get_ncvarname(quantity=quan_bnd, conversion_dict=conversion_dict)
+        dir_pattern_one = dir_pattern.format(ncvarname=ncvarname)
+
+        if quan_bnd=="salinitybnd":
# 3D initialsalinity/initialtemperature fields are silently ignored
# initial 3D conditions are only possible via nudging 1st timestep via quantity=nudge_salinity_temperature
-            data_xr = dfmt.open_dataset_extra(dir_pattern=dir_pattern, quantity=["salinitybnd","temperaturebnd"],
-                                              tstart=tstart_round, tstop=tstop_round,
-                                              conversion_dict=conversion_dict)
+            data_xr = open_dataset_extra(dir_pattern=dir_pattern_one, quantity="salinitybnd",
+                                         tstart=tstart_round, tstop=tstop_round,
+                                         conversion_dict=conversion_dict)
+            ncvarname_tem = get_ncvarname(quantity="temperaturebnd", conversion_dict=conversion_dict)
+            dir_pattern_tem = dir_pattern.format(ncvarname=ncvarname_tem)
+            data_xr_tem = open_dataset_extra(dir_pattern=dir_pattern_tem, quantity="temperaturebnd",
+                                             tstart=tstart_round, tstop=tstop_round,
+                                             conversion_dict=conversion_dict)
+            data_xr["temperaturebnd"] = data_xr_tem["temperaturebnd"]
data_xr = data_xr.rename_vars({"salinitybnd":"so", "temperaturebnd":"thetao"})
quantity = "nudge_salinity_temperature"
varname = None
else:
-            data_xr = dfmt.open_dataset_extra(dir_pattern=dir_pattern, quantity=quan_bnd,
-                                              tstart=tstart_round, tstop=tstop_round,
-                                              conversion_dict=conversion_dict)
+            data_xr = open_dataset_extra(dir_pattern=dir_pattern_one, quantity=quan_bnd,
+                                         tstart=tstart_round, tstop=tstop_round,
+                                         conversion_dict=conversion_dict)
quantity = f'initial{quan_bnd.replace("bnd","")}'
varname = quantity
data_xr = data_xr.rename_vars({quan_bnd:quantity})


# open_dataset_extra converted depths from positive down to positive up, including update of the "positive" attribute
# TODO: this correctly updated attr negatively impacts model results when using netcdf inifields, so we revert it here
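For the salinity branch above, the end state is one dataset holding both variables under their CMEMS names, ready for the nudge_salinity_temperature quantity. A condensed sketch of just the merge-and-rename step (toy arrays, not real CMEMS data):

```python
import numpy as np
import xarray as xr

# stand-ins for the two single-quantity datasets returned by open_dataset_extra()
data_xr = xr.Dataset({"salinitybnd": ("time", np.array([34.5, 34.6]))})
data_xr_tem = xr.Dataset({"temperaturebnd": ("time", np.array([10.1, 10.3]))})

# merge the temperature variable into the salinity dataset, then rename both
# to the CMEMS variable names expected for nudge_salinity_temperature
data_xr["temperaturebnd"] = data_xr_tem["temperaturebnd"]
data_xr = data_xr.rename_vars({"salinitybnd": "so", "temperaturebnd": "thetao"})
```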
8 changes: 5 additions & 3 deletions dfm_tools/xarray_helpers.py
@@ -185,16 +185,18 @@ def merge_meteofiles(file_nc:str, preprocess=None,
**kwargs)
print(f'{(dt.datetime.now()-dtstart).total_seconds():.2f} sec')

-    #rename variables
-    if 'longitude' not in data_xr.variables: #TODO: make generic, comparable rename in rename_dims_dict in dfmt.open_dataset_extra()
+    # rename variables
+    # TODO: make generic, comparable rename in rename_dims_dict in dfmt.interpolate_grid2bnd.open_dataset_extra()
+    if 'longitude' not in data_xr.variables:
if 'lon' in data_xr.variables:
data_xr = data_xr.rename({'lon':'longitude', 'lat':'latitude'})
elif 'x' in data_xr.variables:
data_xr = data_xr.rename({'x':'longitude', 'y':'latitude'})
else:
raise KeyError('no longitude/latitude, lon/lat or x/y variables found in dataset')

-    #select time and do checks #TODO: check if calendar is standard/gregorian
+    # select time and do checks
+    # TODO: check if calendar is standard/gregorian
data_xr = data_xr.sel(time=time_slice)
if data_xr.get_index('time').duplicated().any():
print('dropping duplicate timesteps')
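The rename-and-deduplicate logic in `merge_meteofiles()` is plain xarray; a self-contained toy example of the same pattern (dataset contents are made up):

```python
import numpy as np
import pandas as pd
import xarray as xr

# toy dataset with short coordinate names and a duplicate timestep
times = pd.to_datetime(["2022-01-01", "2022-01-02", "2022-01-02"])
ds = xr.Dataset({"msl": (("time", "lat", "lon"), np.zeros((3, 2, 2)))},
                coords={"time": times, "lat": [52.0, 52.5], "lon": [4.0, 4.5]})

# rename variables (same fallback order as merge_meteofiles)
if "longitude" not in ds.variables:
    if "lon" in ds.variables:
        ds = ds.rename({"lon": "longitude", "lat": "latitude"})
    elif "x" in ds.variables:
        ds = ds.rename({"x": "longitude", "y": "latitude"})
    else:
        raise KeyError("no longitude/latitude, lon/lat or x/y variables found in dataset")

# select time and drop duplicate timesteps
ds = ds.sel(time=slice("2022-01-01", "2022-01-02"))
if ds.get_index("time").duplicated().any():
    print("dropping duplicate timesteps")
    ds = ds.isel(time=~ds.get_index("time").duplicated())
```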
270 changes: 90 additions & 180 deletions docs/notebooks/modelbuilder_example.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/whats-new.md
@@ -7,6 +7,7 @@
- added station_id variable to dataset returned by `dfmt.interp_uds_to_plipoints()` in [#914](https://github.com/Deltares/dfm_tools/pull/914)
- update private functions under `dfmt.download_ERA5()` to CDS-Beta (requires ECMWF apikey instead) in [#925](https://github.com/Deltares/dfm_tools/pull/925)
- simplified prevention of int dtypes in `dfmt.preprocess_ERA5()` in [#943](https://github.com/Deltares/dfm_tools/pull/943)
+- simplified `dfmt.open_dataset_extra()` by dropping multi-quantity support in [#946](https://github.com/Deltares/dfm_tools/pull/946)

### Fix
- also apply convert_360to180 to longitude variable in `dfmt.open_dataset_curvilinear()` in [#913](https://github.com/Deltares/dfm_tools/pull/913)
6 changes: 5 additions & 1 deletion tests/examples/preprocess_interpolate_nc_to_bc.py
@@ -86,8 +86,12 @@
continue
dir_pattern = dir_pattern_waq

+    # TODO: open_dataset_extra only supports non-vector quantities after https://github.com/Deltares/dfm_tools/pull/946
+    # this means uxuyadvectionvelocitybnd has to be passed via ux/uy separately
     #open regulargridDataset and do some basic stuff (time selection, renaming depth/lat/lon/varname, converting units, etc)
-    data_xr_vars = dfmt.open_dataset_extra(dir_pattern=dir_pattern, quantity=quantity, #TODO: maybe replace renaming part with package CMCC/Lisa?
+    # open_dataset_extra() function is also not public anymore
+    from dfm_tools.interpolate_grid2bnd import open_dataset_extra
+    data_xr_vars = open_dataset_extra(dir_pattern=dir_pattern, quantity=quantity, #TODO: maybe replace renaming part with package CMCC/Lisa?
tstart=tstart, tstop=tstop,
conversion_dict=conversion_dict,
refdate_str=refdate_str)