Merge branch 'main' into return-kwargs

Deltares · Jul 31, 2023 · 689890d · 689890d
2 parents 815a33e + f55aaee
commit 689890d
Show file tree

Hide file tree

Showing 28 changed files with 1,218 additions and 394 deletions.
diff --git a/docs/api.rst b/docs/api.rst
@@ -22,6 +22,8 @@ General
    :toctree: _generated
 
    data_catalog.DataCatalog
+   data_catalog.DataCatalog.get_source
+   data_catalog.DataCatalog.iter_sources
    data_catalog.DataCatalog.sources
    data_catalog.DataCatalog.keys
    data_catalog.DataCatalog.predefined_catalogs
@@ -36,12 +38,13 @@ Add data sources
 .. autosummary::
    :toctree: _generated
 
-   data_catalog.DataCatalog.set_predefined_catalogs
+   data_catalog.DataCatalog.add_source
+   data_catalog.DataCatalog.update
    data_catalog.DataCatalog.from_predefined_catalogs
    data_catalog.DataCatalog.from_archive
    data_catalog.DataCatalog.from_yml
    data_catalog.DataCatalog.from_dict
-   data_catalog.DataCatalog.update
+   data_catalog.DataCatalog.set_predefined_catalogs
 
 .. _api_data_catalog_get:
 
@@ -54,7 +57,7 @@ Get data
    data_catalog.DataCatalog.get_rasterdataset
    data_catalog.DataCatalog.get_geodataset
    data_catalog.DataCatalog.get_geodataframe
-
+   data_catalog.DataCatalog.get_dataframe
 
 
 RasterDataset

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -12,6 +12,7 @@ Unreleased
 Added
 -----
 - docs now include a dropdown for selecting older versions of the docs. (#457)
+- Support for loading the same data source but from different places (e.g. local & aws)
 
 Changed
 -------

diff --git a/docs/dev/dev_install.rst b/docs/dev/dev_install.rst
@@ -99,7 +99,7 @@ Finally, create a developer installation of HydroMT:
     see :ref:`installation guide <installation_guide>` for the difference between both.
 
 Fine tuned installation
-----------------------
+-----------------------
 
 If you want a more fine tuned installation you can also specify exactly
 which dependency groups you'd like. For instance, this will create an environment

diff --git a/docs/user_guide/data_prepare_cat.rst b/docs/user_guide/data_prepare_cat.rst
@@ -31,6 +31,7 @@ The ``rename``, ``nodata``, ``unit_add`` and ``unit_mult`` options are set per v
     meta:
       root: /path/to/data_root/
       version: version
+      name: data_catalog_name
     my_dataset:
       crs: EPSG/WKT
       data_type: RasterDataset/GeoDataset/GeoDataFrame/DataFrame
@@ -48,8 +49,6 @@ The ``rename``, ``nodata``, ``unit_add`` and ``unit_mult`` options are set per v
       nodata:
         new_variable_name: value
       path: /absolute_path/to/my_dataset.extension OR relative_path/to_my_dataset.extension
-      placeholders:
-        [placeholder_key: [placeholder_values]]
       rename:
         old_variable_name: new_variable_name
       unit_add:
@@ -91,6 +90,10 @@ A full list of **optional data source arguments** is given below
 - **filesystem** (required if different than local): specify if the data is stored locally or remotely (e.g. cloud). Supported filesystems are *local* for local data,
   *gcs* for data stored on Google Cloud Storage, and *aws* for data stored on Amazon Web Services. Profile or authentication information can be passed to ``driver_kwargs`` via
   *storage_options*.
+- **version** (recommended): data source version
+  *NOTE*: New in HydroMT v0.8.1
+- **provider** (recommended): data source provider
+  *NOTE*: New in HydroMT v0.8.1
 - **meta** (recommended): additional information on the dataset organized in a sub-list.
   Good meta data includes a *source_url*, *source_license*, *source_version*, *paper_ref*, *paper_doi*, *category*, etc. These are added to the data attributes.
   Usual categories within HydroMT are *geography*, *topography*, *hydrography*, *meteo*, *landuse*, *ocean*, *socio-economic*, *observed data*
@@ -103,23 +106,81 @@ A full list of **optional data source arguments** is given below
 - **unit_mult**: multiply the input data by a value for unit conversion (e.g. 1000 for conversion from m to mm of precipitation).
 - **attrs** (optional): This argument allows for setting attributes like the unit or long name to variables.
   *NOTE*: New in HydroMT v0.7.2
+- **placeholders** (optional): this argument can be used to generate multiple sources with a single entry in the data catalog file. If different files follow a logical
+  nomenclature, multiple data sources can be defined by iterating through all possible combinations of the placeholders. The placeholder names should be given in the
+  source name and the path, and their values listed under the placeholders argument.
+- **variants** (optional): This argument can be used to generate multiple sources with the same name, but from different providers or versions.
+  Any keys here are essentially used to extend/overwrite the base arguments.
+
+The following are **optional data source arguments** for *RasterDataset*, *GeoDataFrame*, and *GeoDataset*:
+
 - **crs** (required if missing in the data): EPSG code or WKT string of the reference coordinate system of the data.
   Only used if no crs can be inferred from the input data.
+
+The following are **optional data source arguments** for *RasterDataset*:
+
 - **zoom_level** (optional): this argument can be used for a *RasterDataset* that contains multiple zoom levels of different resolution.
   It should contain a list of numeric zoom levels that correspond to the `zoom_level` key in file path, e.g.,  ``"path/to/my/files/{zoom_level}/data.tif"``
   and corresponding resolution, expressed in the unit of the data crs.
   The *crs* argument is therefore required when using zoom_levels to correctly interpret the unit of the resolution.
   The required zoom level can be requested from HydroMT as argument to the `DataCatalog.get_rasterdataset` method,
   see `Reading tiled raster data with different zoom levels <../_examples/working_with_tiled_raster_data.ipynb>`_.
-- **placeholder** (optional): this argument can be used to generate multiple sources with a single entry in the data catalog file. If different files follow a logical
-  nomenclature, multiple data sources can be defined by iterating through all possible combinations of the placeholders. The placeholder names should be given in the
-  source name and the path and its values listed under the placeholder argument.
 
 .. note::
 
-  The **alias** argument will be deprecated and should no longer be used, see `github issue for more information <https://github.com/Deltares/hydromt/issues/148>`_
+  The **alias** argument will be deprecated and should no longer be used, see
+  `github issue for more information <https://github.com/Deltares/hydromt/issues/148>`_
 
 .. warning::
 
-  Using cloud data is still experimental and only supported for *DataFrame*, *RasterDataset* and *Geodataset* with *zarr*. *RasterDataset* with *raster* driver is also possible
+  Using cloud data is still experimental and only supported for *DataFrame*, *RasterDataset* and
+  *GeoDataset* with *zarr*. *RasterDataset* with *raster* driver is also possible
   but in case of multiple files (mosaic) we strongly recommend using a vrt file for speed and computation efficiency.
+
+Data variants
+-------------
+
+Data variants are used to define multiple data sources with the same name, but from different providers or versions.
+Below, we show an example of a data catalog for a RasterDataset with multiple variants of the same data source (esa_worldcover),
+but this works identically for other data types.
+Here, the *crs*, *data_type*, *driver* and *filesystem* are common arguments used for all variants.
+The variant arguments are used to extend and/or overwrite the common arguments, creating new sources.
+
+.. code-block:: yaml
+
+  esa_worldcover:
+    crs: 4326
+    data_type: RasterDataset
+    driver: raster
+    filesystem: local
+    variants:
+      - provider: local
+        version: 2021
+        path: landuse/esa_worldcover_2021/esa-worldcover.vrt
+      - provider: local
+        version: 2020
+        path: landuse/esa_worldcover/esa-worldcover.vrt
+      - provider: aws
+        version: 2020
+        path: s3://esa-worldcover/v100/2020/ESA_WorldCover_10m_2020_v100_Map_AWS.vrt
+        filesystem: s3
+
+
+To request a specific variant, the variant arguments can be used as keyword arguments
+to the `DataCatalog.get_rasterdataset` method, see code below.
+By default the newest version from the last provider is returned when requesting a data
+source without specifying a version or provider.
+Requesting a specific version from a HydroMT configuration file is also possible, see :ref:`model_config`.
+
+.. code-block:: python
+
+  from hydromt import DataCatalog
+  dc = DataCatalog.from_yml("data_catalog.yml")
+  # get the default version. This will return the latest (2020) version from the last provider (aws)
+  ds = dc.get_rasterdataset("esa_worldcover")
+  # get a 2020 version. This will return the 2020 version from the last provider (aws)
+  ds = dc.get_rasterdataset("esa_worldcover", version=2020)
+  # get a 2021 version. This will return the 2021 version from the local provider as this version is not available from aws.
+  ds = dc.get_rasterdataset("esa_worldcover", version=2021)
+  # get the 2020 version from the local provider
+  ds = dc.get_rasterdataset("esa_worldcover", version=2020, provider="local")
diff --git a/docs/user_guide/model_config.rst b/docs/user_guide/model_config.rst
@@ -54,3 +54,9 @@ An example .yaml file is shown below. Note that this .yaml file does not apply t
     setup_manning_roughness:
       lulc_fn: globcover             # source name of landuse-landcover data
       mapping_fn: globcover_mapping  # source name of mapping table converting lulc classes to N values
+
+    setup_infiltration:
+      soil_fn:
+        source: soil_data             # source name of soil data with specific version
+        version: 1.0                  # version of soil data
+      mapping_fn: soil_mapping        # source name of mapping table converting soil classes to infiltration parameters
diff --git a/examples/reading_vector_data.ipynb b/examples/reading_vector_data.ipynb
@@ -70,10 +70,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# supported file formats\n",
-    "import fiona\n",
-    "\n",
-    "print(list(fiona.supported_drivers.keys()))"
+    "# uncomment to see list of supported file formats\n",
+    "# import fiona\n",
+    "# print(list(fiona.supported_drivers.keys()))"
    ]
   },
   {

diff --git a/hydromt/cli/api.py b/hydromt/cli/api.py
@@ -100,13 +100,12 @@ def get_datasets(data_libs: Union[List, str]) -> Dict:
         for accepted yaml format.
     """
     data_catalog = DataCatalog(data_libs)
-    datasets = data_catalog.sources
     dataset_sources = {
         "RasterDatasetSource": [],
         "GeoDatasetSource": [],
         "GeoDataframeSource": [],
     }
-    for k, v in datasets.items():
+    for k, v in data_catalog.iter_sources():
         if v.data_type == "RasterDataset":
             dataset_sources["RasterDatasetSource"].append(k)
         elif v.data_type == "GeoDataFrame":
@@ -167,7 +166,7 @@ def get_region(
         # retrieve global hydrography data (lazy!)
         ds_org = data_catalog.get_rasterdataset(hydrography_fn)
         if "bounds" not in region:
-            region.update(basin_index=data_catalog[basin_index_fn])
+            region.update(basin_index=data_catalog.get_source(basin_index_fn))
         # get basin geometry
         geom, xy = workflows.get_basin_geometry(
             ds=ds_org,

diff --git a/hydromt/data_adapter/data_adapter.py b/hydromt/data_adapter/data_adapter.py
@@ -5,6 +5,7 @@
 from abc import ABCMeta, abstractmethod
 from itertools import product
 from string import Formatter
+from typing import Optional
 
 import geopandas as gpd
 import numpy as np
@@ -125,6 +126,8 @@ def __init__(
         driver_kwargs={},
         name="",  # optional for now
         catalog_name="",  # optional for now
+        provider: Optional[str] = None,
+        version: Optional[str] = None,
     ):
         """General Interface to data source for HydroMT.
 
@@ -170,6 +173,8 @@ def __init__(
         """
         self.name = name
         self.catalog_name = catalog_name
+        self.provider = provider
+        self.version = str(version) if version is not None else None  # version as str
         # general arguments
         self.path = path
         # driver and driver keyword-arguments
@@ -227,6 +232,13 @@ def __repr__(self):
         """Pretty print string representation of self."""
         return self.__str__()
 
+    def __eq__(self, other: object) -> bool:
+        """Return True if self and other are equal.
+
+        Two adapters compare equal only when ``other`` is of exactly the same
+        type as ``self`` (an instance of a subclass does not match) and both
+        objects produce equal ``to_dict()`` results. In every other case
+        False is returned.
+        """
+        if type(other) is type(self):
+            return self.to_dict() == other.to_dict()
+        else:
+            return False
+
     def _parse_zoom_level(
         self,
         zoom_level: int | tuple = None,

diff --git a/hydromt/data_adapter/dataframe.py b/hydromt/data_adapter/dataframe.py
@@ -3,7 +3,7 @@
 import os
 import warnings
 from os.path import join
-from typing import Union
+from typing import Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -27,9 +27,9 @@ class DataFrameAdapter(DataAdapter):
     def __init__(
         self,
         path: str,
-        driver: str = None,
+        driver: Optional[str] = None,
         filesystem: str = "local",
-        nodata: Union[dict, float, int] = None,
+        nodata: Optional[Union[dict, float, int]] = None,
         rename: dict = {},
         unit_mult: dict = {},
         unit_add: dict = {},
@@ -38,6 +38,8 @@ def __init__(
         driver_kwargs: dict = {},
         name: str = "",  # optional for now
         catalog_name: str = "",  # optional for now
+        provider: Optional[str] = None,
+        version: Optional[str] = None,
         **kwargs,
     ):
         """Initiate data adapter for 2D tabular data.
@@ -106,6 +108,8 @@ def __init__(
             driver_kwargs=driver_kwargs,
             name=name,
             catalog_name=catalog_name,
+            provider=provider,
+            version=version,
         )
 
     def to_file(

diff --git a/hydromt/data_adapter/geodataframe.py b/hydromt/data_adapter/geodataframe.py
@@ -46,6 +46,8 @@ def __init__(
         driver_kwargs: dict = {},
         name: str = "",  # optional for now
         catalog_name: str = "",  # optional for now
+        provider=None,
+        version=None,
         **kwargs,
     ):
         """Initiate data adapter for geospatial vector data.
@@ -116,6 +118,8 @@ def __init__(
             driver_kwargs=driver_kwargs,
             name=name,
             catalog_name=catalog_name,
+            provider=provider,
+            version=version,
         )
         self.crs = crs
 

diff --git a/hydromt/data_adapter/geodataset.py b/hydromt/data_adapter/geodataset.py
@@ -47,6 +47,8 @@ def __init__(
         driver_kwargs: dict = {},
         name: str = "",  # optional for now
         catalog_name: str = "",  # optional for now
+        provider=None,
+        version=None,
         **kwargs,
     ):
         """Initiate data adapter for geospatial timeseries data.
@@ -123,6 +125,8 @@ def __init__(
             driver_kwargs=driver_kwargs,
             name=name,
             catalog_name=catalog_name,
+            provider=provider,
+            version=version,
         )
         self.crs = crs
 
@@ -255,7 +259,6 @@ def get_data(
         )
 
         kwargs = self.driver_kwargs.copy()
-
         # parse geom, bbox and buffer arguments
         clip_str = ""
         if geom is None and bbox is not None:

diff --git a/hydromt/data_adapter/rasterdataset.py b/hydromt/data_adapter/rasterdataset.py
@@ -50,6 +50,8 @@ def __init__(
         zoom_levels: dict = {},
         name: str = "",  # optional for now
         catalog_name: str = "",  # optional for now
+        provider=None,
+        version=None,
         **kwargs,
     ):
         """Initiate data adapter for geospatial raster data.
@@ -127,6 +129,8 @@ def __init__(
             driver_kwargs=driver_kwargs,
             name=name,
             catalog_name=catalog_name,
+            provider=provider,
+            version=version,
         )
         self.crs = crs
         self.zoom_levels = zoom_levels
@@ -273,6 +277,7 @@ def get_data(
         )
 
         kwargs = self.driver_kwargs.copy()
+
         # zarr can use storage options directly, the rest should be converted to
         # file-like objects
         if "storage_options" in kwargs and self.driver == "raster":