Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: data versioning in the data catalog #438

Merged
merged 32 commits into from
Jul 31, 2023
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
d8c52a4
[no ci] WIP
savente93 Jun 29, 2023
7245aad
[no ci] WIP
savente93 Jul 3, 2023
b5fb06f
fix #413
DirkEilander Jun 28, 2023
977c680
add test and changelog
DirkEilander Jun 29, 2023
124afaf
[no ci] WIP
savente93 Jul 3, 2023
5a2b6b4
wip
savente93 Jul 3, 2023
8deff0b
[no ci] WIP
savente93 Jul 4, 2023
4fbb763
Merge branch 'main' into version-support
savente93 Jul 10, 2023
cd4a3ff
[no ci] WIP
savente93 Jul 10, 2023
bbd5d5e
[no ci] WIP
savente93 Jul 10, 2023
438f375
fix tests, catalog still need diffing
savente93 Jul 10, 2023
c97a7ba
get started on data catalog partitioning
savente93 Jul 10, 2023
ee9b803
fix reading merged catalogs, write impl missing
savente93 Jul 11, 2023
228db74
wip
savente93 Jul 11, 2023
ada6621
[no ci] WIP
savente93 Jul 11, 2023
e034c85
fix alias tests
savente93 Jul 12, 2023
5b7f7a0
remove temporary test skip
savente93 Jul 12, 2023
e46b608
introduce deprecation warning for alias
savente93 Jul 12, 2023
91a4e37
Merge branch 'main' into version-support
savente93 Jul 14, 2023
920aac8
Merge branch 'main' into version-support
savente93 Jul 17, 2023
2aa76ef
Merge branch 'main' into version-support
savente93 Jul 18, 2023
1388d60
fix data adapters part
savente93 Jul 24, 2023
5791818
fix versoined catalog tests
savente93 Jul 24, 2023
664d88a
clean up impl by just using that dicts are ordered
savente93 Jul 24, 2023
516b6ec
Update docs/user_guide/data_prepare_cat.rst
savente93 Jul 27, 2023
cdd4d5e
incorproate PR feedback
savente93 Jul 27, 2023
caea5de
Merge branch 'main' into version-support
savente93 Jul 27, 2023
9e59f6f
remove stray debug statement
savente93 Jul 27, 2023
3f814a4
move type to explicit Union
savente93 Jul 27, 2023
7bf4de3
fix to_dataframe
savente93 Jul 27, 2023
11296c4
fix docs warning
savente93 Jul 27, 2023
b907c90
fix inconsistent argument names; fix to_dict with more than 2 variant…
DirkEilander Jul 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Unreleased

Added
-----
-
- Support for loading the same data source from different places (e.g. local & AWS)

Changed
-------
Expand Down
2 changes: 2 additions & 0 deletions docs/user_guide/data_prepare_cat.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ A full list of **optional data source arguments** is given below
- **placeholder** (optional): this argument can be used to generate multiple sources with a single entry in the data catalog file. If different files follow a logical
nomenclature, multiple data sources can be defined by iterating through all possible combinations of the placeholders. The placeholder names should be given in the
source name and the path and its values listed under the placeholder argument.
- **versions** (optional): If you want to use the same data source but load it from different places (e.g. local & AWS), you can add this key.
savente93 marked this conversation as resolved.
Show resolved Hide resolved
Keys here are essentially overrides that will get applied to the containing catalog when they get parsed and expanded.

.. note::

Expand Down
1 change: 1 addition & 0 deletions hydromt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@

# high-level methods
from .models import *
from .utils import *
savente93 marked this conversation as resolved.
Show resolved Hide resolved
5 changes: 2 additions & 3 deletions hydromt/cli/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,12 @@ def get_datasets(data_libs: Union[List, str]) -> Dict:
for accepted yaml format.
"""
data_catalog = DataCatalog(data_libs)
datasets = data_catalog.sources
dataset_sources = {
"RasterDatasetSource": [],
"GeoDatasetSource": [],
"GeoDataframeSource": [],
}
for k, v in datasets.items():
for k, v in data_catalog.iter_sources():
if v.data_type == "RasterDataset":
dataset_sources["RasterDatasetSource"].append(k)
elif v.data_type == "GeoDataFrame":
Expand Down Expand Up @@ -167,7 +166,7 @@ def get_region(
# retrieve global hydrography data (lazy!)
ds_org = data_catalog.get_rasterdataset(hydrography_fn)
if "bounds" not in region:
region.update(basin_index=data_catalog[basin_index_fn])
region.update(basin_index=data_catalog.get_source(basin_index_fn))
# get basin geometry
geom, xy = workflows.get_basin_geometry(
ds=ds_org,
Expand Down
4 changes: 4 additions & 0 deletions hydromt/data_adapter/data_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ def __init__(
driver_kwargs={},
name="", # optional for now
catalog_name="", # optional for now
provider="UNSPECIFIED",
data_version="UNSPECIFIED",
savente93 marked this conversation as resolved.
Show resolved Hide resolved
):
"""General Interface to data source for HydroMT.

Expand Down Expand Up @@ -170,6 +172,8 @@ def __init__(
"""
self.name = name
self.catalog_name = catalog_name
self.provider = provider
self.data_version = data_version
# general arguments
self.path = path
# driver and driver keyword-arguments
Expand Down
7 changes: 7 additions & 0 deletions hydromt/data_adapter/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def __init__(
driver_kwargs: dict = {},
name: str = "", # optional for now
catalog_name: str = "", # optional for now
provider=None,
data_version=None,
**kwargs,
):
"""Initiate data adapter for 2D tabular data.
Expand Down Expand Up @@ -106,6 +108,8 @@ def __init__(
driver_kwargs=driver_kwargs,
name=name,
catalog_name=catalog_name,
provider=provider,
data_version=data_version,
)

def to_file(
Expand Down Expand Up @@ -198,6 +202,9 @@ def get_data(
_ = self.resolve_paths(**so_kwargs) # throw nice error if data not found

kwargs = self.driver_kwargs.copy()
# these are just for internal bookkeeping; drivers don't need them
_ = kwargs.pop("provider", None)
_ = kwargs.pop("data_version", None)

# read and clip
logger.info(f"DataFrame: Read {self.driver} data.")
Expand Down
7 changes: 7 additions & 0 deletions hydromt/data_adapter/geodataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ def __init__(
driver_kwargs: dict = {},
name: str = "", # optional for now
catalog_name: str = "", # optional for now
provider=None,
data_version=None,
**kwargs,
):
"""Initiate data adapter for geospatial vector data.
Expand Down Expand Up @@ -116,6 +118,8 @@ def __init__(
driver_kwargs=driver_kwargs,
name=name,
catalog_name=catalog_name,
provider=provider,
data_version=data_version,
)
self.crs = crs

Expand Down Expand Up @@ -216,6 +220,9 @@ def get_data(
_ = self.resolve_paths() # throw nice error if data not found

kwargs = self.driver_kwargs.copy()
# these are just for internal bookkeeping; drivers don't need them
_ = kwargs.pop("provider", None)
_ = kwargs.pop("data_version", None)
# parse geom, bbox and buffer arguments
clip_str = ""
if geom is None and bbox is not None:
Expand Down
11 changes: 10 additions & 1 deletion hydromt/data_adapter/geodataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ def __init__(
driver_kwargs: dict = {},
name: str = "", # optional for now
catalog_name: str = "", # optional for now
provider=None,
data_version=None,
**kwargs,
):
"""Initiate data adapter for geospatial timeseries data.
Expand Down Expand Up @@ -123,6 +125,8 @@ def __init__(
driver_kwargs=driver_kwargs,
name=name,
catalog_name=catalog_name,
provider=provider,
data_version=data_version,
)
self.crs = crs

Expand Down Expand Up @@ -255,7 +259,12 @@ def get_data(
)

kwargs = self.driver_kwargs.copy()

# these are just for internal bookkeeping; drivers don't need them
_ = kwargs.pop(
"provider",
None,
)
_ = kwargs.pop("data_version", None)
# parse geom, bbox and buffer arguments
clip_str = ""
if geom is None and bbox is not None:
Expand Down
7 changes: 7 additions & 0 deletions hydromt/data_adapter/rasterdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ def __init__(
zoom_levels: dict = {},
name: str = "", # optional for now
catalog_name: str = "", # optional for now
provider=None,
data_version=None,
**kwargs,
):
"""Initiate data adapter for geospatial raster data.
Expand Down Expand Up @@ -127,6 +129,8 @@ def __init__(
driver_kwargs=driver_kwargs,
name=name,
catalog_name=catalog_name,
provider=provider,
data_version=data_version,
)
self.crs = crs
self.zoom_levels = zoom_levels
Expand Down Expand Up @@ -271,6 +275,9 @@ def get_data(
)

kwargs = self.driver_kwargs.copy()
# these are just for internal bookkeeping; drivers don't need them
_ = kwargs.pop("provider", None)
_ = kwargs.pop("data_version", None)
# zarr can use storage options directly, the rest should be converted to
# file-like objects
if "storage_options" in kwargs and self.driver == "raster":
Expand Down
Loading