From 45cadabc5756a740f30d914be1a9493abc9ea5c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 4 Jul 2024 10:24:02 +0200 Subject: [PATCH 1/5] install stable duckdb version --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index ec0ca1a..7444393 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ RUN pip install ipython==8.26.0 \ pyarrow==14.0.1 \ ydata-profiling==4.6.4 \ # linux AArch64 extensions are not available for 0.9.2 -> 0.10.0 is released early Feb. 2024 - duckdb==0.8.0 \ + "duckdb>=1.0.0" \ polars==0.19.19 \ geocube @@ -48,7 +48,7 @@ RUN mv /whitebox/WhiteboxTools_linux_amd64/WBT /src/WBT # first line checks the architecture, and replaces x86_64 with amd64, which is what duckdb uses RUN arch=$(uname -m | sed s/x86_64/amd64/) && \ mkdir /duck && \ - wget https://github.com/duckdb/duckdb/releases/download/v0.8.0/duckdb_cli-linux-${arch}.zip && \ + wget https://github.com/duckdb/duckdb/releases/download/v1.0.0/duckdb_cli-linux-${arch}.zip && \ unzip duckdb_cli-linux-${arch}.zip && \ rm duckdb_cli-linux-${arch}.zip && \ chmod +x ./duckdb && \ From 2e2187b4066527e3390e4cbf25b19220c3fae472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 4 Jul 2024 10:24:11 +0200 Subject: [PATCH 2/5] fix naming issues --- src/aggregator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/aggregator.py b/src/aggregator.py index 934a856..46f9a85 100644 --- a/src/aggregator.py +++ b/src/aggregator.py @@ -145,18 +145,18 @@ def aggregate_scale(aggregation_scale: str, executor: Executor, precision: Optio if aggregation_scale == 'temporal': on = ['time'] elif aggregation_scale == 'spatial': - on = ['lon', 'lat'] + on = ['x', 'y'] elif aggregation_scale == 'spatiotemporal': - on = ['time', 'lon', 'lat'] + on = ['time', 'x', 'y'] # extract only the index and column 'mean' - mean = df[[*on, 'mean']].rename({'mean': layer}) + mean = df[[*on, 'mean']].clone().rename({'mean': layer}) # join the means if means is None: means = mean else: - means = means.join(mean, on=on, how='outer') + means = means.join(mean, on=on, how='outer').clone() # finally save the means path = params.result_path / f"mean_{aggregation_scale}_aggs.parquet" From 41b175c76465ef340da3262650f58d5f1a1a2b78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 4 Jul 2024 10:24:21 +0200 Subject: [PATCH 3/5] enable spatial extensions --- src/param.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/param.py b/src/param.py index 6d93812..971a2dc 100644 --- a/src/param.py +++ b/src/param.py @@ -54,6 +54,9 @@ class Params(BaseModel): base_path: str = '/out' netcdf_backend: NetCDFBackends = NetCDFBackends.XARRAY + # duckdb settings + use_spatial: bool = False + @property def dataset_path(self) -> Path: if self.keep_data_files: From 0ef29a8c921905f6754c6ac45a674e3dc5361c86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 4 Jul 2024 10:24:36 +0200 Subject: [PATCH 4/5] allow to use spatial extension (not working yet) --- src/ingestor.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/ingestor.py b/src/ingestor.py index 189c537..409a1f2 100644 --- a/src/ingestor.py +++ b/src/ingestor.py @@ -83,9 +83,7 @@ def _table_exists(table_name: str) -> bool: return False -def _create_datasource_table(entry: Entry, table_name: str, use_spatial: bool = False) -> str: - if use_spatial: - raise NotImplementedError('There is still an error with the spatial type.') +def _create_datasource_table(entry: Entry, table_name: str) -> str: # get the parameters params = load_params() @@ -102,7 +100,7 @@ def _create_datasource_table(entry: Entry, table_name: str, use_spatial: bool = column_names.append(f" time TIMESTAMP") # spatial dimensions - if len(spatial_dims) == 2 and use_spatial: + if len(spatial_dims) == 2 and params.use_spatial: column_names.append(f" cell BOX_2D") else: column_names.append(' ' + ','.join([f" {name} DOUBLE" for dim, name in zip(spatial_dims, SPATIAL_DIMENSIONS)])) @@ -126,9 +124,8 @@ def _create_datasource_table(entry: Entry, table_name: str, use_spatial: bool = return dbname -def _create_insert_sql(entry: Entry, table_name: str, source_name: str = 'df', use_spatial: bool = False) -> str: - if use_spatial: - raise NotImplementedError('There is still an error with the spatial type.') +def _create_insert_sql(entry: Entry, table_name: str, source_name: str = 'df') -> str: + params = load_params() # get the dimension names spatial_dims = entry.datasource.spatial_scale.dimension_names if entry.datasource.spatial_scale is not None else [] @@ -146,7 +143,7 @@ def _create_insert_sql(entry: Entry, table_name: str, source_name: str = 'df', u column_names.append(f" {temporal_dims[0]} as time ") # spatial dimensions - if len(spatial_dims) == 2 and use_spatial: + if len(spatial_dims) == 2 and params.use_spatial: column_names.append(f" ({','.join(spatial_dims)})::BOX_2D AS cell ") else: column_names.append(' ' + ', '.join([f"{dim} AS {name}" for dim, name in zip(spatial_dims, SPATIAL_DIMENSIONS)])) From 2f8e0a909544f141943be74df84e77bda3da73ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Tue, 9 Jul 2024 08:15:32 +0200 Subject: [PATCH 5/5] drop na values on ingest --- src/ingestor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingestor.py b/src/ingestor.py index 409a1f2..0931fd9 100644 --- a/src/ingestor.py +++ b/src/ingestor.py @@ -255,7 +255,7 @@ def load_xarray_to_duckdb(entry: Entry, data: xr.Dataset) -> str: # get a delayed dask dataframe try: - ddf = data.to_dask_dataframe()[dimension_names] + ddf = data.to_dask_dataframe()[dimension_names].dropna() except ValueError as e: # check this is the chunking error if 'Object has inconsistent chunks' in str(e):