diff --git a/Dockerfile b/Dockerfile
index ec0ca1a..7444393 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,7 +24,7 @@ RUN pip install ipython==8.26.0 \
     pyarrow==14.0.1 \
     ydata-profiling==4.6.4 \
     # linux AArch64 extensions are not available for 0.9.2 -> 0.10.0 is released early Feb. 2024
-    duckdb==0.8.0 \
+    "duckdb>=1.0.0" \
     polars==0.19.19 \
     geocube
 
@@ -48,7 +48,7 @@ RUN mv /whitebox/WhiteboxTools_linux_amd64/WBT /src/WBT
 # first line checks the architecture, and replaces x86_64 with amd64, which is what duckdb uses
 RUN arch=$(uname -m | sed s/x86_64/amd64/) && \
     mkdir /duck && \
-    wget https://github.com/duckdb/duckdb/releases/download/v0.8.0/duckdb_cli-linux-${arch}.zip && \
+    wget https://github.com/duckdb/duckdb/releases/download/v1.0.0/duckdb_cli-linux-${arch}.zip && \
     unzip duckdb_cli-linux-${arch}.zip && \
     rm duckdb_cli-linux-${arch}.zip && \
     chmod +x ./duckdb && \
diff --git a/src/aggregator.py b/src/aggregator.py
index 934a856..46f9a85 100644
--- a/src/aggregator.py
+++ b/src/aggregator.py
@@ -145,18 +145,18 @@ def aggregate_scale(aggregation_scale: str, executor: Executor, precision: Optio
     if aggregation_scale == 'temporal':
         on = ['time']
     elif aggregation_scale == 'spatial':
-        on = ['lon', 'lat']
+        on = ['x', 'y']
     elif aggregation_scale == 'spatiotemporal':
-        on = ['time', 'lon', 'lat']
+        on = ['time', 'x', 'y']
 
     # extract only the index and column 'mean'
-    mean = df[[*on, 'mean']].rename({'mean': layer})
+    mean = df[[*on, 'mean']].clone().rename({'mean': layer})
 
     # join the means
     if means is None:
         means = mean
     else:
-        means = means.join(mean, on=on, how='outer')
+        means = means.join(mean, on=on, how='outer').clone()
 
     # finally save the means
     path = params.result_path / f"mean_{aggregation_scale}_aggs.parquet"
diff --git a/src/ingestor.py b/src/ingestor.py
index 189c537..0931fd9 100644
--- a/src/ingestor.py
+++ b/src/ingestor.py
@@ -83,9 +83,7 @@ def _table_exists(table_name: str) -> bool:
     return False
 
 
-def _create_datasource_table(entry: Entry, table_name: str, use_spatial: bool = False) -> str:
-    if use_spatial:
-        raise NotImplementedError('There is still an error with the spatial type.')
+def _create_datasource_table(entry: Entry, table_name: str) -> str:
     # get the parameters
     params = load_params()
 
@@ -102,7 +100,7 @@ def _create_datasource_table(entry: Entry, table_name: str, use_spatial: bool =
         column_names.append(f" time TIMESTAMP")
 
     # spatial dimensions
-    if len(spatial_dims) == 2 and use_spatial:
+    if len(spatial_dims) == 2 and params.use_spatial:
         column_names.append(f" cell BOX_2D")
     else:
         column_names.append(' ' + ','.join([f" {name} DOUBLE" for dim, name in zip(spatial_dims, SPATIAL_DIMENSIONS)]))
@@ -126,9 +124,8 @@ def _create_datasource_table(entry: Entry, table_name: str, use_spatial: bool =
     return dbname
 
 
-def _create_insert_sql(entry: Entry, table_name: str, source_name: str = 'df', use_spatial: bool = False) -> str:
-    if use_spatial:
-        raise NotImplementedError('There is still an error with the spatial type.')
+def _create_insert_sql(entry: Entry, table_name: str, source_name: str = 'df') -> str:
+    params = load_params()
 
     # get the dimension names
     spatial_dims = entry.datasource.spatial_scale.dimension_names if entry.datasource.spatial_scale is not None else []
@@ -146,7 +143,7 @@ def _create_insert_sql(entry: Entry, table_name: str, source_name: str = 'df', u
         column_names.append(f" {temporal_dims[0]} as time ")
 
     # spatial dimensions
-    if len(spatial_dims) == 2 and use_spatial:
+    if len(spatial_dims) == 2 and params.use_spatial:
         column_names.append(f" ({','.join(spatial_dims)})::BOX_2D AS cell ")
     else:
         column_names.append(' ' + ', '.join([f"{dim} AS {name}" for dim, name in zip(spatial_dims, SPATIAL_DIMENSIONS)]))
@@ -258,7 +255,7 @@ def load_xarray_to_duckdb(entry: Entry, data: xr.Dataset) -> str:
 
     # get a delayed dask dataframe
     try:
-        ddf = data.to_dask_dataframe()[dimension_names]
+        ddf = data.to_dask_dataframe()[dimension_names].dropna()
     except ValueError as e:
         # check this is the chunking error
         if 'Object has inconsistent chunks' in str(e):
diff --git a/src/param.py b/src/param.py
index 6d93812..971a2dc 100644
--- a/src/param.py
+++ b/src/param.py
@@ -54,6 +54,9 @@ class Params(BaseModel):
     base_path: str = '/out'
     netcdf_backend: NetCDFBackends = NetCDFBackends.XARRAY
 
+    # duckdb settings
+    use_spatial: bool = False
+
     @property
     def dataset_path(self) -> Path:
         if self.keep_data_files:
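
Note on the new flag: `_create_datasource_table` and `_create_insert_sql` no longer take a per-call `use_spatial` argument; both now read it from the shared `Params` model, where it defaults to `False`. Below is a minimal sketch (not part of the patch) of how the flag could be exercised; it assumes `load_params` is importable from `src/param.py` and that the `BOX_2D` column type requires DuckDB's spatial extension to be loaded, neither of which the patch itself confirms.

    # Sketch only, not part of the patch.
    # Assumes load_params() returns the Params model from src/param.py and
    # that BOX_2D needs DuckDB's spatial extension loaded before use.
    import duckdb

    from param import load_params  # hypothetical import path

    params = load_params()
    con = duckdb.connect('datasets.db')  # hypothetical database file

    if params.use_spatial:
        # BOX_2D is provided by the spatial extension, so install and load it
        # before running the SQL built by _create_datasource_table and
        # _create_insert_sql.
        con.install_extension('spatial')
        con.load_extension('spatial')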