From 5873bf74cb0360a429cf7f698c5f9e5d4f73dde1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 1 Aug 2024 12:56:27 +0200 Subject: [PATCH 1/4] rename --- src/{ingestor.py => duck_ingestor.py} | 0 src/loader.py | 48 +++++++++++++++++++-------- src/param.py | 6 ++++ src/run.py | 18 +++++++--- 4 files changed, 53 insertions(+), 19 deletions(-) rename src/{ingestor.py => duck_ingestor.py} (100%) diff --git a/src/ingestor.py b/src/duck_ingestor.py similarity index 100% rename from src/ingestor.py rename to src/duck_ingestor.py diff --git a/src/loader.py b/src/loader.py index f03ab5b..eafb06f 100644 --- a/src/loader.py +++ b/src/loader.py @@ -12,7 +12,7 @@ import rasterio as rio from json2args.logger import logger -from writer import dispatch_save_file, entry_metadata_saver +from writer import dispatch_save_file, entry_metadata_saver, xarray_to_netcdf_saver from param import load_params, Params from utils import whitebox_log_handler @@ -112,8 +112,10 @@ def load_netcdf_file(entry: Entry, executor: Executor) -> str: # get a path for the current dataset path dataset_base_path = params.dataset_path / f"{entry.variable.name.replace(' ', '_')}_{entry.id}" + # create a counter for the saved parts + part = 0 # preprocess each netcdf / grib / zarr file - for i, fname in enumerate(fnames): + for fname in fnames: # read the min and max time and check if we can skip ds = xr.open_dataset(fname, decode_coords='all', mask_and_scale=True) @@ -157,13 +159,19 @@ def load_netcdf_file(entry: Entry, executor: Executor) -> str: # and supress the creation of metadata files dataset_base_path.mkdir(parents=True, exist_ok=True) - # get the filenmae - filename = f"{entry.variable.name.replace(' ', '_')}_{entry.id}" - target_name = f"{filename}_part_{i + 1}.nc" + # we will actually save, so increate the part counter + part += 1 - dispatch_save_file(entry=entry, data=data, executor=executor, base_path=str(dataset_base_path), target_name=target_name, save_meta=False) + # get the filename + filename = f"{entry.variable.name.replace(' ', '_')}_{entry.id}" + target_name = f"{filename}_part_{part}.nc" + + # use the dispatch_save_file function to save the data + # dispatch_save_file(entry=entry, data=data, executor=executor, base_path=str(dataset_base_path), target_name=target_name, save_meta=False) + xarray_to_netcdf_saver(data=data, target_name=str(dataset_base_path / target_name)) + # if there are many files, we save the metadata only once - if i == 0: + if part == 1: metafile_name = str(params.dataset_path / f"{filename}.metadata.json") entry_metadata_saver(entry, metafile_name) logger.info(f"Saved metadata for dataset to {metafile_name}.") @@ -301,18 +309,30 @@ def error_handler(future): logger.error(f"ERRORED: clipping dataset : {str(exc)}") # collect all futures - futures = [] + # futures = [] + + part = 1 + # go for each file - for i, fname in enumerate(fnames): + for fname in fnames: # derive an out-name - out_name = None if len(fnames) == 1 else f"{Path(fname).stem}_part_{i + 1}.tif" + if len(fnames) == 1: + out_name = f"{entry.variable.name.replace(' ', '_')}_{entry.id}.tif" + else: + out_name = f"{entry.variable.name.replace(' ', '_')}_{entry.id}_part_{part}.tif" + # submit each save task to the executor - future = executor.submit(_rio_clip_raster, fname, reference_area, dataset_base_path, out_name=out_name, touched=params.cell_touches) - future.add_done_callback(error_handler) - futures.append(future) + #future = executor.submit(_rio_clip_raster, fname, reference_area, dataset_base_path, 
out_name=out_name, touched=params.cell_touches) + #future.add_done_callback(error_handler) + #futures.append(future) + + # call procedurally + out_path = _rio_clip_raster(fname, reference_area, base_path=dataset_base_path, out_name=out_name, touched=params.cell_touches) + if out_path is not None: + part += 1 # wait until all are finished - tiles = [future.result() for future in futures if future.result() is not None] + #tiles = [future.result() for future in futures if future.result() is not None] # run the merge function and delete the other files # if len(tiles) > 1: diff --git a/src/param.py b/src/param.py index 4f8acb2..d2fd855 100644 --- a/src/param.py +++ b/src/param.py @@ -31,6 +31,11 @@ class NetCDFBackends(str, Enum): PARQUET = 'parquet' +class IngestorBackend(str, Enum): + DUCKDB = 'duckdb' + XARRAY = 'xarray' + + class Params(BaseModel): # mandatory inputs are the dataset ids and the reference area dataset_ids: List[int] @@ -54,6 +59,7 @@ class Params(BaseModel): # stuff that we do not change in the tool base_path: str = '/out' netcdf_backend: NetCDFBackends = NetCDFBackends.XARRAY + ingestor_backend: IngestorBackend = IngestorBackend.DUCKDB # duckdb settings use_spatial: bool = False diff --git a/src/run.py b/src/run.py index dcf6fc5..eb5ac52 100644 --- a/src/run.py +++ b/src/run.py @@ -11,10 +11,10 @@ from metacatalog import api from tqdm import tqdm -from param import load_params, Integrations +from param import load_params, Integrations, IngestorBackend from loader import load_entry_data from json2args.logger import logger -import ingestor +import duck_ingestor import aggregator import reporter from clip import reference_area_to_file @@ -102,6 +102,8 @@ file_mapping = [] with PoolExecutor() as executor: logger.debug(f"START {type(executor).__name__} - Pool to load and clip data source files.") + logger.info(f"A total of {len(params.dataset_ids)} are requested. Start loading data sources.") + for dataset_id in tqdm(params.dataset_ids): try: entry = api.find_entry(session, id=dataset_id, return_iterator=True).one() @@ -131,11 +133,17 @@ # check if we have any files to process elif len(file_mapping) > 0: - logger.info(f"Starting to create a consistent DuckDB dataset at {params.database_path}. Check out https://duckdb.org/docs/api/overview to learn more about DuckDB.") - # start a timer t1 = time.time() - path = ingestor.load_files(file_mapping=file_mapping) + + # check which backend to use + if params.ingestor_backend == IngestorBackend.DUCKDB: + logger.info(f"Starting to create a consistent DuckDB dataset at {params.database_path}. 
Check out https://duckdb.org/docs/api/overview to learn more about DuckDB.") + path = duck_ingestor.load_files(file_mapping=file_mapping) + else: + logger.info(f"Starting to create a consistent DataCube dataset at {params.database_path}.") + raise NotImplementedError + t2 = time.time() logger.info(f"Finished creating the dataset at {path} in {t2-t1:.2f} seconds.") else: From 7c83fee3275ecd7b1b1a8ab54cfcde1ff299c00d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Tue, 17 Sep 2024 09:06:55 +0200 Subject: [PATCH 2/4] remove unnecessary parameter --- src/param.py | 39 ++-------------------- src/run.py | 93 ++++------------------------------------------------ src/tool.yml | 61 ++++------------------------------ 3 files changed, 15 insertions(+), 178 deletions(-) diff --git a/src/param.py b/src/param.py index d2fd855..f9d7ff7 100644 --- a/src/param.py +++ b/src/param.py @@ -15,15 +15,6 @@ import geopandas as gpd -# create the Enum for integration type -class Integrations(str, Enum): - TEMPORAL = 'temporal' - SPATIAL = 'spatial' - SPATIO_TEMPORAL = 'spatiotemporal' - ALL = 'all' - NONE = 'none' - FULL = 'full' - class NetCDFBackends(str, Enum): XARRAY = 'xarray' @@ -31,11 +22,6 @@ class NetCDFBackends(str, Enum): PARQUET = 'parquet' -class IngestorBackend(str, Enum): - DUCKDB = 'duckdb' - XARRAY = 'xarray' - - class Params(BaseModel): # mandatory inputs are the dataset ids and the reference area dataset_ids: List[int] @@ -44,32 +30,17 @@ class Params(BaseModel): # optional parameters to configure the processing start_date: datetime = None end_date: datetime = None - integration: Integrations = Integrations.ALL - apply_aggregation: bool = False - - # optional parameter to configure output - keep_data_files: bool = True - database_name: str = 'dataset.duckdb' - - # optional parameter to provide result output - precision: str = 'day' - resolution: int = 5000 cell_touches: bool = True # stuff that we do not change in the tool base_path: str = '/out' + dataset_folder_name: str = 'datasets' netcdf_backend: NetCDFBackends = NetCDFBackends.XARRAY - ingestor_backend: IngestorBackend = IngestorBackend.DUCKDB - - # duckdb settings - use_spatial: bool = False @property def dataset_path(self) -> Path: - if self.keep_data_files: - p = Path(self.base_path) / 'datasets' - else: - p = Path(tempfile.mkdtemp()) + # set the databsets path + p = Path(self.base_path) / self.dataset_folder_name # make the directory if it does not exist p.mkdir(parents=True, exist_ok=True) @@ -85,10 +56,6 @@ def result_path(self) -> Path: return p - @property - def database_path(self) -> Path: - return Path(self.base_path) / self.database_name - @property def reference_area_df(self) -> gpd.GeoDataFrame: return gpd.GeoDataFrame.from_features([self.reference_area]) diff --git a/src/run.py b/src/run.py index eb5ac52..0539fba 100644 --- a/src/run.py +++ b/src/run.py @@ -11,12 +11,9 @@ from metacatalog import api from tqdm import tqdm -from param import load_params, Integrations, IngestorBackend +from param import load_params from loader import load_entry_data from json2args.logger import logger -import duck_ingestor -import aggregator -import reporter from clip import reference_area_to_file from version import __version__ @@ -60,8 +57,7 @@ START DATE: {params.start_date} END DATE: {params.end_date} REFERENCE AREA: {params.reference_area is not None} -INTEGRATION: {params.integration} -KEEP DATA FILES: {params.keep_data_files} +CELL TOUCHES: {params.cell_touches} DATASET IDS: {', '.join(map(str, params.dataset_ids))} @@ 
-69,25 +65,16 @@ DATABASE CONNECTION: {connection is not None} DATABASE URI: {session.bind} -AGGREGATION SETTINGS --------------------- -PRECISION: {params.precision} -RESOLUTION: {params.resolution}x{params.resolution} -TARGET CRS: EPSG:3857 - Processing logs: ---------------- """ with open('/out/processing.log', 'w') as f: f.write(MSG) -# if the integration is set to NONE and the user does not want to keep the data files, there will be no output -if params.integration == Integrations.NONE and not params.keep_data_files: - logger.critical("You have set the integration to NONE and do not want to keep the data files. This will result in no output.") - sys.exit(1) - -# --------------------------------------------------------------------------- # # Here is the actual tool +# --------------------------------------------------------------------------- # +# mark the start of the tool +logger.info("##TOOL START - Vforwater Loader") tool_start = time.time() # debug the params before we do anything with them @@ -126,78 +113,10 @@ executor.shutdown(wait=True) logger.info(f"STOP {type(executor).__name__} - Pool finished all tasks and shutdown.") -# here to the stuff for creating a consistent dataset -# check if the user disabled integration -if params.integration == Integrations.NONE: - logger.info("Integration is disabled. No further processing will be done.") - -# check if we have any files to process -elif len(file_mapping) > 0: - # start a timer - t1 = time.time() - - # check which backend to use - if params.ingestor_backend == IngestorBackend.DUCKDB: - logger.info(f"Starting to create a consistent DuckDB dataset at {params.database_path}. Check out https://duckdb.org/docs/api/overview to learn more about DuckDB.") - path = duck_ingestor.load_files(file_mapping=file_mapping) - else: - logger.info(f"Starting to create a consistent DataCube dataset at {params.database_path}.") - raise NotImplementedError - - t2 = time.time() - logger.info(f"Finished creating the dataset at {path} in {t2-t1:.2f} seconds.") -else: - logger.warning("It seems like no data files have been processed. 
This might be an error.") - -# switch the type of integrations -if params.integration != Integrations.NONE and params.apply_aggregation: - with PoolExecutor() as executor: - logger.debug(f"START {type(executor).__name__} - Pool to ingest data files into a Dataset DuckDB database.") - - if params.integration == Integrations.TEMPORAL or params.integration == Integrations.ALL: - # run the temporal aggregation - aggregator.aggregate_scale(aggregation_scale='temporal', executor=executor) - - if params.integration == Integrations.SPATIAL or params.integration == Integrations.ALL and False: - # run the spatial aggregation - aggregator.aggregate_scale(aggregation_scale='spatial', executor=executor) - - if params.integration == Integrations.SPATIO_TEMPORAL or params.integration == Integrations.ALL and False: - # run the spatio-temporal aggregation - aggregator.aggregate_scale(aggregation_scale='spatiotemporal', executor=executor) - - # wait until all results are finished - executor.shutdown(wait=True) - logger.debug(f"STOP {type(executor).__name__} - Pool finished all tasks and shutdown.") - - - # finally run a thrid pool to generate reports - with PoolExecutor() as executor: - logger.debug(f"START {type(executor).__name__} - Pool to generate final reports.") - - # create a callback to log exceptions - def callback(future): - exc = future.exception() - if exc is not None: - logger.exception(exc) - - # generate the profile report - start first as this one might potentially take longer - # TODO: there should be an option to disable this - executor.submit(reporter.generate_profile_report).add_done_callback(callback) - - # generate the readme - executor.submit(reporter.generate_readme).add_done_callback(callback) - - - # wait until all results are finished - executor.shutdown(wait=True) - logger.debug(f"STOP {type(executor).__name__} - Pool finished all tasks and shutdown.") -# --------------------------------------------------------------------------- # - - # we're finished. t2 = time.time() logger.info(f"Total runtime: {t2 - tool_start:.2f} seconds.") +logger.info("##TOOL FINISH - Vforwater Loader") # print out the report with open('/out/processing.log', 'r') as f: diff --git a/src/tool.yml b/src/tool.yml index 65fa937..63e1a16 100644 --- a/src/tool.yml +++ b/src/tool.yml @@ -27,60 +27,11 @@ tools: type: datetime description: | The end date of the datasetm, if a time dimension applies to the dataset. - integration: - type: enum - values: - - none - - all - - spatial - - temporal - description: | - The mode of operation for the integration of each all data files associated to each data source - into a common DuckDB-based dataset. This dataset includes data for a unified spatial and temporal - extent and includes macros for aggregation. By setting `integration` the default integrations are - selected. The resulting database can still be used to query different aggregation levels. - - `none`: No integration will be performed and the DuckDB database will **NOT** be created. - - `all`: Temporal, spatial and spatio-temporal scale aggregations will be integrated, if the scale is defined in the dataset metadata. - - `spatial`: Only results for spatial aggregations will be provided. - - `temporal`: Only results for temporal aggregations will be provided. - - `spatiotemporal`: Only results for spatio-temporal aggregations will be provided. 
- optional: true - keep_data_files: - type: boolean - optional: true - description: | - If set to `false`, the data files clipped to the spatial and temporal scale as defined in the - data-source metadata will not be kept. This saves a lot of disk space for the output. - If set to `true` (default behavior), then there will be a `/out/datasets` directory in the output. - precision: - type: enum - values: - - minute - - hour - - day - - month - - year - - decade - optional: true - description: | - The precision for aggregations along the temporal scale of the datasets. This parameter does only take effect if - the integration includes a data integration along the temporal scale. That includes: `temporal`, `spatiotemporal` and `all`. - If integration is set set and no precision is supplied, the tool chooses a suitable precision. This decision is not - yet deterministic. - resolution: - type: integer - optional: true - description: | - The resolution of the output data. This parameter is only relevant for areal datasets. If the dataset is not areal, this parameter - is ignored. If the dataset is areal and the parameter is not set, the original resolution of the dataset is used. - If the dataset is areal and the parameter is set, the dataset will be resampled to the given resolution. - Note: Right now, the aggregations can only write parquet files. For larger (espeically spatio-temporal) datasets, these - aggreations can be large. A future version will write netCDF or Zarr for these cases. - apply_aggregation: + cell_touches: type: boolean - optional: true description: | - If set to `true`, the tool will apply the aggregation as defined in the metadata (precision and resoplution). - If set to `false` (default), the tool will not apply any aggregation while loading. In these casesm the integrated dataset will be available, - which is a duckdb instance that holds macros for aggregation. Thus, you can adjust the precision and resolution later on. - The main downside for duckdb instances is that they take up more disk space, than aggregated datasets. + If set to true, the tool will only return datasets that have a spatial overlap with the reference area. + If set to false, the tool will return datasets that have a spatial overlap or touch the reference area. + If omitted, the default is true. + Note: This parameter only applies to datasets with a defined spatial scale extent. + optional: true \ No newline at end of file From eb1f1362e7d622a464c3e1bb552abea6551d3340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Tue, 17 Sep 2024 09:07:06 +0200 Subject: [PATCH 3/4] update Dockerfile --- Dockerfile | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6528f7e..36b0b72 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,18 +23,18 @@ RUN pip install \ xarray[complete]==2024.7.0 \ rioxarray==0.17.0 \ pyarrow==17.0.0 \ - ydata-profiling==4.9.0 \ + #ydata-profiling==4.9.0 \ # linux AArch64 extensions are not available for 0.9.2 -> 0.10.0 is released early Feb. 
2024 - "duckdb>=1.0.0" \ + #"duckdb>=1.0.0" \ polars-lts-cpu==1.1.0 \ geocube==0.6.0 # install the needed version for metacatalog -RUN pip install metacatalog==0.9.1 +RUN pip install metacatalog==0.9.2 # Install CDO, might be used to do seltimestep or sellonlatbox and possibly merge -RUN apt-get install -y gettext=0.21-12 \ - gnuplot=5.4.4+dfsg1-2 +#RUN apt-get install -y gettext=0.21-12 \ + #gnuplot=5.4.4+dfsg1-2 # cdo=2.1.1-1 # create the tool input structure @@ -50,16 +50,16 @@ COPY ./CITATION.cf[f] /src/CITATION.cff # download a precompiled binary of duckdb # first line checks the architecture, and replaces x86_64 with amd64, which is what duckdb uses -RUN arch=$(uname -m | sed s/x86_64/amd64/) && \ - mkdir /duck && \ - wget https://github.com/duckdb/duckdb/releases/download/v1.0.0/duckdb_cli-linux-${arch}.zip && \ - unzip duckdb_cli-linux-${arch}.zip && \ - rm duckdb_cli-linux-${arch}.zip && \ - chmod +x ./duckdb && \ - mv ./duckdb /duck/duckdb +# RUN arch=$(uname -m | sed s/x86_64/amd64/) && \ +# mkdir /duck && \ +# wget https://github.com/duckdb/duckdb/releases/download/v1.0.0/duckdb_cli-linux-${arch}.zip && \ +# unzip duckdb_cli-linux-${arch}.zip && \ +# rm duckdb_cli-linux-${arch}.zip && \ +# chmod +x ./duckdb && \ +# mv ./duckdb /duck/duckdb # pre-install the spatial extension into duckdb as it will be used -RUN /duck/duckdb -c "INSTALL spatial;" +# RUN /duck/duckdb -c "INSTALL spatial;" # go to the source directory of this tool WORKDIR /src From 996a5a282275637f1373a9b29915bd698133df62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Tue, 17 Sep 2024 09:24:16 +0200 Subject: [PATCH 4/4] update docs --- CITATION.cff | 7 +++---- README.md | 34 ++++++---------------------------- src/version.py | 2 +- 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index b8670a5..32303ec 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -9,7 +9,7 @@ type: software authors: - given-names: Mirko family-names: Mälicke - email: mirko.maelicke@KIT.edu + email: mirko.maelicke@kit.edu affiliation: >- Institute for Water and Environment, Hydrology, Karlsruhe Institute for Technology (KIT) @@ -28,7 +28,6 @@ abstract: >- The requested datasources will be made available in the output directory of the tool. Areal datasets will be clipped to the **bounding box** of the reference area and multi-file sources are preselected to fall into the time range specified. - Note that exact extracts (specific time step, specific area) are not yet supported for areal datasets. keywords: - docker - tool-spec @@ -38,5 +37,5 @@ keywords: - catchment - metacatalog license: CC-BY-4.0 -version: '0.9.3' -date-released: '2024-07-31' +version: '0.10.0' +date-released: '2024-09-17' diff --git a/README.md b/README.md index 510ed41..ef1f162 100644 --- a/README.md +++ b/README.md @@ -14,38 +14,18 @@ This tool follows the [Tool Specification](https://vforwater.github.io/tool-spec [MetaCatalog](https://github.com/vforwater/metacatalog) stores metadata about internal and external datasets along with information about the data sources and how to access them. Using this tool, one can request datasets (called *entries* in MetaCatalog) by their **id**. Additionally, an area of interest is supplied as a GeoJSON feature, called **reference area**. -The tool involves three main processing steps, of which only the first one is mandatory. - -1. The database of the connected MetaCatalog instance is queried for the `dataset_ids`. 
The data-files are reuqested for +The database of the connected MetaCatalog instance is queried for the `dataset_ids`. The data-files are reuqested for the temporal extent of `start_date` and `end_date` if given, while the spatial extent is requested for the bounding box of `reference_area`. MetaCatalog entires without either of the scales defined are loaded entierly. Finally, the spatial extent is clipped by the `reference_area` to match exactly. Experimental parameters are not yet exposed, but involve: + - `netcdf_backend`, which can be either `'CDO'` or `'xarray'` (default) can switch the software used for the clip of NetCDF data sources, which are commonly used for spatio-temporal datasets. - - `touches` is a boolean that is `false` by default and configures if areal grid cells are considered part of - `reference_area` if they touch (`touches=true`) or only contain the grid center (`touches=false`). + All processed data-files for each source are then saved to `/out/datasets/`, while multi-file sources are saved to child repositories. The file (or folder) names are built like: `_`. -2. The second step is only performed if the parameter `integration` is **not** set to `none`. -All available data sources are converted to long-format, where each atomic data value is indexed by the value of the -axes, that form the spatial and temporal scales (if given). These files are loaded into a DuckDB, that is exported as -`/out/dataset.db` along with all metadata from MetaCatalog as JSON, and a number of database MACROs for aggregations -along the scale axes. -For each data integration defined as `integration` (one of `['temporal', 'spatial', 'spatiotemporal']`), the MACRO is -executed and the result is saved to `/out/results/___aggs.parquet` containing -aggregations to all statistical moments, quartiles, the sum, Shannon Entropy and a histogram. -The means are further joined into a common `/out/results/mean__aggs.parquet` as the main result -outputs. The aggregation is configured via `precision` (temporal) and `resolution` (spatial). The final database -can still be used to execute other aggregations, outside of the context of this tool. - -3. The last step can only be run, if the second step was performed successfully. As of now, two finishing report-like -documents are created. First [YData Profiling](https://docs.profiling.ydata.ai/latest/) is run on the -`/out/results/mean_temporal_aggs.parquet` to create a time-series exploratory data analysis (EDA) report. It is -availabe in HTML and JSON format. -The second document is a `/out/README.md`, which is created at runtime from the data in the database. Thus, the data -tables are listed accordingly and license information is extracted and presented as available in the MetaCatalog instance. ### Parameters @@ -55,11 +35,7 @@ tables are listed accordingly and license information is extracted and presented | reference_area | A valid GeoJSON POLYGON Feature. Areal datasets will be clipped to this area. | | start_date | The start date of the dataset, if a time dimension applies to the dataset. | | end_date | The end date of the dataset, if a time dimension applies to the dataset. | -| integration | The mode of operation for integrating all data files associated with each data source into a common DuckDB-based dataset. | -| keep_data_files | If set to `false`, the data files clipped to the spatial and temporal scale will not be kept. | -| precision | The precision for aggregations along the temporal scale of the datasets. 
| -| resolution | The resolution of the output data. This parameter is only relevant for areal datasets. | - +| cell_touches | Specifies if an areal cell is part of the reference area if it only touches the geometry. | ## Development and local run @@ -125,11 +101,13 @@ Each container needs at least the following structure: |- src/ | |- tool.yml | |- run.py +| |- CITATION.cff ``` * `inputs.json` are parameters. Whichever framework runs the container, this is how parameters are passed. * `tool.yml` is the tool specification. It contains metadata about the scope of the tool, the number of endpoints (functions) and their parameters * `run.py` is the tool itself, or a Python script that handles the execution. It has to capture all outputs and either `print` them to console or create files in `/out` +* `CITATION.cff` Citation file providing bibliographic information on how to cite this tool. *Does `run.py` take runtime args?*: diff --git a/src/version.py b/src/version.py index 7ff9620..a9b029e 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "0.12.0" \ No newline at end of file +__version__ = "0.10.0" \ No newline at end of file
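
Note on the `cell_touches` parameter documented above: the patched loader passes it to `_rio_clip_raster(..., touched=params.cell_touches)`, but that helper itself is not part of this patch series. The sketch below is a minimal, hypothetical illustration of how such a flag commonly maps onto rasterio's `all_touched` masking option; it is not the tool's actual implementation, and the function name `clip_raster`, the polygon coordinates, and the file names used here are assumptions for illustration only.

import rasterio
from rasterio.mask import mask

# Hypothetical reference area as a GeoJSON-like polygon (coordinates are made up
# and must be in the raster's CRS for a real clip).
reference_area = {
    "type": "Polygon",
    "coordinates": [[[8.0, 48.0], [8.5, 48.0], [8.5, 48.5], [8.0, 48.5], [8.0, 48.0]]],
}

def clip_raster(src_path: str, out_path: str, cell_touches: bool = True) -> str:
    """Clip a raster to the reference area; `cell_touches` is forwarded as rasterio's `all_touched`."""
    with rasterio.open(src_path) as src:
        # all_touched=True keeps every cell the polygon touches,
        # all_touched=False keeps only cells whose center lies inside the polygon.
        data, transform = mask(src, [reference_area], crop=True, all_touched=cell_touches)
        meta = src.meta.copy()
        meta.update(height=data.shape[1], width=data.shape[2], transform=transform)
    with rasterio.open(out_path, "w", **meta) as dst:
        dst.write(data)
    return out_path

# Example call with hypothetical file names:
# clip_raster("input.tif", "clipped_part_1.tif", cell_touches=True)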