From 5873bf74cb0360a429cf7f698c5f9e5d4f73dde1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 1 Aug 2024 12:56:27 +0200 Subject: [PATCH 1/4] rename --- src/{ingestor.py => duck_ingestor.py} | 0 src/loader.py | 48 +++++++++++++++++++-------- src/param.py | 6 ++++ src/run.py | 18 +++++++--- 4 files changed, 53 insertions(+), 19 deletions(-) rename src/{ingestor.py => duck_ingestor.py} (100%) diff --git a/src/ingestor.py b/src/duck_ingestor.py similarity index 100% rename from src/ingestor.py rename to src/duck_ingestor.py diff --git a/src/loader.py b/src/loader.py index f03ab5b..eafb06f 100644 --- a/src/loader.py +++ b/src/loader.py @@ -12,7 +12,7 @@ import rasterio as rio from json2args.logger import logger -from writer import dispatch_save_file, entry_metadata_saver +from writer import dispatch_save_file, entry_metadata_saver, xarray_to_netcdf_saver from param import load_params, Params from utils import whitebox_log_handler @@ -112,8 +112,10 @@ def load_netcdf_file(entry: Entry, executor: Executor) -> str: # get a path for the current dataset path dataset_base_path = params.dataset_path / f"{entry.variable.name.replace(' ', '_')}_{entry.id}" + # create a counter for the saved parts + part = 0 # preprocess each netcdf / grib / zarr file - for i, fname in enumerate(fnames): + for fname in fnames: # read the min and max time and check if we can skip ds = xr.open_dataset(fname, decode_coords='all', mask_and_scale=True) @@ -157,13 +159,19 @@ def load_netcdf_file(entry: Entry, executor: Executor) -> str: # and supress the creation of metadata files dataset_base_path.mkdir(parents=True, exist_ok=True) - # get the filenmae - filename = f"{entry.variable.name.replace(' ', '_')}_{entry.id}" - target_name = f"{filename}_part_{i + 1}.nc" + # we will actually save, so increate the part counter + part += 1 - dispatch_save_file(entry=entry, data=data, executor=executor, base_path=str(dataset_base_path), target_name=target_name, save_meta=False) + # get the filename + filename = f"{entry.variable.name.replace(' ', '_')}_{entry.id}" + target_name = f"{filename}_part_{part}.nc" + + # use the dispatch_save_file function to save the data + # dispatch_save_file(entry=entry, data=data, executor=executor, base_path=str(dataset_base_path), target_name=target_name, save_meta=False) + xarray_to_netcdf_saver(data=data, target_name=str(dataset_base_path / target_name)) + # if there are many files, we save the metadata only once - if i == 0: + if part == 1: metafile_name = str(params.dataset_path / f"{filename}.metadata.json") entry_metadata_saver(entry, metafile_name) logger.info(f"Saved metadata for dataset to {metafile_name}.") @@ -301,18 +309,30 @@ def error_handler(future): logger.error(f"ERRORED: clipping dataset : {str(exc)}") # collect all futures - futures = [] + # futures = [] + + part = 1 + # go for each file - for i, fname in enumerate(fnames): + for fname in fnames: # derive an out-name - out_name = None if len(fnames) == 1 else f"{Path(fname).stem}_part_{i + 1}.tif" + if len(fnames) == 1: + out_name = f"{entry.variable.name.replace(' ', '_')}_{entry.id}.tif" + else: + out_name = f"{entry.variable.name.replace(' ', '_')}_{entry.id}_part_{part}.tif" + # submit each save task to the executor - future = executor.submit(_rio_clip_raster, fname, reference_area, dataset_base_path, out_name=out_name, touched=params.cell_touches) - future.add_done_callback(error_handler) - futures.append(future) + #future = executor.submit(_rio_clip_raster, fname, reference_area, dataset_base_path, 
out_name=out_name, touched=params.cell_touches) + #future.add_done_callback(error_handler) + #futures.append(future) + + # call procedurally + out_path = _rio_clip_raster(fname, reference_area, base_path=dataset_base_path, out_name=out_name, touched=params.cell_touches) + if out_path is not None: + part += 1 # wait until all are finished - tiles = [future.result() for future in futures if future.result() is not None] + #tiles = [future.result() for future in futures if future.result() is not None] # run the merge function and delete the other files # if len(tiles) > 1: diff --git a/src/param.py b/src/param.py index 4f8acb2..d2fd855 100644 --- a/src/param.py +++ b/src/param.py @@ -31,6 +31,11 @@ class NetCDFBackends(str, Enum): PARQUET = 'parquet' +class IngestorBackend(str, Enum): + DUCKDB = 'duckdb' + XARRAY = 'xarray' + + class Params(BaseModel): # mandatory inputs are the dataset ids and the reference area dataset_ids: List[int] @@ -54,6 +59,7 @@ class Params(BaseModel): # stuff that we do not change in the tool base_path: str = '/out' netcdf_backend: NetCDFBackends = NetCDFBackends.XARRAY + ingestor_backend: IngestorBackend = IngestorBackend.DUCKDB # duckdb settings use_spatial: bool = False diff --git a/src/run.py b/src/run.py index dcf6fc5..eb5ac52 100644 --- a/src/run.py +++ b/src/run.py @@ -11,10 +11,10 @@ from metacatalog import api from tqdm import tqdm -from param import load_params, Integrations +from param import load_params, Integrations, IngestorBackend from loader import load_entry_data from json2args.logger import logger -import ingestor +import duck_ingestor import aggregator import reporter from clip import reference_area_to_file @@ -102,6 +102,8 @@ file_mapping = [] with PoolExecutor() as executor: logger.debug(f"START {type(executor).__name__} - Pool to load and clip data source files.") + logger.info(f"A total of {len(params.dataset_ids)} are requested. Start loading data sources.") + for dataset_id in tqdm(params.dataset_ids): try: entry = api.find_entry(session, id=dataset_id, return_iterator=True).one() @@ -131,11 +133,17 @@ # check if we have any files to process elif len(file_mapping) > 0: - logger.info(f"Starting to create a consistent DuckDB dataset at {params.database_path}. Check out https://duckdb.org/docs/api/overview to learn more about DuckDB.") - # start a timer t1 = time.time() - path = ingestor.load_files(file_mapping=file_mapping) + + # check which backend to use + if params.ingestor_backend == IngestorBackend.DUCKDB: + logger.info(f"Starting to create a consistent DuckDB dataset at {params.database_path}. 
Check out https://duckdb.org/docs/api/overview to learn more about DuckDB.") + path = duck_ingestor.load_files(file_mapping=file_mapping) + else: + logger.info(f"Starting to create a consistent DataCube dataset at {params.database_path}.") + raise NotImplementedError + t2 = time.time() logger.info(f"Finished creating the dataset at {path} in {t2-t1:.2f} seconds.") else: From 7c83fee3275ecd7b1b1a8ab54cfcde1ff299c00d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Tue, 17 Sep 2024 09:06:55 +0200 Subject: [PATCH 2/4] remove unnecessary parameter --- src/param.py | 39 ++-------------------- src/run.py | 93 ++++------------------------------------------------ src/tool.yml | 61 ++++------------------------------ 3 files changed, 15 insertions(+), 178 deletions(-) diff --git a/src/param.py b/src/param.py index d2fd855..f9d7ff7 100644 --- a/src/param.py +++ b/src/param.py @@ -15,15 +15,6 @@ import geopandas as gpd -# create the Enum for integration type -class Integrations(str, Enum): - TEMPORAL = 'temporal' - SPATIAL = 'spatial' - SPATIO_TEMPORAL = 'spatiotemporal' - ALL = 'all' - NONE = 'none' - FULL = 'full' - class NetCDFBackends(str, Enum): XARRAY = 'xarray' @@ -31,11 +22,6 @@ class NetCDFBackends(str, Enum): PARQUET = 'parquet' -class IngestorBackend(str, Enum): - DUCKDB = 'duckdb' - XARRAY = 'xarray' - - class Params(BaseModel): # mandatory inputs are the dataset ids and the reference area dataset_ids: List[int] @@ -44,32 +30,17 @@ class Params(BaseModel): # optional parameters to configure the processing start_date: datetime = None end_date: datetime = None - integration: Integrations = Integrations.ALL - apply_aggregation: bool = False - - # optional parameter to configure output - keep_data_files: bool = True - database_name: str = 'dataset.duckdb' - - # optional parameter to provide result output - precision: str = 'day' - resolution: int = 5000 cell_touches: bool = True # stuff that we do not change in the tool base_path: str = '/out' + dataset_folder_name: str = 'datasets' netcdf_backend: NetCDFBackends = NetCDFBackends.XARRAY - ingestor_backend: IngestorBackend = IngestorBackend.DUCKDB - - # duckdb settings - use_spatial: bool = False @property def dataset_path(self) -> Path: - if self.keep_data_files: - p = Path(self.base_path) / 'datasets' - else: - p = Path(tempfile.mkdtemp()) + # set the databsets path + p = Path(self.base_path) / self.dataset_folder_name # make the directory if it does not exist p.mkdir(parents=True, exist_ok=True) @@ -85,10 +56,6 @@ def result_path(self) -> Path: return p - @property - def database_path(self) -> Path: - return Path(self.base_path) / self.database_name - @property def reference_area_df(self) -> gpd.GeoDataFrame: return gpd.GeoDataFrame.from_features([self.reference_area]) diff --git a/src/run.py b/src/run.py index eb5ac52..0539fba 100644 --- a/src/run.py +++ b/src/run.py @@ -11,12 +11,9 @@ from metacatalog import api from tqdm import tqdm -from param import load_params, Integrations, IngestorBackend +from param import load_params from loader import load_entry_data from json2args.logger import logger -import duck_ingestor -import aggregator -import reporter from clip import reference_area_to_file from version import __version__ @@ -60,8 +57,7 @@ START DATE: {params.start_date} END DATE: {params.end_date} REFERENCE AREA: {params.reference_area is not None} -INTEGRATION: {params.integration} -KEEP DATA FILES: {params.keep_data_files} +CELL TOUCHES: {params.cell_touches} DATASET IDS: {', '.join(map(str, params.dataset_ids))} @@ 
-69,25 +65,16 @@ DATABASE CONNECTION: {connection is not None} DATABASE URI: {session.bind} -AGGREGATION SETTINGS --------------------- -PRECISION: {params.precision} -RESOLUTION: {params.resolution}x{params.resolution} -TARGET CRS: EPSG:3857 - Processing logs: ---------------- """ with open('/out/processing.log', 'w') as f: f.write(MSG) -# if the integration is set to NONE and the user does not want to keep the data files, there will be no output -if params.integration == Integrations.NONE and not params.keep_data_files: - logger.critical("You have set the integration to NONE and do not want to keep the data files. This will result in no output.") - sys.exit(1) - -# --------------------------------------------------------------------------- # # Here is the actual tool +# --------------------------------------------------------------------------- # +# mark the start of the tool +logger.info("##TOOL START - Vforwater Loader") tool_start = time.time() # debug the params before we do anything with them @@ -126,78 +113,10 @@ executor.shutdown(wait=True) logger.info(f"STOP {type(executor).__name__} - Pool finished all tasks and shutdown.") -# here to the stuff for creating a consistent dataset -# check if the user disabled integration -if params.integration == Integrations.NONE: - logger.info("Integration is disabled. No further processing will be done.") - -# check if we have any files to process -elif len(file_mapping) > 0: - # start a timer - t1 = time.time() - - # check which backend to use - if params.ingestor_backend == IngestorBackend.DUCKDB: - logger.info(f"Starting to create a consistent DuckDB dataset at {params.database_path}. Check out https://duckdb.org/docs/api/overview to learn more about DuckDB.") - path = duck_ingestor.load_files(file_mapping=file_mapping) - else: - logger.info(f"Starting to create a consistent DataCube dataset at {params.database_path}.") - raise NotImplementedError - - t2 = time.time() - logger.info(f"Finished creating the dataset at {path} in {t2-t1:.2f} seconds.") -else: - logger.warning("It seems like no data files have been processed. 
This might be an error.") - -# switch the type of integrations -if params.integration != Integrations.NONE and params.apply_aggregation: - with PoolExecutor() as executor: - logger.debug(f"START {type(executor).__name__} - Pool to ingest data files into a Dataset DuckDB database.") - - if params.integration == Integrations.TEMPORAL or params.integration == Integrations.ALL: - # run the temporal aggregation - aggregator.aggregate_scale(aggregation_scale='temporal', executor=executor) - - if params.integration == Integrations.SPATIAL or params.integration == Integrations.ALL and False: - # run the spatial aggregation - aggregator.aggregate_scale(aggregation_scale='spatial', executor=executor) - - if params.integration == Integrations.SPATIO_TEMPORAL or params.integration == Integrations.ALL and False: - # run the spatio-temporal aggregation - aggregator.aggregate_scale(aggregation_scale='spatiotemporal', executor=executor) - - # wait until all results are finished - executor.shutdown(wait=True) - logger.debug(f"STOP {type(executor).__name__} - Pool finished all tasks and shutdown.") - - - # finally run a thrid pool to generate reports - with PoolExecutor() as executor: - logger.debug(f"START {type(executor).__name__} - Pool to generate final reports.") - - # create a callback to log exceptions - def callback(future): - exc = future.exception() - if exc is not None: - logger.exception(exc) - - # generate the profile report - start first as this one might potentially take longer - # TODO: there should be an option to disable this - executor.submit(reporter.generate_profile_report).add_done_callback(callback) - - # generate the readme - executor.submit(reporter.generate_readme).add_done_callback(callback) - - - # wait until all results are finished - executor.shutdown(wait=True) - logger.debug(f"STOP {type(executor).__name__} - Pool finished all tasks and shutdown.") -# --------------------------------------------------------------------------- # - - # we're finished. t2 = time.time() logger.info(f"Total runtime: {t2 - tool_start:.2f} seconds.") +logger.info("##TOOL FINISH - Vforwater Loader") # print out the report with open('/out/processing.log', 'r') as f: diff --git a/src/tool.yml b/src/tool.yml index 65fa937..63e1a16 100644 --- a/src/tool.yml +++ b/src/tool.yml @@ -27,60 +27,11 @@ tools: type: datetime description: | The end date of the datasetm, if a time dimension applies to the dataset. - integration: - type: enum - values: - - none - - all - - spatial - - temporal - description: | - The mode of operation for the integration of each all data files associated to each data source - into a common DuckDB-based dataset. This dataset includes data for a unified spatial and temporal - extent and includes macros for aggregation. By setting `integration` the default integrations are - selected. The resulting database can still be used to query different aggregation levels. - - `none`: No integration will be performed and the DuckDB database will **NOT** be created. - - `all`: Temporal, spatial and spatio-temporal scale aggregations will be integrated, if the scale is defined in the dataset metadata. - - `spatial`: Only results for spatial aggregations will be provided. - - `temporal`: Only results for temporal aggregations will be provided. - - `spatiotemporal`: Only results for spatio-temporal aggregations will be provided. 
- optional: true - keep_data_files: - type: boolean - optional: true - description: | - If set to `false`, the data files clipped to the spatial and temporal scale as defined in the - data-source metadata will not be kept. This saves a lot of disk space for the output. - If set to `true` (default behavior), then there will be a `/out/datasets` directory in the output. - precision: - type: enum - values: - - minute - - hour - - day - - month - - year - - decade - optional: true - description: | - The precision for aggregations along the temporal scale of the datasets. This parameter does only take effect if - the integration includes a data integration along the temporal scale. That includes: `temporal`, `spatiotemporal` and `all`. - If integration is set set and no precision is supplied, the tool chooses a suitable precision. This decision is not - yet deterministic. - resolution: - type: integer - optional: true - description: | - The resolution of the output data. This parameter is only relevant for areal datasets. If the dataset is not areal, this parameter - is ignored. If the dataset is areal and the parameter is not set, the original resolution of the dataset is used. - If the dataset is areal and the parameter is set, the dataset will be resampled to the given resolution. - Note: Right now, the aggregations can only write parquet files. For larger (espeically spatio-temporal) datasets, these - aggreations can be large. A future version will write netCDF or Zarr for these cases. - apply_aggregation: + cell_touches: type: boolean - optional: true description: | - If set to `true`, the tool will apply the aggregation as defined in the metadata (precision and resoplution). - If set to `false` (default), the tool will not apply any aggregation while loading. In these casesm the integrated dataset will be available, - which is a duckdb instance that holds macros for aggregation. Thus, you can adjust the precision and resolution later on. - The main downside for duckdb instances is that they take up more disk space, than aggregated datasets. + If set to true, the tool will only return datasets that have a spatial overlap with the reference area. + If set to false, the tool will return datasets that have a spatial overlap or touch the reference area. + If omitted, the default is true. + Note: This parameter only applies to datasets with a defined spatial scale extent. + optional: true \ No newline at end of file From eb1f1362e7d622a464c3e1bb552abea6551d3340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Tue, 17 Sep 2024 09:07:06 +0200 Subject: [PATCH 3/4] update Dockerfile --- Dockerfile | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6528f7e..36b0b72 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,18 +23,18 @@ RUN pip install \ xarray[complete]==2024.7.0 \ rioxarray==0.17.0 \ pyarrow==17.0.0 \ - ydata-profiling==4.9.0 \ + #ydata-profiling==4.9.0 \ # linux AArch64 extensions are not available for 0.9.2 -> 0.10.0 is released early Feb. 
2024 - "duckdb>=1.0.0" \ + #"duckdb>=1.0.0" \ polars-lts-cpu==1.1.0 \ geocube==0.6.0 # install the needed version for metacatalog -RUN pip install metacatalog==0.9.1 +RUN pip install metacatalog==0.9.2 # Install CDO, might be used to do seltimestep or sellonlatbox and possibly merge -RUN apt-get install -y gettext=0.21-12 \ - gnuplot=5.4.4+dfsg1-2 +#RUN apt-get install -y gettext=0.21-12 \ + #gnuplot=5.4.4+dfsg1-2 # cdo=2.1.1-1 # create the tool input structure @@ -50,16 +50,16 @@ COPY ./CITATION.cf[f] /src/CITATION.cff # download a precompiled binary of duckdb # first line checks the architecture, and replaces x86_64 with amd64, which is what duckdb uses -RUN arch=$(uname -m | sed s/x86_64/amd64/) && \ - mkdir /duck && \ - wget https://github.com/duckdb/duckdb/releases/download/v1.0.0/duckdb_cli-linux-${arch}.zip && \ - unzip duckdb_cli-linux-${arch}.zip && \ - rm duckdb_cli-linux-${arch}.zip && \ - chmod +x ./duckdb && \ - mv ./duckdb /duck/duckdb +# RUN arch=$(uname -m | sed s/x86_64/amd64/) && \ +# mkdir /duck && \ +# wget https://github.com/duckdb/duckdb/releases/download/v1.0.0/duckdb_cli-linux-${arch}.zip && \ +# unzip duckdb_cli-linux-${arch}.zip && \ +# rm duckdb_cli-linux-${arch}.zip && \ +# chmod +x ./duckdb && \ +# mv ./duckdb /duck/duckdb # pre-install the spatial extension into duckdb as it will be used -RUN /duck/duckdb -c "INSTALL spatial;" +# RUN /duck/duckdb -c "INSTALL spatial;" # go to the source directory of this tool WORKDIR /src From 996a5a282275637f1373a9b29915bd698133df62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Tue, 17 Sep 2024 09:24:16 +0200 Subject: [PATCH 4/4] update docs --- CITATION.cff | 7 +++---- README.md | 34 ++++++---------------------------- src/version.py | 2 +- 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index b8670a5..32303ec 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -9,7 +9,7 @@ type: software authors: - given-names: Mirko family-names: Mälicke - email: mirko.maelicke@KIT.edu + email: mirko.maelicke@kit.edu affiliation: >- Institute for Water and Environment, Hydrology, Karlsruhe Institute for Technology (KIT) @@ -28,7 +28,6 @@ abstract: >- The requested datasources will be made available in the output directory of the tool. Areal datasets will be clipped to the **bounding box** of the reference area and multi-file sources are preselected to fall into the time range specified. - Note that exact extracts (specific time step, specific area) are not yet supported for areal datasets. keywords: - docker - tool-spec @@ -38,5 +37,5 @@ keywords: - catchment - metacatalog license: CC-BY-4.0 -version: '0.9.3' -date-released: '2024-07-31' +version: '0.10.0' +date-released: '2024-09-17' diff --git a/README.md b/README.md index 510ed41..ef1f162 100644 --- a/README.md +++ b/README.md @@ -14,38 +14,18 @@ This tool follows the [Tool Specification](https://vforwater.github.io/tool-spec [MetaCatalog](https://github.com/vforwater/metacatalog) stores metadata about internal and external datasets along with information about the data sources and how to access them. Using this tool, one can request datasets (called *entries* in MetaCatalog) by their **id**. Additionally, an area of interest is supplied as a GeoJSON feature, called **reference area**. -The tool involves three main processing steps, of which only the first one is mandatory. - -1. The database of the connected MetaCatalog instance is queried for the `dataset_ids`. 
The data-files are reuqested for +The database of the connected MetaCatalog instance is queried for the `dataset_ids`. The data-files are reuqested for the temporal extent of `start_date` and `end_date` if given, while the spatial extent is requested for the bounding box of `reference_area`. MetaCatalog entires without either of the scales defined are loaded entierly. Finally, the spatial extent is clipped by the `reference_area` to match exactly. Experimental parameters are not yet exposed, but involve: + - `netcdf_backend`, which can be either `'CDO'` or `'xarray'` (default) can switch the software used for the clip of NetCDF data sources, which are commonly used for spatio-temporal datasets. - - `touches` is a boolean that is `false` by default and configures if areal grid cells are considered part of - `reference_area` if they touch (`touches=true`) or only contain the grid center (`touches=false`). + All processed data-files for each source are then saved to `/out/datasets/`, while multi-file sources are saved to child repositories. The file (or folder) names are built like: `_`. -2. The second step is only performed if the parameter `integration` is **not** set to `none`. -All available data sources are converted to long-format, where each atomic data value is indexed by the value of the -axes, that form the spatial and temporal scales (if given). These files are loaded into a DuckDB, that is exported as -`/out/dataset.db` along with all metadata from MetaCatalog as JSON, and a number of database MACROs for aggregations -along the scale axes. -For each data integration defined as `integration` (one of `['temporal', 'spatial', 'spatiotemporal']`), the MACRO is -executed and the result is saved to `/out/results/___aggs.parquet` containing -aggregations to all statistical moments, quartiles, the sum, Shannon Entropy and a histogram. -The means are further joined into a common `/out/results/mean__aggs.parquet` as the main result -outputs. The aggregation is configured via `precision` (temporal) and `resolution` (spatial). The final database -can still be used to execute other aggregations, outside of the context of this tool. - -3. The last step can only be run, if the second step was performed successfully. As of now, two finishing report-like -documents are created. First [YData Profiling](https://docs.profiling.ydata.ai/latest/) is run on the -`/out/results/mean_temporal_aggs.parquet` to create a time-series exploratory data analysis (EDA) report. It is -availabe in HTML and JSON format. -The second document is a `/out/README.md`, which is created at runtime from the data in the database. Thus, the data -tables are listed accordingly and license information is extracted and presented as available in the MetaCatalog instance. ### Parameters @@ -55,11 +35,7 @@ tables are listed accordingly and license information is extracted and presented | reference_area | A valid GeoJSON POLYGON Feature. Areal datasets will be clipped to this area. | | start_date | The start date of the dataset, if a time dimension applies to the dataset. | | end_date | The end date of the dataset, if a time dimension applies to the dataset. | -| integration | The mode of operation for integrating all data files associated with each data source into a common DuckDB-based dataset. | -| keep_data_files | If set to `false`, the data files clipped to the spatial and temporal scale will not be kept. | -| precision | The precision for aggregations along the temporal scale of the datasets. 
| -| resolution | The resolution of the output data. This parameter is only relevant for areal datasets. | - +| cell_touches | Specifies if an areal cell is part of the reference area if it only touches the geometry. | ## Development and local run @@ -125,11 +101,13 @@ Each container needs at least the following structure: |- src/ | |- tool.yml | |- run.py +| |- CITATION.cff ``` * `inputs.json` are parameters. Whichever framework runs the container, this is how parameters are passed. * `tool.yml` is the tool specification. It contains metadata about the scope of the tool, the number of endpoints (functions) and their parameters * `run.py` is the tool itself, or a Python script that handles the execution. It has to capture all outputs and either `print` them to console or create files in `/out` +* `CITATION.cff` Citation file providing bibliographic information on how to cite this tool. *Does `run.py` take runtime args?*: diff --git a/src/version.py b/src/version.py index 7ff9620..a9b029e 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "0.12.0" \ No newline at end of file +__version__ = "0.10.0" \ No newline at end of file
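
Note on the `cell_touches` parameter documented above: the patched loader passes it to `_rio_clip_raster(..., touched=params.cell_touches)`, but that helper itself is not part of this patch series. The sketch below is a minimal, hypothetical illustration of how such a flag commonly maps onto rasterio's `all_touched` masking option; it is not the tool's actual implementation, and the function name `clip_raster`, the polygon coordinates, and the file names used here are assumptions for illustration only.

import rasterio
from rasterio.mask import mask

# Hypothetical reference area as a GeoJSON-like polygon (coordinates are made up
# and must be in the raster's CRS for a real clip).
reference_area = {
    "type": "Polygon",
    "coordinates": [[[8.0, 48.0], [8.5, 48.0], [8.5, 48.5], [8.0, 48.5], [8.0, 48.0]]],
}

def clip_raster(src_path: str, out_path: str, cell_touches: bool = True) -> str:
    """Clip a raster to the reference area; `cell_touches` is forwarded as rasterio's `all_touched`."""
    with rasterio.open(src_path) as src:
        # all_touched=True keeps every cell the polygon touches,
        # all_touched=False keeps only cells whose center lies inside the polygon.
        data, transform = mask(src, [reference_area], crop=True, all_touched=cell_touches)
        meta = src.meta.copy()
        meta.update(height=data.shape[1], width=data.shape[2], transform=transform)
    with rasterio.open(out_path, "w", **meta) as dst:
        dst.write(data)
    return out_path

# Example call with hypothetical file names:
# clip_raster("input.tif", "clipped_part_1.tif", cell_touches=True)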