From cf57d8ed557fe082879fdf2ba0a1634769176709 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Fri, 28 Oct 2022 11:08:06 +0200 Subject: [PATCH] first version --- Dockerfile | 7 +-- in/dataframe.csv | 151 ++++++++++++++++++++++++++++++++++++++++++++++ in/foo_csv.csv | 11 ---- in/foo_matrix.mat | 10 --- in/tool.json | 10 +-- src/run.py | 48 +++++++++++++-- src/tool.yml | 36 +++++------ 7 files changed, 213 insertions(+), 60 deletions(-) create mode 100644 in/dataframe.csv delete mode 100644 in/foo_csv.csv delete mode 100644 in/foo_matrix.mat diff --git a/Dockerfile b/Dockerfile index 37e8b1a..3803832 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,12 +2,11 @@ FROM python:3.10 # install the toolbox runner tools -RUN pip install toolbox-runner +RUN pip install toolbox-runner==0.5.0 -# Do anything you need to install tool dependencies here -RUN echo "Replace this line with a tool" - +# install pandas and pandas-profiling +RUN pip install pandas==1.5.1 pandas-profiling==3.4.0 # create the tool input structure RUN mkdir /in COPY ./in /in diff --git a/in/dataframe.csv b/in/dataframe.csv new file mode 100644 index 0000000..a0c23a4 --- /dev/null +++ b/in/dataframe.csv @@ -0,0 +1,151 @@ +sepallength,sepalwidth,petallength,petalwidth,class +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa 
+5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor 
+5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica \ 
No newline at end of file diff --git a/in/foo_csv.csv b/in/foo_csv.csv deleted file mode 100644 index fb32ee6..0000000 --- a/in/foo_csv.csv +++ /dev/null @@ -1,11 +0,0 @@ -A,B,C,D -3,14,10,17 -2,0,2,24 -2,17,19,24 -3,13,1,18 -13,13,7,10 -10,7,13,14 -12,12,9,5 -16,24,3,16 -11,3,2,14 -19,4,5,20 diff --git a/in/foo_matrix.mat b/in/foo_matrix.mat deleted file mode 100644 index 948f60b..0000000 --- a/in/foo_matrix.mat +++ /dev/null @@ -1,10 +0,0 @@ -# Created by Octave 6.4.0, Thu Oct 27 08:57:13 2022 UTC -# name: foo_matrix -# type: matrix -# rows: 3 -# columns: 2 - 1 2 - 3 4 - 5 6.0999999999999996 - - diff --git a/in/tool.json b/in/tool.json index 8b61bbc..c2fd1b5 100644 --- a/in/tool.json +++ b/in/tool.json @@ -1,11 +1,5 @@ { - "foobar": { - "foo_int": 42, - "foo_float": 13.37, - "foo_string": "Never eat yellow snow", - "foo_enum": "bar", - "foo_array": [34, 55, 23, 43, 23], - "foo_matrix": "/in/foo_matrix.mat", - "foo_csv": "/in/foo_csv.csv" + "profile": { + "data": "./dataframe.csv" } } \ No newline at end of file diff --git a/src/run.py b/src/run.py index 9713143..d407b57 100644 --- a/src/run.py +++ b/src/run.py @@ -1,19 +1,57 @@ import os from datetime import datetime as dt +from pandas_profiling import ProfileReport +import numpy as np +import pandas as pd + from toolbox_runner.parameter import parse_parameter # parse parameters kwargs = parse_parameter() # check if a toolname was set in env -toolname = os.environ.get('TOOL_RUN', 'foobar').lower() +toolname = os.environ.get('TOOL_RUN', 'profile').lower() + + +def load_data(df_or_path): + if isinstance(df_or_path, str): + # inspect the path that was actually passed in, not the global kwargs + path = df_or_path + _, ext = os.path.splitext(path) + + # pick a pandas reader based on the file extension + if ext.lower() in ('.xls', '.xlsx', '.odf', '.ods'): + data = pd.read_excel(df_or_path) + elif ext.lower() in ('.asc', '.dat', '.mat', '.txt'): + data = pd.read_table(df_or_path, comment='#') + else: + raise AttributeError('Got a file path, but the extension is not (yet) supported.') + elif 
isinstance(df_or_path, (np.ndarray, pd.Series)): + data = pd.DataFrame(df_or_path) + elif isinstance(df_or_path, pd.DataFrame): + data = df_or_path + else: + raise AttributeError(f"The passed data was of type {type(df_or_path)} which is not supported.") + return data + # switch the tool -if toolname == 'foobar': - # RUN the tool here and create the output in /out - with open('/out/STDOUT.log', 'w') as f: - f.write('This toolbox does not include any tool. Did you run the template?\n') +if toolname == 'profile': + # kwargs data will automatically be loaded as a Dataframe. If it is still a string, try + # to figure out what this is. + df = load_data(kwargs['data']) + del kwargs['data'] + + profile = ProfileReport(df, title="Dataset Report") + + # generate the output + profile.to_file('/out/report.html') + js = profile.to_json() + + with open('/out/report.json', 'w') as f: + f.write(js) + # In any other case, it was not clear which tool to run else: diff --git a/src/tool.yml b/src/tool.yml index dd00d75..8928a99 100644 --- a/src/tool.yml +++ b/src/tool.yml @@ -1,25 +1,17 @@ tools: - foobar: - title: Foo Bar - description: A dummy tool to exemplify the YAML file - version: 0.1 + profile: + title: Dataset Profile + description: | + Create a HTML or JSON profiling report for any kind of tabular data. + This image is Python and uses pandas-profiling: https://pypi.org/project/pandas-profiling/ + version: 0.2 parameters: - foo_int: - type: integer - foo_float: - type: float - foo_string: - type: string - foo_enum: - type: enum - values: - - foo - - bar - - baz - foo_array: - type: integer - array: true - foo_matrix: - type: file - foo_csv: + data: type: file + description: | + A CSV file containing the tabular data. From Python SDK, you can pass any pandas.DataFrame. + It is also possible to pass other file formats, but this is still experimental. + options: + type: dict + description: Experimental. Do not use. +