From 771b5b1e58d2d7ca207d453b5a32274ac296301f Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Sun, 10 Nov 2024 19:23:46 +0100 Subject: [PATCH] feat: add basic validation --- README.md | 123 ++++++++++++++++------ scystream/sdk/core.py | 33 ++++-- scystream/sdk/env/settings.py | 30 ++++++ setup.py | 3 +- tests/test_config_files/valid_config.yaml | 2 +- tests/test_core.py | 2 +- tests/test_settings.py | 50 +++++++++ 7 files changed, 202 insertions(+), 41 deletions(-) create mode 100644 scystream/sdk/env/settings.py create mode 100644 tests/test_settings.py diff --git a/README.md b/README.md index 50052fa..a5a87dc 100644 --- a/README.md +++ b/README.md @@ -8,40 +8,27 @@ You can install the package via pip once it's published: pip install scystream-sdk ``` -## Usage - -```python3 -from scystream.sdk.core import entrypoint -from scystream.sdk.scheduler import Scheduler - - -@entrypoint -def example_task(): - print("Executing example_task...") +### Compute Blocks and their configs +One of the central concepts of scystream is the so-called **Compute Block**. +A Compute Block describes an independent program that acts as some kind of worker +which will be scheduled using the scystream-core application. +This worker executes a task (e.g. an NLP task, a crawling task). -@entrypoint -def another_task(task_name): - print(f"Executing another_task with task name: {task_name}") +Each worker can have multiple entrypoints, each aiming to solve one task. +These entrypoints can be configured from the outside using the **Settings**. +These are basically ENV-Variables, which will be parsed & validated using pydantic. +This SDK aims to implement helper functions and other requirements we expect each +Compute Block to have. -def main(): - Scheduler.list_entrypoints() - Scheduler.execute_function("example_task") - Scheduler.execute_function("another_task", "ScheduledTask") +To understand the concept of such a Compute Block even more, take a look at the +config below. 
- -if __name__ == "__main__": - main() - -``` - -### Compute Block Config Files We expect every repository which will be used within the scystream application -to contain a `Compute Block Config File`, the `cbc.yaml`, within the root directory. - -This yaml-file describes the compute block itself. -It shows the entrypoints, their inputs and outputs. +to contain a **Compute Block Config File**, the `cbc.yaml`, within the root directory. +This `cbc.yaml` will be used to define the entrypoints, the inputs & outputs each +Compute Block offers, necessary for the scystream-frontend to understand. This is an example `cbc.yaml`: @@ -85,7 +72,7 @@ entrypoints: description: "Analyze the runtimes" inputs: run_durations: - description: "Teble that contains all runtimes and dates" + description: "Table that contains all runtimes and dates" type: "db_table" config: RUN_DURATIONS_TABLE_NAME: "run_durations_nlp" @@ -97,7 +84,10 @@ entrypoints: CSV_OUTPUT_PATH: "outputs/statistics.csv" ``` -To read and validate such a config file u can proceed as follows: +For now, you have to write this config file on your own. However, at some +point you will be able to generate this config from your code. 
+ +To read and validate such a config file you can proceed as follows: ```python3 from scystream.sdk.config.config_loader import load_config @@ -121,15 +111,86 @@ load_config(config_file_name="test.yaml", config_path="configs/") the `config_path` is the path relative to your root directory +## Basic Usage of the SDK + +```python3 +from scystream.sdk.core import entrypoint +from scystream.sdk.scheduler import Scheduler + + +@entrypoint() +def example_task(): + print("Executing example_task...") + + +@entrypoint() +def another_task(task_name): + print(f"Executing another_task with task name: {task_name}") + + +def main(): + Scheduler.list_entrypoints() + Scheduler.execute_function("example_task") + Scheduler.execute_function("another_task", "ScheduledTask") + + +if __name__ == "__main__": + main() + +``` + +## Defining Settings and Using them. + +Earlier, we already wrote about **Settings**. +Each Input & Output can be configured using these settings. +There are also Global Settings, referred to as `envs` in the `cbc.yaml`. + +Below you can find a simple example of how we define & validate these settings. +To do so, you should use the `BaseENVSettings` class. + +```python3 +from scystream.sdk.core import entrypoint +from scystream.sdk.env.settings import BaseENVSettings + +class GlobalSettings(BaseENVSettings): + LANGUAGE: str = "de" + +class TopicModellingEntrypointSettings(BaseENVSettings): + TXT_SRC_PATH: str # if no default provided, setting this ENV manually is a MUST + +@entrypoint(TopicModellingEntrypointSettings) # Pass it to the Entrypoint +def topic_modelling(settings): + print(f"Running topic modelling, using file: {settings.TXT_SRC_PATH}") + +@entrypoint() +def test_entrypoint(): + print("This entrypoint does not have any configs.") +``` + +We recommend defining your `GlobalSettings` in an extra file and "exporting" the loaded +Settings to make them accessible to other files. 
+See an example below: + +```python3 +from scystream.sdk.env.settings import BaseENVSettings + +class GlobalSettings(BaseENVSettings): + LANGUAGE: str = "de" + +GLOBAL_SETTINGS = GlobalSettings.load_settings() +``` + +You can then use the loaded `GLOBAL_SETTINGS` in your other files, by importing them. ## Development of the SDK ### Installation -1. Create a venv +1. Create a venv and use it ```bash python3 -m venv .venv +source .venv/bin/activate ``` 2. Install the package within the venv diff --git a/scystream/sdk/core.py b/scystream/sdk/core.py index 3965d1c..07ee5b1 100644 --- a/scystream/sdk/core.py +++ b/scystream/sdk/core.py @@ -1,15 +1,34 @@ import functools +from typing import Callable, Type, Optional +from .env.settings import BaseENVSettings +from pydantic import ValidationError + _registered_functions = {} -def entrypoint(func): - """Decorator to mark a function as an entrypoint.""" - @functools.wraps(func) - def wrapper(*args, **kwargs): - return func(*args, **kwargs) - _registered_functions[func.__name__] = func - return wrapper +def entrypoint(settings_class: Optional[Type[BaseENVSettings]] = None): + """ + Decorator to mark a function as an entrypoint. + It also loads and injects the settings of the entrypoint. 
+ """ + def decorator(func: Callable): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if settings_class is not None: + # Load settings + try: + settings = settings_class.load_settings() + except ValidationError as e: + raise ValueError(f"Invalid environment configuration: {e}") + + return func(settings, *args, **kwargs) + else: + return func(*args, **kwargs) + + _registered_functions[func.__name__] = wrapper + return wrapper + return decorator def get_registered_functions(): diff --git a/scystream/sdk/env/settings.py b/scystream/sdk/env/settings.py new file mode 100644 index 0000000..9eab887 --- /dev/null +++ b/scystream/sdk/env/settings.py @@ -0,0 +1,30 @@ +from pydantic_settings import BaseSettings, SettingsConfigDict +from typing import Type + +ENV_FILE_ENCODING = "utf-8" + + +class BaseENVSettings(BaseSettings): + """ + This class acts as the BaseClass which can be used to define custom + ENV-Variables which can be used across the ComputeBlock & for entrypoints + This definition, and pydantic, will then take care of validating the envs + """ + + model_config = SettingsConfigDict( + env_file_encoding=ENV_FILE_ENCODING, + case_sensitive=True, + extra="ignore" + ) + + @classmethod + def load_settings( + cls: Type["BaseENVSettings"], + env_file: str = ".env" + ) -> "BaseENVSettings": + """ + load_settings loads the env file. The name of the env_file can be + passed as an argument. 
+ Returns the parsed ENVs + """ + return cls(_env_file=env_file, _env_file_encoding=ENV_FILE_ENCODING) diff --git a/setup.py b/setup.py index 1a176d3..411a077 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,8 @@ packages=find_packages(), install_requires=[ "pydantic>=2.9.2", - "PyYAML>=6.0.2" + "PyYAML>=6.0.2", + "pydantic-settings>=2.6.1" ], classifiers=[ "Programming Language :: Python :: 3", diff --git a/tests/test_config_files/valid_config.yaml b/tests/test_config_files/valid_config.yaml index ed443a1..73d0c3c 100644 --- a/tests/test_config_files/valid_config.yaml +++ b/tests/test_config_files/valid_config.yaml @@ -39,7 +39,7 @@ entrypoints: description: "Analyze the runtimes" inputs: run_durations: - description: "Teble that contains all runtimes and dates" + description: "Table that contains all runtimes and dates" type: "db_table" config: RUN_DURATIONS_TABLE_NAME: "run_durations_nlp" diff --git a/tests/test_core.py b/tests/test_core.py index 775ae75..f9a19ec 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -4,7 +4,7 @@ class TestEntrypoint(unittest.TestCase): def test_entrypoint_registration(self): - @entrypoint + @entrypoint() def dummy_function(): return "Hello" diff --git a/tests/test_settings.py b/tests/test_settings.py new file mode 100644 index 0000000..755f6d9 --- /dev/null +++ b/tests/test_settings.py @@ -0,0 +1,50 @@ +import unittest +import os +from scystream.sdk.core import entrypoint +from scystream.sdk.env.settings import BaseENVSettings + + +class WithDefaultSettings(BaseENVSettings): + DUMMY_SETTING: str = "this is a dummy setting" + + +class NoDefaultSetting(BaseENVSettings): + DUMMY_SETTING: str + + +class TestSettings(unittest.TestCase): + def test_entrypoint_with_setting_default(self): + @entrypoint(WithDefaultSettings) + def with_default_settings(settings): + return settings.DUMMY_SETTING + + result = with_default_settings() + self.assertEqual(result, "this is a dummy setting") + + """ + environment is set + """ + 
os.environ["DUMMY_SETTING"] = "overridden setting" + result = with_default_settings() + self.assertEqual(result, "overridden setting") + del os.environ["DUMMY_SETTING"] + + def test_entrypoint_with_no_setting_default(self): + @entrypoint(NoDefaultSetting) + def with_no_default_settings(settings): + return settings.DUMMY_SETTING + + with self.assertRaises(ValueError): + with_no_default_settings() + + """ + environemnt is set + """ + os.environ["DUMMY_SETTING"] = "required setting" + result = with_no_default_settings() + self.assertEqual(result, "required setting") + del os.environ["DUMMY_SETTING"] + + +if __name__ == "__main__": + unittest.main()