From a162200d6bde75d9120f1672c6a84eea4ae6eabb Mon Sep 17 00:00:00 2001 From: Christian Munz Date: Wed, 30 Oct 2024 16:33:27 +0100 Subject: [PATCH] A setup architecture design Co-authored-by: Leon Moll Signed-off-by: Christian Munz --- .../pipelines/data_wranglers/__init__.py | 14 ++++++++++++ .../pipelines/data_wranglers/interfaces.py | 22 +++++++++++++++++++ .../data_wranglers/spark/__init__.py | 13 +++++++++++ .../spark/data_quality/__init__.py | 13 +++++++++++ .../spark/data_quality/duplicate_detection.py | 4 ++-- .../pipelines/monitoring/__init__.py | 1 - .../pipelines/monitoring/interfaces.py | 4 +++- .../data_quality/test_duplicate_detection.py | 2 +- 8 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py create mode 100644 src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py create mode 100644 src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py create mode 100644 src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py rename src/sdk/python/rtdip_sdk/pipelines/{monitoring => data_wranglers}/spark/data_quality/duplicate_detection.py (92%) diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py new file mode 100644 index 000000000..03ce505c0 --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .spark.data_quality.duplicate_detection import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py new file mode 100644 index 000000000..4a40447ee --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py @@ -0,0 +1,22 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +from ..interfaces import PipelineComponentBaseInterface + + +class WranglerBaseInterface(PipelineComponentBaseInterface): + @abstractmethod + def filter(self): + pass \ No newline at end of file diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py new file mode 100644 index 000000000..5305a429e --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py new file mode 100644 index 000000000..5305a429e --- /dev/null +++ b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/duplicate_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py similarity index 92% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/duplicate_detection.py rename to src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py index 96b58e7e4..a7865c409 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/duplicate_detection.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py @@ -14,8 +14,8 @@ from pyspark.sql.functions import desc from pyspark.sql import DataFrame as PySparkDataFrame -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from rtdip_sdk.pipelines.monitoring.interfaces import MonitoringBaseInterface +from rtdip_sdk.pipelines._pipeline_utils.models import Libraries, SystemType class DuplicateDetection(MonitoringBaseInterface): diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py index 3c244d85e..17e525274 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py @@ -12,4 +12,3 @@ # See the License for the specific language governing permissions and # limitations under the License. from .spark.data_quality.great_expectations_data_quality import * -from .spark.data_quality.duplicate_detection import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py index 2c446c5bc..d609795e8 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py +++ b/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py @@ -17,4 +17,6 @@ class MonitoringBaseInterface(PipelineComponentBaseInterface): - pass + @abstractmethod + def check(self): + pass \ No newline at end of file diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_duplicate_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_duplicate_detection.py index 1c6307550..d4b78175d 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_duplicate_detection.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_duplicate_detection.py @@ -16,7 +16,7 @@ from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.duplicate_detection import DuplicateDetection +from rtdip_sdk.pipelines.data_wranglers.spark.data_quality.duplicate_detection import DuplicateDetection @pytest.fixture(scope="session")