Pipeline Demo Notebooks #117

Draft · wants to merge 35 commits into base: develop
Commits (35)
67fc1af  Adds Notebook for Normalization (Timm638, Nov 18, 2024)
1a9efa7  Extends notebook by Duplicate Detection (Timm638, Nov 18, 2024)
6143796  Adds K Sigma Filtering as Test (Timm638, Nov 19, 2024)
3178f7a  Finished up plot of notebook (Timm638, Nov 19, 2024)
7a0559f  Merge branch 'develop' into feature/043_rtdip_demo (Timm638, Nov 24, 2024)
61e161a  Extends notebook & fixes bug in Value Imputation (Timm638, Nov 25, 2024)
2d98588  Adds notes to the notebook (Timm638, Nov 25, 2024)
8e2ab84  Applies black to code (Timm638, Nov 25, 2024)
f0d1aa7  Merge branch 'develop' into feature/096_rtdip_demo_pipeline (Timm638, Jan 7, 2025)
97ac7c7  fixes imports by re-adding them to each data quality component (Timm638, Jan 7, 2025)
26be410  fixes imports by re-adding them to each data quality component (Timm638, Jan 7, 2025)
9867d5f  Merge remote-tracking branch 'origin/feature/096_rtdip_demo_pipeline'… (Timm638, Jan 7, 2025)
92b93ce  renamed imports (Timm638, Jan 7, 2025)
0f3e866  Applies black to files (Timm638, Jan 7, 2025)
6499181  fixes imports & writes down pipeline concept (Timm638, Jan 7, 2025)
4941426  updates showcase_notebook to current version of rtdip (Timm638, Jan 14, 2025)
6763442  starts working on pipeline_showcase.ipynb (Timm638, Jan 14, 2025)
cf0c3cf  saves in progress pipeline_showcase changes (Timm638, Jan 20, 2025)
b8eb445  adds Sensor Data Stage (Timm638, Jan 20, 2025)
f0b52f3  fixes copy bug in ARIMA (Timm638, Jan 20, 2025)
332c181  push latest pipeline changes (Timm638, Jan 20, 2025)
e3aafa3  completes draft of notebook (Timm638, Jan 21, 2025)
fbe7155  changed dataset presented to be a more periodic one (Timm638, Jan 21, 2025)
ed0a8bf  cleaned up draft more, each cell except for Linear Regression functions (Timm638, Jan 21, 2025)
542dd96  adjusts plot to be more descriptive (Timm638, Jan 22, 2025)
ec394c9  adds Example Data to git (Timm638, Jan 22, 2025)
697b02e  finishes up texts (Timm638, Jan 22, 2025)
ae07063  updates nbformat (Timm638, Jan 27, 2025)
7bc67ec  Merge branch 'develop' into feature/096_rtdip_demo_pipeline (Timm638, Jan 27, 2025)
f17b8b9  fixed imports (Timm638, Jan 27, 2025)
603df22  removes redundant .gitignore (Timm638, Jan 27, 2025)
44a1995  applies black (Timm638, Jan 27, 2025)
919d603  fixes missing preparation from pandas DF to spark DF (Timm638, Jan 27, 2025)
2e6b496  reapplies black (Timm638, Jan 28, 2025)
fc5f26c  updates imports to new ones (Timm638, Jan 29, 2025)
2 changes: 1 addition & 1 deletion .gitignore
@@ -139,4 +139,4 @@ spark-checkpoints/
 config.share
 
 # JetBrains
-.idea/
+.idea/
35,141 changes: 35,141 additions & 0 deletions notebook/Actual Generation per Production Type_2024-2025.csv

Large diffs are not rendered by default.

Binary file added notebook/ExampleData.pkl
Binary file not shown.
388 changes: 388 additions & 0 deletions notebook/pipeline_showcase.ipynb

Large diffs are not rendered by default.

787 changes: 787 additions & 0 deletions notebook/showcase_notebook.ipynb

Large diffs are not rendered by default.

@@ -20,4 +20,3 @@
 from .missing_value_imputation import MissingValueImputation
 from .out_of_range_value_filter import OutOfRangeValueFilter
 from .flatline_filter import FlatlineFilter
-from .missing_value_imputation import MissingValueImputation
@@ -233,7 +233,8 @@ def filter(self) -> PySparkDataFrame:
         Imputate missing values based on [Spline Interpolation, ]
         """
         if not all(
-            col_ in self.df.columns for col_ in ["TagName", "EventTime", "Value"]
+            col_ in self.df.columns
+            for col_ in ["TagName", "EventTime", "Value", "Status"]
         ):
             raise ValueError("Columns not as expected")
 
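
The hunk above tightens the input contract of MissingValueImputation: a "Status" column is now required alongside "TagName", "EventTime", and "Value". A minimal sketch of that guard against a hand-built Spark frame follows; the session setup and the sample row are assumptions for illustration, not part of this PR:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# One sample row in the four-column shape the component now expects.
df = spark.createDataFrame(
    [("Sensor_A", "2024-01-02 20:03:46", 1.0, "Good")],
    ["TagName", "EventTime", "Value", "Status"],
)

required = ["TagName", "EventTime", "Value", "Status"]
if not all(col_ in df.columns for col_ in required):
    raise ValueError("Columns not as expected")
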
7 changes: 2 additions & 5 deletions src/sdk/python/rtdip_sdk/pipelines/forecasting/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2022 RTDIP
+# Copyright 2025 RTDIP
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .spark.linear_regression import *
-from .spark.arima import *
-from .spark.auto_arima import *
-from .spark.data_binning import *
+from .spark import *
@@ -11,3 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .data_binning import DataBinning
+from .linear_regression import LinearRegression
+from .arima import ArimaPrediction
+from .auto_arima import ArimaAutoPrediction
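
Taken together, the two `__init__.py` hunks move the concrete exports into the `spark` subpackage and let the parent package re-export them with a single star import. Assuming the SDK is importable as `rtdip_sdk` (an assumption about the installed package name, not stated in the diff), both paths below should resolve to the same classes:

# Via the parent package, which now does `from .spark import *`:
from rtdip_sdk.pipelines.forecasting import ArimaPrediction

# Or explicitly via the subpackage that defines the names:
from rtdip_sdk.pipelines.forecasting.spark import ArimaAutoPrediction
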
30 changes: 16 additions & 14 deletions src/sdk/python/rtdip_sdk/pipelines/forecasting/spark/arima.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import statistics
 from enum import Enum
 from typing import List, Tuple
@@ -264,8 +265,7 @@ def _constructor_handle_input_metadata(
         if past_data_style is not None:
             return past_data_style, value_name, timestamp_name, source_name, status_name
         # Automatic calculation part
-        schema = past_data.schema
-        schema_names = schema.names
+        schema_names = past_data.schema.names.copy()
 
         assumed_past_data_style = None
         value_name = None
@@ -279,20 +279,16 @@ def pickout_column(
             rgx = regex.compile(regex_string)
             sus_columns = list(filter(rgx.search, rem_columns))
             found_column = sus_columns[0] if len(sus_columns) == 1 else None
-            if found_column is not None:
-                rem_columns.remove(found_column)
-            return found_column, rem_columns
+            return found_column
 
         # Is there a status column?
-        status_name, remaining_columns = pickout_column(schema_names, r"(?i)status")
+        status_name = pickout_column(schema_names, r"(?i)status")
         # Is there a source name / tag
-        source_name, remaining_columns = pickout_column(schema_names, r"(?i)tag")
+        source_name = pickout_column(schema_names, r"(?i)tag")
         # Is there a timestamp column?
-        timestamp_name, remaining_columns = pickout_column(
-            schema_names, r"(?i)time|index"
-        )
+        timestamp_name = pickout_column(schema_names, r"(?i)time|index")
         # Is there a value column?
-        value_name, remaining_columns = pickout_column(schema_names, r"(?i)value")
+        value_name = pickout_column(schema_names, r"(?i)value")
 
         if source_name is not None:
             assumed_past_data_style = self.InputStyle.SOURCE_BASED
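
The refactor above simplifies `pickout_column` from returning a `(match, remaining)` tuple with in-place removal to returning only the match, treating an ambiguous pattern (more than one hit) the same as no hit. A standalone sketch of that behaviour, using the stdlib `re` in place of the `regex` package the file actually imports:

import re
from typing import List, Optional

def pickout_column(columns: List[str], pattern: str) -> Optional[str]:
    # Return the column name iff exactly one column matches the
    # case-insensitive pattern; ambiguity or no match yields None.
    matches = [c for c in columns if re.search(pattern, c)]
    return matches[0] if len(matches) == 1 else None

cols = ["TagName", "EventTime", "Value", "Status"]
assert pickout_column(cols, r"(?i)status") == "Status"
assert pickout_column(cols, r"(?i)time|index") == "EventTime"
assert pickout_column(["Tag1", "Tag2"], r"(?i)tag") is None  # ambiguous, so None
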
@@ -400,7 +396,7 @@ def filter(self) -> PySparkDataFrame:
             # Workaround needed for PySpark versions <3.4
             pd_df = _prepare_pandas_to_convert_to_spark(pd_df)
             predicted_source_pyspark_dataframe = self.spark_session.createDataFrame(
-                pd_df, schema=self.past_data.schema
+                pd_df, schema=copy.deepcopy(self.past_data.schema)
             )
             return predicted_source_pyspark_dataframe
         elif self.past_data_style == self.InputStyle.SOURCE_BASED:
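
The `copy.deepcopy` on the schema (paired with the new `import copy` at the top of the file) appears to be the "copy bug" fix from commit f0b52f3: a `StructType` is mutable, so handing the original `past_data.schema` to `createDataFrame` risks shared state between the old and new frames. A small sketch of the hazard, with hypothetical field names:

import copy
from pyspark.sql.types import FloatType, StringType, StructField, StructType

schema = StructType([
    StructField("TagName", StringType(), True),
    StructField("Value", FloatType(), True),
])

# Mutating a deep copy leaves the original schema object untouched;
# mutating a plain reference to the same StructType would not.
schema_copy = copy.deepcopy(schema)
schema_copy.fields[0].nullable = False
assert schema.fields[0].nullable is True
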
@@ -428,7 +424,12 @@ def filter(self) -> PySparkDataFrame:
             data_to_add = _prepare_pandas_to_convert_to_spark(data_to_add)
 
             predicted_source_pyspark_dataframe = self.spark_session.createDataFrame(
-                data_to_add, schema=pd_df_schema
+                _prepare_pandas_to_convert_to_spark(
+                    data_to_add[
+                        [self.source_name, self.timestamp_name, self.value_name]
+                    ]
+                ),
+                schema=pd_df_schema,
             )
 
             if self.status_name is not None:
@@ -438,7 +439,8 @@ def filter(self) -> PySparkDataFrame:
                 )
             )
 
-        return self.past_data.union(predicted_source_pyspark_dataframe)
+        to_return = self.past_data.unionByName(predicted_source_pyspark_dataframe)
+        return to_return
 
     def validate(self, schema_dict):
         return super().validate(schema_dict, self.past_data)
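
The switch from `union` to `unionByName` is defensive: `union` aligns columns purely by position, so if the predicted frame comes back with a different column order than `past_data`, values silently land in the wrong columns. A minimal sketch of the difference; the local session and toy frames are assumptions for illustration:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

past = spark.createDataFrame([("Sensor_A", 1.0)], ["TagName", "Value"])
pred = spark.createDataFrame([(2.0, "Sensor_A")], ["Value", "TagName"])

# union() is positional: the float 2.0 would end up under TagName.
# unionByName() matches on column names, so the rows line up correctly.
ok = past.unionByName(pred)
ok.show()
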
@@ -14,7 +14,7 @@
 
 import pyspark.ml.clustering as clustering
 from pyspark.sql import DataFrame
-from src.sdk.python.rtdip_sdk.pipelines.forecasting import (
+from src.sdk.python.rtdip_sdk.pipelines.forecasting.interfaces import (
     MachineLearningInterface,
 )
 from ..._pipeline_utils.models import Libraries, SystemType