diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.md new file mode 100644 index 000000000..f3ef84937 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.dimensionality_reduction diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/interval_filtering.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/interval_filtering.md new file mode 100644 index 000000000..fe5f3e968 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/interval_filtering.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.interval_filtering diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.md new file mode 100644 index 000000000..70e69b3ea --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.k_sigma_anomaly_detection diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.md new file mode 100644 index 000000000..c2d5a19cb --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.missing_value_imputation diff --git 
a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/denormalization.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/denormalization.md new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/denormalization.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.denormalization diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization.md new file mode 100644 index 000000000..2483f8dc8 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.md new file mode 100644 index 000000000..84cb4c997 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_mean diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.md b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.md new file mode 100644 index 000000000..b0ca874ad --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_minmax diff --git a/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.md 
b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.md new file mode 100644 index 000000000..509474b78 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_zscore diff --git a/docs/sdk/code-reference/pipelines/machine_learning/spark/data_binning.md b/docs/sdk/code-reference/pipelines/machine_learning/spark/data_binning.md new file mode 100644 index 000000000..808285a10 --- /dev/null +++ b/docs/sdk/code-reference/pipelines/machine_learning/spark/data_binning.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.machine_learning.spark.data_binning diff --git a/docs/sdk/code-reference/pipelines/machine_learning/spark/linear_regression.md b/docs/sdk/code-reference/pipelines/machine_learning/spark/linear_regression.md new file mode 100644 index 000000000..1cade23ed --- /dev/null +++ b/docs/sdk/code-reference/pipelines/machine_learning/spark/linear_regression.md @@ -0,0 +1 @@ +::: src.sdk.python.rtdip_sdk.pipelines.machine_learning.spark.linear_regression diff --git a/src/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/linear_regression.py b/src/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/linear_regression.py index b95929a5e..4f615cdc7 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/linear_regression.py +++ b/src/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/linear_regression.py @@ -16,6 +16,7 @@ from pyspark.ml.evaluation import RegressionEvaluator from ..interfaces import MachineLearningInterface from ..._pipeline_utils.models import Libraries, SystemType +from typing import Optional class LinearRegression(MachineLearningInterface): @@ -61,7 +62,7 @@ def libraries(): def settings() -> dict: return {} - def split_data(self, train_ratio: float = 0.8): + def split_data(self, 
train_ratio: float = 0.8) -> tuple[DataFrame, DataFrame]: """ Splits the dataset into training and testing sets. @@ -69,7 +70,7 @@ def split_data(self, train_ratio: float = 0.8): train_ratio (float): The ratio of the data to be used for training. Default is 0.8 (80% for training). Returns: - DataFrame: Returns the training and testing datasets. + tuple[DataFrame, DataFrame]: Returns the training and testing datasets. """ train_df, test_df = self.df.randomSplit([train_ratio, 1 - train_ratio], seed=42) return train_df, test_df @@ -96,7 +97,7 @@ def predict(self, prediction_df: DataFrame): prediction_df, ) - def evaluate(self, test_df: DataFrame): + def evaluate(self, test_df: DataFrame) -> Optional[float]: """ Evaluates the trained model using RMSE. @@ -104,10 +105,9 @@ def evaluate(self, test_df: DataFrame): test_df (DataFrame): The testing dataset to evaluate the model. Returns: - float: The Root Mean Squared Error (RMSE) of the model. + Optional[float]: The Root Mean Squared Error (RMSE) of the model or None if the prediction column doesn't exist. """ # Check the columns of the test DataFrame - print(f"Columns in test_df: {test_df.columns}") test_df.show(5) if self.prediction_col not in test_df.columns: