From c6066c37c31b45081972516cb38afcb845f4f964 Mon Sep 17 00:00:00 2001
From: DarshAgrawal14 <agrawaldarsh.14@gmail.com>
Date: Mon, 7 Oct 2024 19:38:37 +0530
Subject: [PATCH 1/2] Updated module for handling large datasets

Tried chunking the dataset, but that approach was getting complex.
Therefore used a Dask DataFrame for better memory efficiency when loading and handling large datasets.
---
 explainableai/core.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/explainableai/core.py b/explainableai/core.py
index 5a903ea..7f1be07 100644
--- a/explainableai/core.py
+++ b/explainableai/core.py
@@ -24,6 +24,7 @@
 from .model_selection import compare_models
 from reportlab.platypus import PageBreak
 
+import dask.dataframe as dd
 
 class XAIWrapper:
     def __init__(self):
@@ -112,6 +113,47 @@ def _preprocess_data(self):
         if self.is_classifier and pd.api.types.is_categorical_dtype(self.y):
             self.label_encoder = LabelEncoder()
             self.y = self.label_encoder.fit_transform(self.y)
+    def _preprocess_data_dask(self, X, y):
+        # Dask variant of preprocessing: wrap X and y with dd.from_pandas, fit self.preprocessor, and derive self.feature_names
+        X = dd.from_pandas(X, npartitions=4)  # Adjust npartitions based on your dataset size
+        y = dd.from_pandas(y, npartitions=4)
+
+        # Split columns by dtype: object/category vs. int64/float64 (columns of any other dtype end up in neither list)
+        self.categorical_columns = X.select_dtypes(include=['object', 'category']).columns
+        self.numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
+
+        # Build per-dtype pipelines: numeric = mean-impute then scale; categorical = constant 'missing' fill then one-hot
+        numeric_transformer = Pipeline(steps=[
+            ('imputer', SimpleImputer(strategy='mean')),
+            ('scaler', StandardScaler())
+        ])
+
+        categorical_transformer = Pipeline(steps=[
+            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
+            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
+        ])
+
+        self.preprocessor = ColumnTransformer(
+            transformers=[
+                ('num', numeric_transformer, self.numerical_columns),
+                ('cat', categorical_transformer, self.categorical_columns)
+            ]
+        )
+
+        # NOTE(review): .compute() is called on the fit_transform result -- confirm scikit-learn returns a lazy Dask object here and not an ndarray
+        self.X = self.preprocessor.fit_transform(X).compute()
+
+        # Feature names: numeric column names first, then the one-hot encoder's output names (if any categorical columns exist)
+        num_feature_names = self.numerical_columns.tolist()
+        cat_feature_names = []
+        if self.categorical_columns.size > 0:
+            cat_feature_names = self.preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(self.categorical_columns).tolist()
+        self.feature_names = num_feature_names + cat_feature_names
+
+        # Label-encode y only for classifiers with a categorical target; NOTE(review): self.y is not assigned on any other path in this method
+        if self.is_classifier and pd.api.types.is_categorical_dtype(y):
+            self.label_encoder = LabelEncoder()
+            self.y = self.label_encoder.fit_transform(y.compute())
 
     def analyze(self):
         results = {}

From 3e9808f47368a58ba4b183f0b8e63da1f7dc1956 Mon Sep 17 00:00:00 2001
From: DarshAgrawal14 <agrawaldarsh.14@gmail.com>
Date: Thu, 10 Oct 2024 12:45:17 +0530
Subject: [PATCH 2/2] Update requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index bb24654..9fed575 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,4 @@ scipy
 pillow
 xgboost
 colorama
+dask
\ No newline at end of file