diff --git a/explainableai/core.py b/explainableai/core.py
index 76532da..5512599 100644
--- a/explainableai/core.py
+++ b/explainableai/core.py
@@ -32,6 +32,7 @@
 from .model_interpretability import interpret_model
 from .logging_config import logger
+import dask.dataframe as dd
 
 logger=logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -125,6 +126,58 @@ def _preprocess_data(self):
         self.X = self.preprocessor.fit_transform(self.X)
 
         # Update feature names after preprocessing
+        num_feature_names = self.numerical_columns.tolist()
+        cat_feature_names = []
+        if self.categorical_columns.size > 0:
+            cat_feature_names = self.preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(self.categorical_columns).tolist()
+        self.feature_names = num_feature_names + cat_feature_names
+
+        # Encode target variable if it's categorical
+        if self.is_classifier and pd.api.types.is_categorical_dtype(self.y):
+            self.label_encoder = LabelEncoder()
+            self.y = self.label_encoder.fit_transform(self.y)
+    def _preprocess_data_dask(self, X, y):
+        # Convert pandas DataFrames to Dask DataFrames
+        X = dd.from_pandas(X, npartitions=4)  # Adjust npartitions based on your dataset size
+        y = dd.from_pandas(y, npartitions=4)
+
+        # Identify categorical and numerical columns
+        self.categorical_columns = X.select_dtypes(include=['object', 'category']).columns
+        self.numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
+
+        # Create preprocessing steps
+        numeric_transformer = Pipeline(steps=[
+            ('imputer', SimpleImputer(strategy='mean')),
+            ('scaler', StandardScaler())
+        ])
+
+        categorical_transformer = Pipeline(steps=[
+            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
+            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
+        ])
+
+        self.preprocessor = ColumnTransformer(
+            transformers=[
+                ('num', numeric_transformer, self.numerical_columns),
+                ('cat', categorical_transformer, self.categorical_columns)
+            ]
+        )
+
+        # Fit and transform the data in parallel
+        self.X = self.preprocessor.fit_transform(X).compute()
+
+        # Update feature names after preprocessing
+        num_feature_names = self.numerical_columns.tolist()
+        cat_feature_names = []
+        if self.categorical_columns.size > 0:
+            cat_feature_names = self.preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(self.categorical_columns).tolist()
+        self.feature_names = num_feature_names + cat_feature_names
+
+        # Encode target variable if it's categorical
+        if self.is_classifier and pd.api.types.is_categorical_dtype(y):
+            self.label_encoder = LabelEncoder()
+            self.y = self.label_encoder.fit_transform(y.compute())
+
         logger.debug("Updating feature names...")
         try:
             num_feature_names = self.numerical_columns.tolist()
@@ -138,7 +191,8 @@ def _preprocess_data(self):
             self.label_encoder = LabelEncoder()
             self.y = self.label_encoder.fit_transform(self.y)
         except Exception as e:
-            logger.error(f"Some error occur while updating...{str(e)}")
+            logger.error(f"Some error occurred while updating... {str(e)}")
+
 
 
     def analyze(self, batch_size=None, parallel=False, instance_index=0):
diff --git a/requirements.txt b/requirements.txt
index bb24654..9fed575 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,4 @@ scipy
 pillow
 xgboost
 colorama
+dask
\ No newline at end of file