Commit

Merge pull request #79 from DarshAgrawal14/main
Updated module for handling large datasets
ombhojane authored Oct 12, 2024
2 parents 27092e8 + 3e9808f commit eaaefb0
Showing 2 changed files with 56 additions and 1 deletion.
56 changes: 55 additions & 1 deletion explainableai/core.py
@@ -32,6 +32,7 @@
from .model_interpretability import interpret_model
from .logging_config import logger

import dask.dataframe as dd

logger=logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
@@ -125,6 +126,58 @@ def _preprocess_data(self):
        self.X = self.preprocessor.fit_transform(self.X)

        # Update feature names after preprocessing
        num_feature_names = self.numerical_columns.tolist()
        cat_feature_names = []
        if self.categorical_columns.size > 0:
            cat_feature_names = self.preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(self.categorical_columns).tolist()
        self.feature_names = num_feature_names + cat_feature_names

        # Encode target variable if it's categorical
        if self.is_classifier and pd.api.types.is_categorical_dtype(self.y):
            self.label_encoder = LabelEncoder()
            self.y = self.label_encoder.fit_transform(self.y)
    def _preprocess_data_dask(self, X, y):
        # Convert pandas DataFrames to Dask DataFrames
        X = dd.from_pandas(X, npartitions=4)  # Adjust npartitions based on your dataset size
        y = dd.from_pandas(y, npartitions=4)

        # Identify categorical and numerical columns
        self.categorical_columns = X.select_dtypes(include=['object', 'category']).columns
        self.numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

        # Create preprocessing steps
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numerical_columns),
                ('cat', categorical_transformer, self.categorical_columns)
            ]
        )

        # Fit and transform the data in parallel
        self.X = self.preprocessor.fit_transform(X).compute()

        # Update feature names after preprocessing
        num_feature_names = self.numerical_columns.tolist()
        cat_feature_names = []
        if self.categorical_columns.size > 0:
            cat_feature_names = self.preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(self.categorical_columns).tolist()
        self.feature_names = num_feature_names + cat_feature_names

        # Encode target variable if it's categorical
        if self.is_classifier and pd.api.types.is_categorical_dtype(y):
            self.label_encoder = LabelEncoder()
            self.y = self.label_encoder.fit_transform(y.compute())

        logger.debug("Updating feature names...")
        try:
            num_feature_names = self.numerical_columns.tolist()
@@ -138,7 +191,8 @@ def _preprocess_data(self):
                self.label_encoder = LabelEncoder()
                self.y = self.label_encoder.fit_transform(self.y)
        except Exception as e:
            logger.error(f"Some error occur while updating...{str(e)}")
            logger.error(f"Some error occurred while updating... {str(e)}")



def analyze(self, batch_size=None, parallel=False, instance_index=0):
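
For context, here is a minimal standalone sketch of the partition-and-compute pattern the new _preprocess_data_dask method relies on; it is not the package's own API. The preprocess_large_dataset helper, the partition_size_mb heuristic for sizing npartitions (the committed code hardcodes npartitions=4), and the plain-dask imputation, scaling, and one-hot steps are all illustrative assumptions standing in for the scikit-learn ColumnTransformer used above.

import dask.dataframe as dd


def preprocess_large_dataset(df, target_column, partition_size_mb=100):
    # Assumed heuristic: aim for roughly partition_size_mb of in-memory data per partition.
    n_bytes = df.memory_usage(deep=True).sum()
    npartitions = max(1, int(n_bytes // (partition_size_mb * 2**20)) + 1)

    X = df.drop(columns=[target_column])
    num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    ddf = dd.from_pandas(X, npartitions=npartitions)

    # Mean-impute and standardize numeric columns; the statistics are built lazily
    # across partitions and materialized once with .compute().
    means = ddf[num_cols].mean().compute()
    stds = ddf[num_cols].std().compute().replace(0, 1.0)  # avoid division by zero
    for col in num_cols:
        ddf[col] = (ddf[col].fillna(means[col]) - means[col]) / stds[col]

    # Fill missing categories, then one-hot encode. dask needs known categories,
    # hence categorize() before get_dummies().
    for col in cat_cols:
        ddf[col] = ddf[col].fillna('missing')
    if cat_cols:
        ddf = ddf.categorize(columns=cat_cols)
        ddf = dd.get_dummies(ddf, columns=cat_cols)

    # Materialize the preprocessed features for the downstream in-memory model.
    return ddf.compute()

Ending with .compute() mirrors the committed method, which also hands a fully materialized result to the rest of the pipeline after the partitioned preprocessing steps.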
1 change: 1 addition & 0 deletions requirements.txt
@@ -14,3 +14,4 @@ scipy
pillow
xgboost
colorama
dask
