diff --git a/createDB.py b/createDB.py
index 5bcdc94..96e7dbb 100644
--- a/createDB.py
+++ b/createDB.py
@@ -1,14 +1,21 @@
import numpy as np
import pandas as pd
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
# Set seed for reproducibility
+logger.debug("Setting the speed...")
np.random.seed(42)
# Generate 100 rows of data
+logger.debug("Setting up the data...")
data_size = 100
# Generate 4 features with random values
# Feature 1: Age (18 to 90)
+logger.debug("Generating the features...")
age = np.random.randint(18, 90, data_size)
# Feature 2: Tumor size (1 to 10 cm)
@@ -22,11 +29,14 @@
# Generate target column 'Cancer' based on some logic
# For simplicity, assume higher risk if age > 50, tumor_size > 5, gene_mutation = 1, and smoking_history > 20
+logger.debug("Generating the column...")
cancer_risk = (age > 50) & (tumor_size > 5) & (gene_mutation == 1) & (smoking_history > 20)
cancer = np.where(cancer_risk, 1, 0)
# Create a DataFrame
+logger.debug("Creating the DataFrame...")
df = pd.DataFrame({
+
'Age': age,
'Tumor_Size': tumor_size,
'Gene_Mutation': gene_mutation,
@@ -35,6 +45,7 @@
})
# Save the DataFrame to a CSV file
+logger.info("Saving the csv file...")
df.to_csv('cancer.csv', index=False)
-print("CSV file 'cancer.csv' has been created.")
+logger.info("CSV file 'cancer.csv' has been created.")
diff --git a/explainableai/__init__.py b/explainableai/__init__.py
index d433203..255269c 100644
--- a/explainableai/__init__.py
+++ b/explainableai/__init__.py
@@ -1,4 +1,6 @@
# explainableai/__init__.py
from .core import XAIWrapper
+import logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
__all__ = ['XAIWrapper']
\ No newline at end of file
diff --git a/explainableai/anomaly_detection.py b/explainableai/anomaly_detection.py
index c89deff..eee3b76 100644
--- a/explainableai/anomaly_detection.py
+++ b/explainableai/anomaly_detection.py
@@ -1,6 +1,18 @@
from sklearn.ensemble import IsolationForest
+import logging
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def detect_anomalies(X):
- iso_forest = IsolationForest(contamination=0.1, random_state=42)
- anomalies = iso_forest.fit_predict(X)
- return anomalies
\ No newline at end of file
+ try:
+        # Create an Isolation Forest model
+ logger.debug("Creating isolation forest model...")
+ iso_forest = IsolationForest(contamination=0.1, random_state=42)
+
+        # Predict anomalies
+ logger.debug("Making Prediction...")
+ anomalies = iso_forest.fit_predict(X)
+        logger.info("Prediction made...")
+ return anomalies
+ except Exception as e:
+        logger.error(f"Error during anomaly prediction: {str(e)}")
diff --git a/explainableai/core.py b/explainableai/core.py
index 996add6..455b1bd 100644
--- a/explainableai/core.py
+++ b/explainableai/core.py
@@ -25,8 +25,10 @@
from .report_generator import ReportGenerator
from .model_selection import compare_models
from reportlab.platypus import PageBreak
+import logging
-
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
class XAIWrapper:
def __init__(self):
self.model = None
@@ -43,40 +45,51 @@ def __init__(self):
self.results = None # Add this line to store analysis results
def fit(self, models, X, y, feature_names=None):
- if isinstance(models, dict):
- self.models = models
- else:
- self.models = {'Model': models}
- self.X = X
- self.y = y
- self.feature_names = feature_names if feature_names is not None else X.columns.tolist()
- self.is_classifier = all(hasattr(model, "predict_proba") for model in self.models.values())
-
- print(f"{Fore.BLUE}Preprocessing data...{Style.RESET_ALL}")
- self._preprocess_data()
-
- print(f"{Fore.BLUE}Fitting models and analyzing...{Style.RESET_ALL}")
- self.model_comparison_results = self._compare_models()
-
- # Select the best model based on cv_score
- best_model_name = max(self.model_comparison_results, key=lambda x: self.model_comparison_results[x]['cv_score'])
- self.model = self.models[best_model_name]
- self.model.fit(self.X, self.y)
-
- return self
+        logger.debug("Fitting the models...")
+ try:
+ if isinstance(models, dict):
+ self.models = models
+ else:
+ self.models = {'Model': models}
+ self.X = X
+ self.y = y
+ self.feature_names = feature_names if feature_names is not None else X.columns.tolist()
+ self.is_classifier = all(hasattr(model, "predict_proba") for model in self.models.values())
+
+ logger.info(f"{Fore.BLUE}Preprocessing data...{Style.RESET_ALL}")
+ self._preprocess_data()
+
+ logger.info(f"{Fore.BLUE}Fitting models and analyzing...{Style.RESET_ALL}")
+ self.model_comparison_results = self._compare_models()
+
+ # Select the best model based on cv_score
+ best_model_name = max(self.model_comparison_results, key=lambda x: self.model_comparison_results[x]['cv_score'])
+ self.model = self.models[best_model_name]
+ self.model.fit(self.X, self.y)
+
+ logger.info("Model fitting is complete...")
+ return self
+ except Exception as e:
+            logger.error(f"Error occurred while fitting the models: {str(e)}")
+
def _compare_models(self):
- from sklearn.model_selection import cross_val_score
- results = {}
- for name, model in self.models.items():
- cv_scores = cross_val_score(model, self.X, self.y, cv=5, scoring='roc_auc' if self.is_classifier else 'r2')
- model.fit(self.X, self.y)
- test_score = model.score(self.X, self.y)
- results[name] = {
- 'cv_score': cv_scores.mean(),
- 'test_score': test_score
- }
- return results
+ logger.debug("Comparing the models...")
+ try:
+ from sklearn.model_selection import cross_val_score
+ results = {}
+ for name, model in self.models.items():
+ cv_scores = cross_val_score(model, self.X, self.y, cv=5, scoring='roc_auc' if self.is_classifier else 'r2')
+ model.fit(self.X, self.y)
+ test_score = model.score(self.X, self.y)
+ results[name] = {
+ 'cv_score': cv_scores.mean(),
+ 'test_score': test_score
+ }
+            logger.info("Models compared successfully...")
+ return results
+ except Exception as e:
+            logger.error(f"Error occurred while comparing models: {str(e)}")
def _preprocess_data(self):
# Identify categorical and numerical columns
@@ -84,6 +97,7 @@ def _preprocess_data(self):
self.numerical_columns = self.X.select_dtypes(include=['int64', 'float64']).columns
# Create preprocessing steps
+ logger.debug("Creating Preprocessing Steps...")
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
@@ -99,48 +113,55 @@ def _preprocess_data(self):
('num', numeric_transformer, self.numerical_columns),
('cat', categorical_transformer, self.categorical_columns)
])
+        logger.info("Preprocessing pipeline created...")
# Fit and transform the data
+ logger.debug("Fitting and transforming the data...")
self.X = self.preprocessor.fit_transform(self.X)
# Update feature names after preprocessing
- num_feature_names = self.numerical_columns.tolist()
- cat_feature_names = []
- if self.categorical_columns.size > 0:
- cat_feature_names = self.preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(self.categorical_columns).tolist()
- self.feature_names = num_feature_names + cat_feature_names
-
- # Encode target variable if it's categorical
- if self.is_classifier and pd.api.types.is_categorical_dtype(self.y):
- self.label_encoder = LabelEncoder()
- self.y = self.label_encoder.fit_transform(self.y)
+ logger.debug("Updating feature names...")
+ try:
+ num_feature_names = self.numerical_columns.tolist()
+ cat_feature_names = []
+ if self.categorical_columns.size > 0:
+ cat_feature_names = self.preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(self.categorical_columns).tolist()
+ self.feature_names = num_feature_names + cat_feature_names
+
+ # Encode target variable if it's categorical
+ if self.is_classifier and pd.api.types.is_categorical_dtype(self.y):
+ self.label_encoder = LabelEncoder()
+ self.y = self.label_encoder.fit_transform(self.y)
+ except Exception as e:
+            logger.error(f"Error occurred while updating feature names: {str(e)}")
def analyze(self):
+ logger.debug("Analysing...")
results = {}
- print("Evaluating model performance...")
+ logger.info("Evaluating model performance...")
results['model_performance'] = evaluate_model(self.model, self.X, self.y, self.is_classifier)
- print("Calculating feature importance...")
+ logger.info("Calculating feature importance...")
self.feature_importance = self._calculate_feature_importance()
results['feature_importance'] = self.feature_importance
- print("Generating visualizations...")
+ logger.info("Generating visualizations...")
self._generate_visualizations(self.feature_importance)
- print("Calculating SHAP values...")
+ logger.info("Calculating SHAP values...")
results['shap_values'] = calculate_shap_values(self.model, self.X, self.feature_names)
- print("Performing cross-validation...")
+ logger.info("Performing cross-validation...")
mean_score, std_score = cross_validate(self.model, self.X, self.y)
results['cv_scores'] = (mean_score, std_score)
- print("Model comparison results:")
+ logger.info("Model comparison results:")
results['model_comparison'] = self.model_comparison_results
self._print_results(results)
- print("Generating LLM explanation...")
+ logger.info("Generating LLM explanation...")
results['llm_explanation'] = get_llm_explanation(self.gemini_model, results)
self.results = results
@@ -162,6 +183,7 @@ def model_comparison():
report.add_table(model_comparison_data)
+
# Model Performance
def model_performance():
report.add_heading("Model Performance", level=2)
@@ -177,6 +199,7 @@ def feature_importance():
feature_importance_data = [["Feature", "Importance"]] + [[feature, f"{importance:.4f}"] for feature, importance in self.feature_importance.items()]
report.add_table(feature_importance_data)
+
# Visualizations
def visualization():
report.add_heading("Visualizations", level=2)
@@ -265,126 +288,151 @@ def llm_explanation():
else:
print("Invalid input. Please enter 'y' or 'n' ")
-
-
-
-
+
def predict(self, X):
- if self.model is None:
- raise ValueError("Model has not been fitted. Please run fit() first.")
-
- X = self._preprocess_input(X)
-
- if self.is_classifier:
- prediction = self.model.predict(X)
- probabilities = self.model.predict_proba(X)
- if self.label_encoder:
- prediction = self.label_encoder.inverse_transform(prediction)
- return prediction, probabilities
- else:
- prediction = self.model.predict(X)
- return prediction
+        logger.debug("Making prediction...")
+ try:
+ if self.model is None:
+ raise ValueError("Model has not been fitted. Please run fit() first.")
+
+ X = self._preprocess_input(X)
+
+ if self.is_classifier:
+ prediction = self.model.predict(X)
+ probabilities = self.model.predict_proba(X)
+ if self.label_encoder:
+ prediction = self.label_encoder.inverse_transform(prediction)
+ logger.info("Prediction Completed...")
+ return prediction, probabilities
+ else:
+ prediction = self.model.predict(X)
+ logger.info("Prediction Completed...")
+ return prediction
+ except Exception as e:
+ logger.error(f"Error in prediction...{str(e)}")
def _preprocess_input(self, X):
# Ensure X is a DataFrame
- if not isinstance(X, pd.DataFrame):
- X = pd.DataFrame(X, columns=self.feature_names)
+        logger.debug("Preprocessing input...")
+ try:
+ if not isinstance(X, pd.DataFrame):
+ X = pd.DataFrame(X, columns=self.feature_names)
- # Apply the same preprocessing as during training
- X = self.preprocessor.transform(X)
+ # Apply the same preprocessing as during training
+ X = self.preprocessor.transform(X)
+            logger.info("Input preprocessing complete...")
- return X
+ return X
+ except Exception as e:
+            logger.error(f"Error occurred while preprocessing the input: {str(e)}")
def explain_prediction(self, input_data):
+ logger.debug("Explaining the prediction...")
input_df = pd.DataFrame([input_data])
prediction, probabilities = self.predict(input_df)
explanation = get_prediction_explanation(self.gemini_model, input_data, prediction[0], probabilities[0], self.feature_importance)
+ logger.info("Prediction explained...")
return prediction[0], probabilities[0], explanation
def _calculate_feature_importance(self):
+        logger.debug("Calculating feature importance...")
perm_importance = permutation_importance(self.model, self.X, self.y, n_repeats=10, random_state=42)
feature_importance = {feature: importance for feature, importance in zip(self.feature_names, perm_importance.importances_mean)}
+        logger.info("Feature importance calculated...")
return dict(sorted(feature_importance.items(), key=lambda item: abs(item[1]), reverse=True))
def _generate_visualizations(self, feature_importance):
- plot_feature_importance(feature_importance)
- plot_partial_dependence(self.model, self.X, feature_importance, self.feature_names)
- plot_learning_curve(self.model, self.X, self.y)
- plot_correlation_heatmap(pd.DataFrame(self.X, columns=self.feature_names))
- if self.is_classifier:
- plot_roc_curve(self.model, self.X, self.y)
- plot_precision_recall_curve(self.model, self.X, self.y)
+        logger.debug("Generating visualizations...")
+ try:
+ plot_feature_importance(feature_importance)
+ plot_partial_dependence(self.model, self.X, feature_importance, self.feature_names)
+ plot_learning_curve(self.model, self.X, self.y)
+ plot_correlation_heatmap(pd.DataFrame(self.X, columns=self.feature_names))
+ if self.is_classifier:
+ plot_roc_curve(self.model, self.X, self.y)
+ plot_precision_recall_curve(self.model, self.X, self.y)
+ logger.info("Visualizations generated.")
+ except Exception as e:
+            logger.error(f"Error generating visualizations: {str(e)}")
def _print_results(self, results):
- print("\nModel Performance:")
- for metric, value in results['model_performance'].items():
- if isinstance(value, (int, float, np.float64)):
- print(f"{metric}: {value:.4f}")
- else:
- print(f"{metric}:\n{value}")
+ logger.debug("Printing results...")
+ try:
+ logger.info("\nModel Performance:")
+ for metric, value in results['model_performance'].items():
+ if isinstance(value, (int, float, np.float64)):
+ logger.info(f"{metric}: {value:.4f}")
+ else:
+ logger.info(f"{metric}:\n{value}")
- print("\nTop 5 Important Features:")
- for feature, importance in list(results['feature_importance'].items())[:5]:
- print(f"{feature}: {importance:.4f}")
+ logger.info("\nTop 5 Important Features:")
+ for feature, importance in list(results['feature_importance'].items())[:5]:
+ logger.info(f"{feature}: {importance:.4f}")
- print(f"\nCross-validation Score: {results['cv_scores'][0]:.4f} (+/- {results['cv_scores'][1]:.4f})")
+ logger.info(f"\nCross-validation Score: {results['cv_scores'][0]:.4f} (+/- {results['cv_scores'][1]:.4f})")
- print("\nVisualizations saved:")
- print("- Feature Importance: feature_importance.png")
- print("- Partial Dependence: partial_dependence.png")
- print("- Learning Curve: learning_curve.png")
- print("- Correlation Heatmap: correlation_heatmap.png")
- if self.is_classifier:
- print("- ROC Curve: roc_curve.png")
- print("- Precision-Recall Curve: precision_recall_curve.png")
+ logger.info("\nVisualizations saved:")
+ logger.info("- Feature Importance: feature_importance.png")
+ logger.info("- Partial Dependence: partial_dependence.png")
+ logger.info("- Learning Curve: learning_curve.png")
+ logger.info("- Correlation Heatmap: correlation_heatmap.png")
+ if self.is_classifier:
+ logger.info("- ROC Curve: roc_curve.png")
+ logger.info("- Precision-Recall Curve: precision_recall_curve.png")
- if results['shap_values'] is not None:
- print("\nSHAP values calculated successfully. See 'shap_summary.png' for visualization.")
- else:
- print("\nSHAP values calculation failed. Please check the console output for more details.")
+ if results['shap_values'] is not None:
+ logger.info("\nSHAP values calculated successfully. See 'shap_summary.png' for visualization.")
+ else:
+ logger.info("\nSHAP values calculation failed. Please check the console output for more details.")
+ except Exception as e:
+ logger.error(f"Error occur in printing results...{str(e)}")
@staticmethod
def perform_eda(df):
- print(f"{Fore.CYAN}Exploratory Data Analysis:{Style.RESET_ALL}")
- print(f"{Fore.GREEN}Dataset shape: {df.shape}{Style.RESET_ALL}")
- print(f"{Fore.CYAN}Dataset info:{Style.RESET_ALL}")
- df.info()
- print(f"{Fore.CYAN}Summary statistics:{Style.RESET_ALL}")
- print(df.describe())
- print(f"{Fore.CYAN}Missing values:{Style.RESET_ALL}")
- print(df.isnull().sum())
- print(f"{Fore.CYAN}Data types:{Style.RESET_ALL}")
- print(df.dtypes)
- print(f"{Fore.CYAN}Unique values in each column:{Style.RESET_ALL}")
- for col in df.columns:
- print(f"{Fore.GREEN}{col}: {df[col].nunique()}{Style.RESET_ALL}")
-
- # Additional EDA steps
- print(f"{Fore.CYAN}Correlation matrix:{Style.RESET_ALL}")
- corr_matrix = df.select_dtypes(include=[np.number]).corr()
- print(corr_matrix)
-
- # Identify highly correlated features
- high_corr = np.where(np.abs(corr_matrix) > 0.8)
- high_corr_list = [(corr_matrix.index[x], corr_matrix.columns[y]) for x, y in zip(*high_corr) if x != y and x < y]
- if high_corr_list:
- print(f"{Fore.YELLOW}Highly correlated features:{Style.RESET_ALL}")
- for feat1, feat2 in high_corr_list:
- print(f"{Fore.GREEN}{feat1} - {feat2}: {corr_matrix.loc[feat1, feat2]:.2f}{Style.RESET_ALL}")
-
- # Identify potential outliers
- print(f"{Fore.CYAN}Potential outliers (values beyond 3 standard deviations):{Style.RESET_ALL}")
- numeric_cols = df.select_dtypes(include=[np.number]).columns
- for col in numeric_cols:
- mean = df[col].mean()
- std = df[col].std()
- outliers = df[(df[col] < mean - 3 * std) | (df[col] > mean + 3 * std)]
- if not outliers.empty:
- print(f"{Fore.GREEN}{col}: {len(outliers)} potential outliers{Style.RESET_ALL}")
-
- # Class distribution for the target variable (assuming last column is target)
- target_col = df.columns[-1]
- print(f"{Fore.CYAN}Class distribution for target variable '{target_col}':{Style.RESET_ALL}")
- print(df[target_col].value_counts(normalize=True))
+ logger.debug("Performing exploratory data analysis...")
+ try:
+ logger.info(f"{Fore.CYAN}Exploratory Data Analysis:{Style.RESET_ALL}")
+ logger.info(f"{Fore.GREEN}Dataset shape: {df.shape}{Style.RESET_ALL}")
+ logger.info(f"{Fore.CYAN}Dataset info:{Style.RESET_ALL}")
+ df.info()
+ logger.info(f"{Fore.CYAN}Summary statistics:{Style.RESET_ALL}")
+ logger.info(df.describe())
+ logger.info(f"{Fore.CYAN}Missing values:{Style.RESET_ALL}")
+ logger.info(df.isnull().sum())
+ logger.info(f"{Fore.CYAN}Data types:{Style.RESET_ALL}")
+ logger.info(df.dtypes)
+ logger.info(f"{Fore.CYAN}Unique values in each column:{Style.RESET_ALL}")
+ for col in df.columns:
+ logger.info(f"{Fore.GREEN}{col}: {df[col].nunique()}{Style.RESET_ALL}")
+
+ # Additional EDA steps
+ logger.info(f"{Fore.CYAN}Correlation matrix:{Style.RESET_ALL}")
+ corr_matrix = df.select_dtypes(include=[np.number]).corr()
+ logger.info(corr_matrix)
+
+ # Identify highly correlated features
+ high_corr = np.where(np.abs(corr_matrix) > 0.8)
+ high_corr_list = [(corr_matrix.index[x], corr_matrix.columns[y]) for x, y in zip(*high_corr) if x != y and x < y]
+ if high_corr_list:
+ logger.info(f"{Fore.YELLOW}Highly correlated features:{Style.RESET_ALL}")
+ for feat1, feat2 in high_corr_list:
+ logger.info(f"{Fore.GREEN}{feat1} - {feat2}: {corr_matrix.loc[feat1, feat2]:.2f}{Style.RESET_ALL}")
+
+ # Identify potential outliers
+ logger.info(f"{Fore.CYAN}Potential outliers (values beyond 3 standard deviations):{Style.RESET_ALL}")
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
+ for col in numeric_cols:
+ mean = df[col].mean()
+ std = df[col].std()
+ outliers = df[(df[col] < mean - 3 * std) | (df[col] > mean + 3 * std)]
+ if not outliers.empty:
+ logger.info(f"{Fore.GREEN}{col}: {len(outliers)} potential outliers{Style.RESET_ALL}")
+
+ # Class distribution for the target variable (assuming last column is target)
+ target_col = df.columns[-1]
+ logger.info(f"{Fore.CYAN}Class distribution for target variable '{target_col}':{Style.RESET_ALL}")
+ logger.info(df[target_col].value_counts(normalize=True))
+ except Exception as e:
+ logger.error(f"Error occurred during exploratory data analysis...{str(e)}")
diff --git a/explainableai/fairness.py b/explainableai/fairness.py
index c55c6f7..93b9eee 100644
--- a/explainableai/fairness.py
+++ b/explainableai/fairness.py
@@ -2,48 +2,68 @@
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
def demographic_parity(y_true, y_pred, protected_attribute):
- groups = np.unique(protected_attribute)
- group_probs = {}
- for group in groups:
- group_mask = protected_attribute == group
- group_probs[group] = np.mean(y_pred[group_mask])
+    logger.debug("Calculating demographic parity...")
+ try:
+ groups = np.unique(protected_attribute)
+ group_probs = {}
+ for group in groups:
+ group_mask = protected_attribute == group
+ group_probs[group] = np.mean(y_pred[group_mask])
- max_diff = max(group_probs.values()) - min(group_probs.values())
- return max_diff, group_probs
+ max_diff = max(group_probs.values()) - min(group_probs.values())
+ return max_diff, group_probs
+ except Exception as e:
+        logger.error(f"Error occurred while calculating demographic parity: {str(e)}")
def equal_opportunity(y_true, y_pred, protected_attribute):
- groups = np.unique(protected_attribute)
- group_tpr = {}
- for group in groups:
- group_mask = protected_attribute == group
- tn, fp, fn, tp = confusion_matrix(y_true[group_mask], y_pred[group_mask]).ravel()
- group_tpr[group] = tp / (tp + fn) if (tp + fn) > 0 else 0
+ logger.debug("Checking for equal opportunity...")
+ try:
+ groups = np.unique(protected_attribute)
+ group_tpr = {}
+ for group in groups:
+ group_mask = protected_attribute == group
+ tn, fp, fn, tp = confusion_matrix(y_true[group_mask], y_pred[group_mask]).ravel()
+ group_tpr[group] = tp / (tp + fn) if (tp + fn) > 0 else 0
- max_diff = max(group_tpr.values()) - min(group_tpr.values())
- return max_diff, group_tpr
+ max_diff = max(group_tpr.values()) - min(group_tpr.values())
+ return max_diff, group_tpr
+ except Exception as e:
+        logger.error(f"Error occurred while calculating equal opportunity: {str(e)}")
def disparate_impact(y_true, y_pred, protected_attribute):
- groups = np.unique(protected_attribute)
- group_probs = {}
- for group in groups:
- group_mask = protected_attribute == group
- group_probs[group] = np.mean(y_pred[group_mask])
+ logger.debug("Calculating disparate impact...")
+ try:
+ groups = np.unique(protected_attribute)
+ group_probs = {}
+ for group in groups:
+ group_mask = protected_attribute == group
+ group_probs[group] = np.mean(y_pred[group_mask])
- di = min(group_probs.values()) / max(group_probs.values())
- return di, group_probs
-
+ di = min(group_probs.values()) / max(group_probs.values())
+ return di, group_probs
+ except Exception as e:
+        logger.error(f"Error occurred while calculating disparate impact: {str(e)}")
def plot_fairness_metrics(fairness_metrics):
- plt.figure(figsize=(12, 6))
- for metric, values in fairness_metrics.items():
- plt.bar(range(len(values)), list(values.values()), label=metric)
- plt.xticks(range(len(values)), list(values.keys()))
+ logger.debug("Plotting fairness metrics...")
+ try:
+ plt.figure(figsize=(12, 6))
+ for metric, values in fairness_metrics.items():
+ plt.bar(range(len(values)), list(values.values()), label=metric)
+ plt.xticks(range(len(values)), list(values.keys()))
- plt.xlabel('Protected Group')
- plt.ylabel('Metric Value')
- plt.title('Fairness Metrics Across Protected Groups')
- plt.legend()
- plt.tight_layout()
- plt.savefig('fairness_metrics.png')
- plt.close()
\ No newline at end of file
+ plt.xlabel('Protected Group')
+ plt.ylabel('Metric Value')
+ plt.title('Fairness Metrics Across Protected Groups')
+ plt.legend()
+ plt.tight_layout()
+ plt.savefig('fairness_metrics.png')
+ plt.close()
+ except Exception as e:
+        logger.error(f"Error occurred while plotting fairness metrics: {str(e)}")
\ No newline at end of file
diff --git a/explainableai/feature_analysis.py b/explainableai/feature_analysis.py
index 03182b3..ee8bd4b 100644
--- a/explainableai/feature_analysis.py
+++ b/explainableai/feature_analysis.py
@@ -3,10 +3,14 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
+import logging
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def calculate_shap_values(model, X, feature_names):
try:
# Convert X to a DataFrame if it's not already
+        logger.debug("Converting X to a DataFrame...")
X = pd.DataFrame(X, columns=feature_names)
if hasattr(model, "predict_proba"):
@@ -23,12 +27,12 @@ def calculate_shap_values(model, X, feature_names):
plt.tight_layout()
plt.show()
plt.close()
-
+        logger.info("SHAP values calculated...")
return shap_values
except Exception as e:
- print(f"Error calculating SHAP values: {e}")
- print("Model type:", type(model))
- print("X shape:", X.shape)
- print("X dtype:", X.dtypes)
- print("Feature names:", feature_names)
+ logger.error(f"Error calculating SHAP values: {e}")
+        logger.error(f"Model type: {type(model)}")
+        logger.error(f"X shape: {X.shape}")
+        logger.error(f"X dtype: {X.dtypes}")
+        logger.error(f"Feature names: {feature_names}")
return None
\ No newline at end of file
diff --git a/explainableai/feature_engineering.py b/explainableai/feature_engineering.py
index 2a78f0f..78e2910 100644
--- a/explainableai/feature_engineering.py
+++ b/explainableai/feature_engineering.py
@@ -1,22 +1,34 @@
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def automated_feature_engineering(X_train, X_test=None):
# Convert categorical variables to one-hot encoding
- X_train_encoded = pd.get_dummies(X_train, drop_first=True)
-
- if X_test is not None:
- X_test_encoded = pd.get_dummies(X_test, drop_first=True)
- # Ensure X_test has the same columns as X_train
- for col in X_train_encoded.columns:
- if col not in X_test_encoded.columns:
- X_test_encoded[col] = 0
- X_test_encoded = X_test_encoded[X_train_encoded.columns]
-
- feature_names = X_train_encoded.columns.tolist()
-
- if X_test is not None:
- return X_train_encoded.values, X_test_encoded.values, feature_names
-
- return X_train_encoded.values, feature_names
\ No newline at end of file
+    logger.debug("Converting categorical variables to one-hot encoding...")
+ try:
+ X_train_encoded = pd.get_dummies(X_train, drop_first=True)
+
+ if X_test is not None:
+ X_test_encoded = pd.get_dummies(X_test, drop_first=True)
+ # Ensure X_test has the same columns as X_train
+ logger.debug("Ensuring test data has the same columns as training data...")
+ for col in X_train_encoded.columns:
+ if col not in X_test_encoded.columns:
+ X_test_encoded[col] = 0
+ X_test_encoded = X_test_encoded[X_train_encoded.columns]
+
+
+ feature_names = X_train_encoded.columns.tolist()
+
+ if X_test is not None:
+ logger.info("Data Converted...")
+ return X_train_encoded.values, X_test_encoded.values, feature_names
+
+ logger.info("Data Converted...")
+ return X_train_encoded.values, feature_names
+ except Exception as e:
+ logger.error(f"Error occurred during automated feature engineering...{str(e)}")
\ No newline at end of file
diff --git a/explainableai/feature_interaction.py b/explainableai/feature_interaction.py
index de0edcf..0830a39 100644
--- a/explainableai/feature_interaction.py
+++ b/explainableai/feature_interaction.py
@@ -4,39 +4,46 @@
import matplotlib.pyplot as plt
from sklearn.inspection import partial_dependence
import time
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def analyze_feature_interactions(model, X, feature_names, top_n=5, max_interactions=10):
- print("Starting feature interaction analysis...")
- feature_importance = dict(zip(feature_names, model.feature_importances_))
- top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:top_n]
- top_feature_names = [f[0] for f in top_features]
+ logger.debug("Starting feature interaction analysis...")
+ try:
+ feature_importance = dict(zip(feature_names, model.feature_importances_))
+ top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:top_n]
+ top_feature_names = [f[0] for f in top_features]
- interactions = []
- for i, (f1, f2) in enumerate(itertools.combinations(top_feature_names, 2)):
- if i >= max_interactions:
- print(f"Reached maximum number of interactions ({max_interactions}). Stopping analysis.")
- break
-
- print(f"Analyzing interaction between {f1} and {f2}...")
- start_time = time.time()
- f1_idx = feature_names.index(f1)
- f2_idx = feature_names.index(f2)
- pd_result = partial_dependence(model, X, features=[f1_idx, f2_idx], kind="average")
- interactions.append((f1, f2, pd_result))
- print(f"Interaction analysis for {f1} and {f2} completed in {time.time() - start_time:.2f} seconds.")
+ interactions = []
+ for i, (f1, f2) in enumerate(itertools.combinations(top_feature_names, 2)):
+ if i >= max_interactions:
+ logger.info(f"Reached maximum number of interactions ({max_interactions}). Stopping analysis.")
+ break
+
+ logger.info(f"Analyzing interaction between {f1} and {f2}...")
+ start_time = time.time()
+ f1_idx = feature_names.index(f1)
+ f2_idx = feature_names.index(f2)
+ pd_result = partial_dependence(model, X, features=[f1_idx, f2_idx], kind="average")
+ interactions.append((f1, f2, pd_result))
+ logger.info(f"Interaction analysis for {f1} and {f2} completed in {time.time() - start_time:.2f} seconds.")
- for i, (f1, f2, (pd_values, (ax1_values, ax2_values))) in enumerate(interactions):
- print(f"Plotting interaction {i+1} between {f1} and {f2}...")
- fig, ax = plt.subplots(figsize=(10, 6))
- XX, YY = np.meshgrid(ax1_values, ax2_values)
- Z = pd_values.reshape(XX.shape).T
- contour = ax.contourf(XX, YY, Z, cmap="RdBu_r", alpha=0.5)
- ax.set_xlabel(f1)
- ax.set_ylabel(f2)
- ax.set_title(f'Partial Dependence of {f1} and {f2}')
- plt.colorbar(contour)
- plt.savefig(f'interaction_{i+1}_{f1}_{f2}.png')
- plt.close()
+ for i, (f1, f2, (pd_values, (ax1_values, ax2_values))) in enumerate(interactions):
+ logger.debug(f"Plotting interaction {i+1} between {f1} and {f2}...")
+ fig, ax = plt.subplots(figsize=(10, 6))
+ XX, YY = np.meshgrid(ax1_values, ax2_values)
+ Z = pd_values.reshape(XX.shape).T
+ contour = ax.contourf(XX, YY, Z, cmap="RdBu_r", alpha=0.5)
+ ax.set_xlabel(f1)
+ ax.set_ylabel(f2)
+ ax.set_title(f'Partial Dependence of {f1} and {f2}')
+ plt.colorbar(contour)
+ plt.savefig(f'interaction_{i+1}_{f1}_{f2}.png')
+ plt.close()
- print("Feature interaction analysis completed.")
- return interactions
\ No newline at end of file
+ logger.info("Feature interaction analysis completed.")
+ return interactions
+ except Exception as e:
+        logger.error(f"Error occurred during feature interaction analysis: {str(e)}")
\ No newline at end of file
diff --git a/explainableai/feature_selection.py b/explainableai/feature_selection.py
index 42291b7..c570608 100644
--- a/explainableai/feature_selection.py
+++ b/explainableai/feature_selection.py
@@ -1,7 +1,16 @@
from sklearn.feature_selection import SelectKBest, f_classif
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def select_features(X, y, k=10):
- selector = SelectKBest(score_func=f_classif, k=k)
- X_selected = selector.fit_transform(X, y)
- selected_feature_indices = selector.get_support(indices=True)
- return X_selected, selected_feature_indices
\ No newline at end of file
+    logger.debug("Selecting features...")
+ try:
+ selector = SelectKBest(score_func=f_classif, k=k)
+ X_selected = selector.fit_transform(X, y)
+ selected_feature_indices = selector.get_support(indices=True)
+        logger.info("Features selected...")
+ return X_selected, selected_feature_indices
+ except Exception as e:
+ logger.error(f"Some error occurred in feature selection...{str(e)}")
\ No newline at end of file
diff --git a/explainableai/llm_explanations.py b/explainableai/llm_explanations.py
index b611e82..5e0cd55 100644
--- a/explainableai/llm_explanations.py
+++ b/explainableai/llm_explanations.py
@@ -3,107 +3,126 @@
import google.generativeai as genai
import os
from dotenv import load_dotenv
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
def initialize_gemini():
- genai.configure(api_key=GOOGLE_API_KEY)
- generation_config = {
- "temperature": 0.7,
- "top_p": 0.95,
- "top_k": 0,
- "max_output_tokens": 8192,
- }
- safety_settings = [
- {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
- {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
- {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
- {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
- ]
- return genai.GenerativeModel(model_name="gemini-1.5-pro", generation_config=generation_config, safety_settings=safety_settings)
+    logger.debug("Initializing Gemini...")
+ try:
+ genai.configure(api_key=GOOGLE_API_KEY)
+ generation_config = {
+ "temperature": 0.7,
+ "top_p": 0.95,
+ "top_k": 0,
+ "max_output_tokens": 8192,
+ }
+ safety_settings = [
+ {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+ {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+ {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+ {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+ ]
+        logger.info("Gemini initialized successfully...")
+ return genai.GenerativeModel(model_name="gemini-1.5-pro", generation_config=generation_config, safety_settings=safety_settings)
+ except Exception as e:
+        logger.error(f"Error occurred while initializing Gemini: {str(e)}")
def get_llm_explanation(model, results):
- prompt = f"""
- As an AI expert, please provide a clear and concise explanation of the following machine learning model results for a non-technical audience:
+    logger.debug("Generating LLM explanation...")
+ try:
+ prompt = f"""
+ As an AI expert, please provide a clear and concise explanation of the following machine learning model results for a non-technical audience:
- Model Performance:
- {results['model_performance']}
+ Model Performance:
+ {results['model_performance']}
- Top 5 Important Features:
- {dict(list(results['feature_importance'].items())[:5])}
+ Top 5 Important Features:
+ {dict(list(results['feature_importance'].items())[:5])}
- Cross-validation Score:
- Mean: {results['cv_scores'][0]:.4f}
- Standard Deviation: {results['cv_scores'][1]:.4f}
+ Cross-validation Score:
+ Mean: {results['cv_scores'][0]:.4f}
+ Standard Deviation: {results['cv_scores'][1]:.4f}
- Please explain:
- 1. What these results mean in simple terms.
- 2. The model's overall performance and reliability.
- 3. Which features are most important and why they might matter.
- 4. Suggestions for potential next steps or areas of improvement.
+ Please explain:
+ 1. What these results mean in simple terms.
+ 2. The model's overall performance and reliability.
+ 3. Which features are most important and why they might matter.
+ 4. Suggestions for potential next steps or areas of improvement.
- Format your response as follows:
-
- Summary:
- [Provide a brief 2-3 sentence summary of the overall results]
+ Format your response as follows:
+
+ Summary:
+ [Provide a brief 2-3 sentence summary of the overall results]
- Model Performance:
- [Explain the model's performance metrics]
+ Model Performance:
+ [Explain the model's performance metrics]
- Important Features:
- [Discuss the top 5 important features and their potential significance]
+ Important Features:
+ [Discuss the top 5 important features and their potential significance]
- Next Steps:
- [Suggest 2-3 potential next steps or areas for improvement]
+ Next Steps:
+ [Suggest 2-3 potential next steps or areas for improvement]
- Keep the explanation under 500 words and avoid technical jargon.
- """
+ Keep the explanation under 500 words and avoid technical jargon.
+ """
- response = model.generate_content(prompt)
- return response.text
+ response = model.generate_content(prompt)
+ logger.info("Response Generated...")
+ return response.text
+ except Exception as e:
+        logger.error(f"Error occurred while generating response: {str(e)}")
def get_prediction_explanation(model, input_data, prediction, probabilities, feature_importance):
- if feature_importance:
- top_features = dict(sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True)[:5])
- else:
- top_features = {}
- prompt = f"""
- As an AI expert, please provide a clear and concise explanation of the following prediction for a non-technical audience:
-
- Input Data:
- {input_data}
-
- Prediction: {prediction}
- Probabilities: {probabilities}
-
- Top 5 Important Features:
- {top_features}
-
- Please explain:
- 1. What the prediction means in simple terms.
- 2. How confident the model is in its prediction.
- 3. Which input features likely contributed most to this prediction and why.
- 4. Any potential limitations or considerations for this prediction.
- 5. Suggestions for what the user might do with this information.
-
- Format your response as follows:
-
- Prediction Summary:
- [Provide a brief 2-3 sentence summary of the prediction and its confidence]
-
- Key Factors:
- [Discuss the top 3-5 input features that likely influenced this prediction]
-
- Considerations:
- [Mention any limitations or important considerations for interpreting this prediction]
-
- Next Steps:
- [Suggest 2-3 potential actions or decisions the user might make based on this prediction]
-
- Keep the explanation under 300 words and avoid technical jargon.
- """
-
- response = model.generate_content(prompt)
- return response.text
\ No newline at end of file
+    logger.debug("Generating prediction explanation...")
+ try:
+ if feature_importance:
+ top_features = dict(sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True)[:5])
+ else:
+ top_features = {}
+ prompt = f"""
+ As an AI expert, please provide a clear and concise explanation of the following prediction for a non-technical audience:
+
+ Input Data:
+ {input_data}
+
+ Prediction: {prediction}
+ Probabilities: {probabilities}
+
+ Top 5 Important Features:
+ {top_features}
+
+ Please explain:
+ 1. What the prediction means in simple terms.
+ 2. How confident the model is in its prediction.
+ 3. Which input features likely contributed most to this prediction and why.
+ 4. Any potential limitations or considerations for this prediction.
+ 5. Suggestions for what the user might do with this information.
+
+ Format your response as follows:
+
+ Prediction Summary:
+ [Provide a brief 2-3 sentence summary of the prediction and its confidence]
+
+ Key Factors:
+ [Discuss the top 3-5 input features that likely influenced this prediction]
+
+ Considerations:
+ [Mention any limitations or important considerations for interpreting this prediction]
+
+ Next Steps:
+ [Suggest 2-3 potential actions or decisions the user might make based on this prediction]
+
+ Keep the explanation under 300 words and avoid technical jargon.
+ """
+
+ response = model.generate_content(prompt)
+        logger.info("Prediction explanation generated...")
+ return response.text
+ except Exception as e:
+        logger.error(f"Error occurred while explaining the prediction: {str(e)}")
diff --git a/explainableai/model_comparison.py b/explainableai/model_comparison.py
index 480b59e..04452a2 100644
--- a/explainableai/model_comparison.py
+++ b/explainableai/model_comparison.py
@@ -4,16 +4,24 @@
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
+import logging
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def compare_models(models, X, y, cv=5):
- results = {}
- for name, model in models.items():
- scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
- results[name] = {
- 'mean_score': np.mean(scores),
- 'std_score': np.std(scores)
- }
- return results
+ logger.debug("Comparing models...")
+ try:
+ results = {}
+ for name, model in models.items():
+ scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
+ results[name] = {
+ 'mean_score': np.mean(scores),
+ 'std_score': np.std(scores)
+ }
+        logger.info("Model comparison successful...")
+ return results
+ except Exception as e:
+        logger.error(f"Error occurred during model comparison: {str(e)}")
def plot_roc_curves(models, X, y):
plt.figure(figsize=(10, 8))
@@ -34,6 +42,7 @@ def plot_roc_curves(models, X, y):
plt.close()
def mcnemar_test(model1, model2, X, y):
+    logger.debug("Running McNemar's test...")
y_pred1 = model1.predict(X)
y_pred2 = model2.predict(X)
diff --git a/explainableai/model_evaluation.py b/explainableai/model_evaluation.py
index 55fd055..e13e9f2 100644
--- a/explainableai/model_evaluation.py
+++ b/explainableai/model_evaluation.py
@@ -3,37 +3,57 @@
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, confusion_matrix, classification_report
import numpy as np
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def evaluate_model(model, X, y, is_classifier):
- if is_classifier:
- return evaluate_classifier(model, X, y)
- else:
- return evaluate_regressor(model, X, y)
+    logger.debug("Evaluating model...")
+ try:
+ if is_classifier:
+ return evaluate_classifier(model, X, y)
+ else:
+ return evaluate_regressor(model, X, y)
+ except Exception as e:
+        logger.error(f"Error occurred while evaluating the model: {str(e)}")
def evaluate_classifier(model, X, y):
- y_pred = model.predict(X)
- accuracy = accuracy_score(y, y_pred)
- f1 = f1_score(y, y_pred, average='weighted')
- conf_matrix = confusion_matrix(y, y_pred)
- class_report = classification_report(y, y_pred)
-
- return {
- "accuracy": accuracy,
- "f1_score": f1,
- "confusion_matrix": conf_matrix,
- "classification_report": class_report
- }
-
+    logger.debug("Evaluating classifier...")
+ try:
+ y_pred = model.predict(X)
+ accuracy = accuracy_score(y, y_pred)
+ f1 = f1_score(y, y_pred, average='weighted')
+ conf_matrix = confusion_matrix(y, y_pred)
+ class_report = classification_report(y, y_pred)
+        logger.info("Classification report generated...")
+ return {
+ "accuracy": accuracy,
+ "f1_score": f1,
+ "confusion_matrix": conf_matrix,
+ "classification_report": class_report
+ }
+ except Exception as e:
+        logger.error(f"Error occurred while evaluating the classifier: {str(e)}")
def evaluate_regressor(model, X, y):
- y_pred = model.predict(X)
- mse = mean_squared_error(y, y_pred)
- r2 = r2_score(y, y_pred)
-
- return {
- "mean_squared_error": mse,
- "r2_score": r2
- }
+    logger.debug("Evaluating regressor...")
+ try:
+ y_pred = model.predict(X)
+ mse = mean_squared_error(y, y_pred)
+ r2 = r2_score(y, y_pred)
+        logger.info("Regressor evaluated...")
+ return {
+ "mean_squared_error": mse,
+ "r2_score": r2
+ }
+ except Exception as e:
+        logger.error(f"Error occurred while evaluating the regressor: {str(e)}")
def cross_validate(model, X, y, cv=5):
- scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
- return scores.mean(), scores.std()
\ No newline at end of file
+    logger.debug("Performing cross-validation...")
+ try:
+ scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
+        logger.info("Cross-validation complete...")
+ return scores.mean(), scores.std()
+ except Exception as e:
+        logger.error(f"Error occurred during cross-validation: {str(e)}")
\ No newline at end of file
diff --git a/explainableai/model_interpretability.py b/explainableai/model_interpretability.py
index 321a4c2..6fdc051 100644
--- a/explainableai/model_interpretability.py
+++ b/explainableai/model_interpretability.py
@@ -4,14 +4,23 @@
import lime.lime_tabular
import matplotlib.pyplot as plt
import numpy as np
+import logging
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def calculate_shap_values(model, X):
- explainer = shap.Explainer(model, X)
- shap_values = explainer(X)
- return shap_values
+    logger.debug("Calculating SHAP values...")
+ try:
+ explainer = shap.Explainer(model, X)
+ shap_values = explainer(X)
+        logger.info("SHAP values calculated...")
+ return shap_values
+ except Exception as e:
+        logger.error(f"Error occurred while calculating SHAP values: {str(e)}")
def plot_shap_summary(shap_values, X):
+    logger.debug("Plotting SHAP summary...")
try:
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X, plot_type="bar", show=False)
@@ -19,8 +28,8 @@ def plot_shap_summary(shap_values, X):
plt.savefig('shap_summary.png')
plt.close()
except TypeError as e:
- print(f"Error in generating SHAP summary plot: {e}")
- print("Attempting alternative SHAP visualization...")
+ logger.error(f"Error in generating SHAP summary plot: {str(e)}")
+ logger.error("Attempting alternative SHAP visualization...")
try:
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values.values, X.values, feature_names=X.columns.tolist(), plot_type="bar", show=False)
@@ -28,18 +37,23 @@ def plot_shap_summary(shap_values, X):
plt.savefig('shap_summary.png')
plt.close()
except Exception as e2:
- print(f"Alternative SHAP visualization also failed: {e2}")
- print("Skipping SHAP summary plot.")
+ logger.error(f"Alternative SHAP visualization also failed: {str(e2)}")
+ logger.error("Skipping SHAP summary plot.")
def get_lime_explanation(model, X, instance, feature_names):
- explainer = lime.lime_tabular.LimeTabularExplainer(
- X,
- feature_names=feature_names,
- class_names=['Negative', 'Positive'],
- mode='classification'
- )
- exp = explainer.explain_instance(instance, model.predict_proba)
- return exp
+    logger.debug("Generating LIME explanation...")
+ try:
+ explainer = lime.lime_tabular.LimeTabularExplainer(
+ X,
+ feature_names=feature_names,
+ class_names=['Negative', 'Positive'],
+ mode='classification'
+ )
+ exp = explainer.explain_instance(instance, model.predict_proba)
+        logger.info("LIME explanation generated...")
+ return exp
+ except Exception as e:
+        logger.error(f"Error occurred while generating LIME explanation: {str(e)}")
def plot_lime_explanation(exp):
exp.as_pyplot_figure()
diff --git a/explainableai/model_selection.py b/explainableai/model_selection.py
index aa6aea2..f2fd184 100644
--- a/explainableai/model_selection.py
+++ b/explainableai/model_selection.py
@@ -7,8 +7,13 @@
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def get_default_models():
+ logger.info("Got default models...")
return {
'Logistic Regression': LogisticRegression(max_iter=1000),
'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
@@ -17,37 +22,46 @@ def get_default_models():
}
def compare_models(X_train, y_train, X_test, y_test, models=None):
- if models is None:
- models = get_default_models()
-
- results = {}
- for name, model in models.items():
- cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
- model.fit(X_train, y_train)
- test_score = model.score(X_test, y_test)
- results[name] = {
- 'cv_score': cv_scores.mean(),
- 'test_score': test_score
- }
-
- plot_roc_curves(models, X_test, y_test)
-
- return results
+ logger.debug("Comparing models...")
+ try:
+ if models is None:
+ models = get_default_models()
+
+ results = {}
+ for name, model in models.items():
+ cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
+ model.fit(X_train, y_train)
+ test_score = model.score(X_test, y_test)
+ results[name] = {
+ 'cv_score': cv_scores.mean(),
+ 'test_score': test_score
+ }
+
+ plot_roc_curves(models, X_test, y_test)
+ logger.info("Models compared successfully...")
+ return results
+ except Exception as e:
+ logger.error(f"Some error occurred in comparing models...{str(e)}")
def plot_roc_curves(models, X_test, y_test):
- plt.figure(figsize=(10, 8))
- for name, model in models.items():
- y_pred_proba = model.predict_proba(X_test)[:, 1]
- fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
- roc_auc = auc(fpr, tpr)
- plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
-
- plt.plot([0, 1], [0, 1], 'k--')
- plt.xlim([0.0, 1.0])
- plt.ylim([0.0, 1.05])
- plt.xlabel('False Positive Rate')
- plt.ylabel('True Positive Rate')
- plt.title('Receiver Operating Characteristic (ROC) Curve')
- plt.legend(loc="lower right")
- plt.savefig('model_comparison_roc_curves.png')
- plt.close()
\ No newline at end of file
+    logger.debug("Plotting ROC curves...")
+ try:
+ plt.figure(figsize=(10, 8))
+ for name, model in models.items():
+ y_pred_proba = model.predict_proba(X_test)[:, 1]
+ fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
+ roc_auc = auc(fpr, tpr)
+ plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
+
+ plt.plot([0, 1], [0, 1], 'k--')
+ plt.xlim([0.0, 1.0])
+ plt.ylim([0.0, 1.05])
+ plt.xlabel('False Positive Rate')
+ plt.ylabel('True Positive Rate')
+ plt.title('Receiver Operating Characteristic (ROC) Curve')
+ plt.legend(loc="lower right")
+ plt.savefig('model_comparison_roc_curves.png')
+        logger.info("ROC curves plotted...")
+ plt.close()
+ except Exception as e:
+ logger.error(f"Some error occurred in plotting the curve...{str(e)}")
\ No newline at end of file
diff --git a/explainableai/report_generator.py b/explainableai/report_generator.py
index 65d660d..cc3ac12 100644
--- a/explainableai/report_generator.py
+++ b/explainableai/report_generator.py
@@ -8,8 +8,13 @@
import io
from PIL import Image as PILImage
import re
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
class ReportGenerator:
def __init__(self, filename):
+        logger.debug("Initializing report generator...")
self.filename = filename
self.doc = SimpleDocTemplate(filename, pagesize=letter, topMargin=0.5*inch, bottomMargin=0.5*inch)
@@ -19,6 +24,7 @@ def __init__(self, filename):
self.content = []
def setup_styles(self):
+ logger.debug("Setting up the styles...")
self.custom_styles['Heading1'] = ParagraphStyle(
'CustomHeading1',
parent=self.styles['Heading1'],
@@ -52,23 +58,27 @@ def setup_styles(self):
)
def add_heading(self, text, level=1):
+ logger.debug(f"Adding heading: {text}")
style = self.custom_styles[f'Heading{level}']
self.content.append(Paragraph(text, style))
self.content.append(Spacer(1, 12))
def add_paragraph(self, text):
+ logger.debug(f"Adding paragraph: {text}")
formatted_text = self.format_text(text)
self.content.append(Paragraph(formatted_text, self.custom_styles['BodyText']))
self.content.append(Spacer(1, 6))
def format_text(self, text):
# Convert Markdown-style formatting to ReportLab's XML-like tags
+        logger.debug(f"Formatting text: {text}")
text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text) # Bold
text = re.sub(r'\*(.*?)\*', r'<i>\1</i>', text) # Italic
text = re.sub(r'`(.*?)`', r'<font face="Courier">\1</font>', text) # Code
return text
def add_llm_explanation(self, explanation):
+ logger.debug("Adding LLM explanation...")
lines = explanation.split('\n')
for line in lines:
if line.startswith('##'):
@@ -79,6 +89,7 @@ def add_llm_explanation(self, explanation):
self.add_paragraph(line)
def add_image(self, image_path, width=6*inch, height=4*inch):
+ logger.debug("Adding image...")
try:
img = PILImage.open(image_path)
img.thumbnail((width, height), PILImage.LANCZOS)
@@ -89,9 +100,10 @@ def add_image(self, image_path, width=6*inch, height=4*inch):
self.content.append(img)
self.content.append(Spacer(1, 12))
except Exception as e:
- print(f"Error adding image {image_path}: {str(e)}")
+ logger.error(f"Error adding image {image_path}: {str(e)}")
def add_table(self, data, col_widths=None):
+ logger.debug("Adding table...")
table = Table(data, colWidths=col_widths)
table.setStyle(TableStyle([
('BACKGROUND', (0, 0), (-1, 0), colors.grey),
@@ -115,6 +127,6 @@ def add_table(self, data, col_widths=None):
def generate(self):
try:
self.doc.build(self.content)
- print(f"Report generated successfully: {self.filename}")
+ logger.info(f"Report generated successfully: {self.filename}")
except Exception as e:
- print(f"Error generating report: {str(e)}")
\ No newline at end of file
+ logger.error(f"Error generating report: {str(e)}")
\ No newline at end of file
diff --git a/explainableai/utils.py b/explainableai/utils.py
index 8cb695a..102b78a 100644
--- a/explainableai/utils.py
+++ b/explainableai/utils.py
@@ -1,29 +1,44 @@
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.inspection import permutation_importance
import numpy as np
+import logging
-def explain_model(model, X_train, y_train, X_test, y_test, feature_names):
- result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
- feature_importance = {feature: importance for feature, importance in zip(feature_names, result.importances_mean)}
-
- # Sort feature importance by absolute value
- feature_importance = dict(sorted(feature_importance.items(), key=lambda item: abs(item[1]), reverse=True))
-
- return {
- "feature_importance": feature_importance,
- "model_type": str(type(model)),
- }
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
-def calculate_metrics(model, X_test, y_test):
- y_pred = model.predict(X_test)
-
- if len(np.unique(y_test)) == 2: # Binary classification
+def explain_model(model, X_train, y_train, X_test, y_test, feature_names):
+ logger.debug("Explaining model...")
+ try:
+ result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
+ feature_importance = {feature: importance for feature, importance in zip(feature_names, result.importances_mean)}
+
+ # Sort feature importance by absolute value
+ feature_importance = dict(sorted(feature_importance.items(), key=lambda item: abs(item[1]), reverse=True))
+
+ logger.info("Model explained...")
return {
- "accuracy": accuracy_score(y_test, y_pred),
- "f1_score": f1_score(y_test, y_pred, average='weighted')
+ "feature_importance": feature_importance,
+ "model_type": str(type(model)),
}
- else: # Regression or multi-class classification
- return {
- "mse": mean_squared_error(y_test, y_pred),
- "r2": r2_score(y_test, y_pred)
- }
\ No newline at end of file
+ except Exception as e:
+ logger.error(f"Some error occurred in explaining model...{str(e)}")
+
+def calculate_metrics(model, X_test, y_test):
+ logger.debug("Calculation of metrics...")
+ try:
+ y_pred = model.predict(X_test)
+
+ if len(np.unique(y_test)) == 2: # Binary classification
+ logger.info("Binary classification... ")
+ return {
+ "accuracy": accuracy_score(y_test, y_pred),
+ "f1_score": f1_score(y_test, y_pred, average='weighted')
+ }
+ else: # Regression or multi-class classification
+            logger.info("Regression or multi-class classification...")
+ return {
+ "mse": mean_squared_error(y_test, y_pred),
+ "r2": r2_score(y_test, y_pred)
+ }
+ except Exception as e:
+ logger.error(f"Some error occurred in metric calculation...{str(e)}")
\ No newline at end of file
diff --git a/explainableai/visualizations.py b/explainableai/visualizations.py
index efd2118..af8dac1 100644
--- a/explainableai/visualizations.py
+++ b/explainableai/visualizations.py
@@ -8,103 +8,134 @@
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
import plotly.graph_objs as go
from plotly.subplots import make_subplots
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def plot_feature_importance(feature_importance):
- plt.figure(figsize=(12, 8))
- sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
- features, importance = zip(*sorted_features)
- plt.bar(features, importance)
- plt.title('Feature Importance')
- plt.xlabel('Features')
- plt.ylabel('Importance')
- plt.xticks(rotation=90)
- plt.tight_layout()
- plt.savefig('feature_importance.png')
- plt.close()
+ logger.debug("Plotting feature importance...")
+ try:
+ plt.figure(figsize=(12, 8))
+ sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
+ features, importance = zip(*sorted_features)
+ plt.bar(features, importance)
+ plt.title('Feature Importance')
+ plt.xlabel('Features')
+ plt.ylabel('Importance')
+ plt.xticks(rotation=90)
+ plt.tight_layout()
+ plt.savefig('feature_importance.png')
+ plt.close()
+ logger.info("Feature importance plot saved...")
+ except Exception as e:
+ logger.error(f"Some error occurred while plotting feature importance...{str(e)}")
def plot_partial_dependence(model, X, feature_importance, feature_names):
- top_features = sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True)[:3]
- top_feature_indices = [feature_names.index(feature) for feature, _ in top_features]
-
- fig, ax = plt.subplots(figsize=(15, 5))
- display = PartialDependenceDisplay.from_estimator(
- model, X, top_feature_indices,
- feature_names=feature_names,
- kind="average", subsample=1000,
- n_jobs=3, grid_resolution=20
- )
- display.plot(ax=ax)
-
- for ax in display.axes_.ravel():
- ylim = ax.get_ylim()
- if ylim[0] == ylim[1]:
- ax.set_ylim(ylim[0] - 0.1, ylim[1] + 0.1)
-
- plt.suptitle('Partial Dependence of Top 3 Features')
- plt.tight_layout()
- plt.savefig('partial_dependence.png')
- plt.close()
+ logger.debug("Plotting partial dependence...")
+ try:
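+        # Keep only the three features with the largest absolute importance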
+ top_features = sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True)[:3]
+ top_feature_indices = [feature_names.index(feature) for feature, _ in top_features]
+
+ fig, ax = plt.subplots(figsize=(15, 5))
+ display = PartialDependenceDisplay.from_estimator(
+ model, X, top_feature_indices,
+ feature_names=feature_names,
+ kind="average", subsample=1000,
+ n_jobs=3, grid_resolution=20
+ )
+ display.plot(ax=ax)
+
+ for ax in display.axes_.ravel():
+ ylim = ax.get_ylim()
+ if ylim[0] == ylim[1]:
+ ax.set_ylim(ylim[0] - 0.1, ylim[1] + 0.1)
+
+ plt.suptitle('Partial Dependence of Top 3 Features')
+ plt.tight_layout()
+ plt.savefig('partial_dependence.png')
+ plt.close()
+ logger.info("Partial dependence plot saved...")
+ except Exception as e:
+ logger.error(f"Some error occurred while plot partial dependence...{str(e)}")
def plot_learning_curve(model, X, y, cv=5):
- train_sizes, train_scores, test_scores = learning_curve(
- model, X, y, cv=cv, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))
-
- plt.figure(figsize=(10, 6))
- plt.title("Learning Curve")
- plt.xlabel("Training examples")
- plt.ylabel("Score")
- plt.grid()
+ logger.debug("Plotting learning curve...")
+ try:
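+        # Evaluate the model at 10 training-set sizes from 10% to 100% of the data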
+ train_sizes, train_scores, test_scores = learning_curve(
+ model, X, y, cv=cv, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))
+
+ plt.figure(figsize=(10, 6))
+ plt.title("Learning Curve")
+ plt.xlabel("Training examples")
+ plt.ylabel("Score")
+ plt.grid()
- train_scores_mean = np.mean(train_scores, axis=1)
- train_scores_std = np.std(train_scores, axis=1)
- test_scores_mean = np.mean(test_scores, axis=1)
- test_scores_std = np.std(test_scores, axis=1)
+ train_scores_mean = np.mean(train_scores, axis=1)
+ train_scores_std = np.std(train_scores, axis=1)
+ test_scores_mean = np.mean(test_scores, axis=1)
+ test_scores_std = np.std(test_scores, axis=1)
- plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
- train_scores_mean + train_scores_std, alpha=0.1, color="r")
- plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
- test_scores_mean + test_scores_std, alpha=0.1, color="g")
- plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
- plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
+ plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
+ train_scores_mean + train_scores_std, alpha=0.1, color="r")
+ plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
+ test_scores_mean + test_scores_std, alpha=0.1, color="g")
+ plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
+ plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
- plt.legend(loc="best")
- plt.savefig('learning_curve.png')
- plt.close()
+ plt.legend(loc="best")
+ plt.savefig('learning_curve.png')
+ plt.close()
+ logger.info("Learning curve plot saved.")
+ except Exception as e:
+ logger.error(f"Some error occurred while plotting learning curve..{str(e)}")
def plot_roc_curve(model, X_test, y_test):
- y_pred_proba = model.predict_proba(X_test)[:, 1]
- fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
- roc_auc = auc(fpr, tpr)
+ logger.debug("Plotting roc curve...")
+ try:
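+        # The ROC curve is computed from the positive-class probabilities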
+ y_pred_proba = model.predict_proba(X_test)[:, 1]
+ fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
+ roc_auc = auc(fpr, tpr)
- plt.figure(figsize=(8, 6))
- plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
- plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
- plt.xlim([0.0, 1.0])
- plt.ylim([0.0, 1.05])
- plt.xlabel('False Positive Rate')
- plt.ylabel('True Positive Rate')
- plt.title('Receiver Operating Characteristic (ROC) Curve')
- plt.legend(loc="lower right")
- plt.savefig('roc_curve.png')
- plt.close()
+ plt.figure(figsize=(8, 6))
+ plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
+ plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
+ plt.xlim([0.0, 1.0])
+ plt.ylim([0.0, 1.05])
+ plt.xlabel('False Positive Rate')
+ plt.ylabel('True Positive Rate')
+ plt.title('Receiver Operating Characteristic (ROC) Curve')
+ plt.legend(loc="lower right")
+ plt.savefig('roc_curve.png')
+ plt.close()
+ logger.info("Plotting roc curve successfully...")
+ except Exception as e:
+ logger.error(f"Some error occurred while plotting roc curve...{str(e)}")
def plot_precision_recall_curve(model, X_test, y_test):
- y_pred_proba = model.predict_proba(X_test)[:, 1]
- precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
- average_precision = average_precision_score(y_test, y_pred_proba)
+ logger.debug("Plot precision recall curve...")
+ try:
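+        # The precision-recall curve is likewise computed from the positive-class probabilities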
+ y_pred_proba = model.predict_proba(X_test)[:, 1]
+ precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
+ average_precision = average_precision_score(y_test, y_pred_proba)
- plt.figure(figsize=(8, 6))
- plt.step(recall, precision, color='b', alpha=0.2, where='post')
- plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
- plt.xlabel('Recall')
- plt.ylabel('Precision')
- plt.ylim([0.0, 1.05])
- plt.xlim([0.0, 1.0])
- plt.title(f'Precision-Recall curve: AP={average_precision:.2f}')
- plt.savefig('precision_recall_curve.png')
- plt.close()
+ plt.figure(figsize=(8, 6))
+ plt.step(recall, precision, color='b', alpha=0.2, where='post')
+ plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
+ plt.xlabel('Recall')
+ plt.ylabel('Precision')
+ plt.ylim([0.0, 1.05])
+ plt.xlim([0.0, 1.0])
+ plt.title(f'Precision-Recall curve: AP={average_precision:.2f}')
+ plt.savefig('precision_recall_curve.png')
+ plt.close()
+ logger.info("Plot precision recall curve successfully...")
+ except Exception as e:
+ logger.error(f"Some error occurred while plotting precision recall curve...{str(e)}")
+
def plot_interactive_feature_importance(feature_importance):
+ logger.debug("Plot interactive feature importance...")
sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
features, importance = zip(*sorted_features)
@@ -120,6 +151,7 @@ def plot_interactive_feature_importance(feature_importance):
fig.write_html('interactive_feature_importance.html')
def plot_correlation_heatmap(X):
+ logger.debug("Plot correlation heatmap")
corr = X.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=0.5)
diff --git a/main.py b/main.py
index b44dc14..595c29d 100644
--- a/main.py
+++ b/main.py
@@ -8,22 +8,28 @@
from sklearn.neural_network import MLPClassifier
from explainableai import XAIWrapper
import argparse
+import logging
+logger=logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
def main(file_path, target_column):
# Import the dataset
- print("Importing dataset...")
+ logger.info("Importing dataset...")
df = pd.read_csv(file_path)
# Perform EDA
+ logger.debug("Performing EDA...")
XAIWrapper.perform_eda(df)
X = df.drop(columns=[target_column])
y = df[target_column]
# Split the data
+ logger.debug("Splitting the data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create models
+ logger.info("Creating the models...")
models = {
'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
'Logistic Regression': LogisticRegression(max_iter=1000),
@@ -32,24 +38,28 @@ def main(file_path, target_column):
}
# Create XAIWrapper instance
+ logger.debug("Creating XAIWrapper instance...")
xai = XAIWrapper()
# Fit the models and run XAI analysis
+ logger.debug("Fitting the models and run the XAI analysis...")
xai.fit(models, X_train, y_train)
results = xai.analyze()
- print("\nLLM Explanation of Results:")
- print(results['llm_explanation'])
+ logger.info("\nLLM Explanation of Results:")
+ logger.info(results['llm_explanation'])
# Generate the report
try:
+ logger.info("Generating the report...")
xai.generate_report()
except Exception as e:
- print(f"An error occurred while generating the report: {str(e)}")
+ logger.error(f"An error occurred while generating the report: {str(e)}")
# Example of using the trained model for new predictions
+ logger.debug("Making the prediction...")
while True:
- print("\nEnter values for prediction (or 'q' to quit):")
+ logger.info("\nEnter values for prediction (or 'q' to quit):")
user_input = {}
for feature in X.columns:
value = input(f"{feature}: ")
@@ -63,13 +73,13 @@ def main(file_path, target_column):
try:
prediction, probabilities, explanation = xai.explain_prediction(user_input)
- print("\nPrediction Results:")
- print(f"Prediction: {prediction}")
- print(f"Probabilities: {probabilities}")
- print("\nLLM Explanation of Prediction:")
- print(explanation)
+ logger.info("\nPrediction Results:")
+ logger.info(f"Prediction: {prediction}")
+ logger.info(f"Probabilities: {probabilities}")
+ logger.info("\nLLM Explanation of Prediction:")
+ logger.info(explanation)
except Exception as e:
- print(f"An error occurred during prediction: {str(e)}")
+ logger.error(f"An error occurred during prediction: {str(e)}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run XAI analysis on a dataset")
@@ -77,4 +87,5 @@ def main(file_path, target_column):
parser.add_argument("target_column", help="Name of the target column in the dataset")
args = parser.parse_args()
+ logger.info("Starting the program...")
main(args.file_path, args.target_column)
\ No newline at end of file
diff --git a/tests/test_xai_wrapper.py b/tests/test_xai_wrapper.py
index 101f817..11b95c0 100644
--- a/tests/test_xai_wrapper.py
+++ b/tests/test_xai_wrapper.py
@@ -10,117 +10,173 @@
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from explainableai.core import XAIWrapper
+import logging
+
+logger=logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
@pytest.fixture
def sample_data():
- X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_classes=2, random_state=42)
- feature_names = [f'feature_{i}' for i in range(20)]
- X = pd.DataFrame(X, columns=feature_names)
- y = pd.Series(y, name='target')
- return X, y
+ logger.debug("Generating sample data...")
+ try:
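+        # Synthetic binary-classification data: 1000 samples, 20 features (10 informative)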
+ X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_classes=2, random_state=42)
+ feature_names = [f'feature_{i}' for i in range(20)]
+ X = pd.DataFrame(X, columns=feature_names)
+ y = pd.Series(y, name='target')
+ logger.info("Sample data generated.")
+ return X, y
+ except Exception as e:
+ logger.error(f"Some error occurred while generating sample data: {str(e)}")
@pytest.fixture
def sample_models():
- return {
- 'Random Forest': RandomForestClassifier(n_estimators=10, random_state=42),
- 'Logistic Regression': LogisticRegression(max_iter=1000),
- 'XGBoost': XGBClassifier(n_estimators=10, random_state=42),
- 'Neural Network': MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42)
- }
-
+ logger.debug("Defining sample models...")
+ try:
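+        # Small model configurations (10 estimators, 10-unit hidden layer) keep the tests fast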
+ return {
+ 'Random Forest': RandomForestClassifier(n_estimators=10, random_state=42),
+ 'Logistic Regression': LogisticRegression(max_iter=1000),
+ 'XGBoost': XGBClassifier(n_estimators=10, random_state=42),
+ 'Neural Network': MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42)
+ }
+ except Exception as e:
+ logger.error(f"Some error occured while defining sample models: {str(e)}")
def test_xai_wrapper_initialization(sample_data, sample_models):
- X, y = sample_data
- xai = XAIWrapper()
- assert xai is not None
- assert xai.model is None
- assert xai.X is None
- assert xai.y is None
+ logger.debug("Testing XAIWrapper initialization...")
+ try:
+ X, y = sample_data
+ xai = XAIWrapper()
+ assert xai is not None
+ assert xai.model is None
+ assert xai.X is None
+ assert xai.y is None
+ logger.info("XAIWrapper initialization test passed.")
+ except Exception as e:
+ logger.error(f"An error occurred while testing XAIWrapper initialization: {str(e)}")
def test_xai_wrapper_fit(sample_data, sample_models):
- X, y = sample_data
- xai = XAIWrapper()
- xai.fit(sample_models, X, y)
- assert xai.model is not None
- assert xai.X is not None
- assert xai.y is not None
- assert hasattr(xai.model, 'predict')
- assert hasattr(xai.model, 'predict_proba')
+ logger.debug("Testing XAIWrapper fit method...")
+ try:
+ X, y = sample_data
+ xai = XAIWrapper()
+ xai.fit(sample_models, X, y)
+ assert xai.model is not None
+ assert xai.X is not None
+ assert xai.y is not None
+ assert hasattr(xai.model, 'predict')
+ assert hasattr(xai.model, 'predict_proba')
+ logger.info("XAIWrapper fit method test passed.")
+ except Exception as e:
+ logger.error(f"An error occurred while testing XAIWrapper fit method: {str(e)}")
@pytest.mark.parametrize("model_name", ['Random Forest', 'Logistic Regression', 'XGBoost', 'Neural Network'])
def test_xai_wrapper_analyze_with_different_models(sample_data, sample_models, model_name):
- X, y = sample_data
- models = {model_name: sample_models[model_name]}
- xai = XAIWrapper()
- xai.fit(models, X, y)
- results = xai.analyze()
- assert 'model_performance' in results
- assert 'feature_importance' in results
- assert 'shap_values' in results
- assert 'cv_scores' in results
- assert 'llm_explanation' in results
- assert 'model_comparison' in results
+ logger.debug(f"Testing XAIWrapper analyze method with {model_name}...")
+ try:
+ X, y = sample_data
+ models = {model_name: sample_models[model_name]}
+ xai = XAIWrapper()
+ xai.fit(models, X, y)
+ results = xai.analyze()
+ assert 'model_performance' in results
+ assert 'feature_importance' in results
+ assert 'shap_values' in results
+ assert 'cv_scores' in results
+ assert 'llm_explanation' in results
+ assert 'model_comparison' in results
+ logger.info(f"XAIWrapper analyze method test with {model_name} passed.")
+ except Exception as e:
+ logger.error(f"An error occurred while testing XAIWrapper analyze method with {model_name}: {str(e)}")
def test_xai_wrapper_predict(sample_data, sample_models):
- X, y = sample_data
- xai = XAIWrapper()
- xai.fit(sample_models, X, y)
-
- # Test single prediction
- single_input = X.iloc[0].to_dict()
- prediction, probabilities, explanation = xai.explain_prediction(single_input)
- assert isinstance(prediction, (int, np.integer))
- assert isinstance(probabilities, np.ndarray)
- assert isinstance(explanation, str)
+ logger.debug("Testing XAIWrapper predict method...")
+ try:
+ X, y = sample_data
+ xai = XAIWrapper()
+ xai.fit(sample_models, X, y)
+
+ # Test single prediction
+ single_input = X.iloc[0].to_dict()
+ prediction, probabilities, explanation = xai.explain_prediction(single_input)
+ assert isinstance(prediction, (int, np.integer))
+ assert isinstance(probabilities, np.ndarray)
+ assert isinstance(explanation, str)
+ logger.info("XAIWrapper predict method test passed.")
+ except Exception as e:
+ logger.error(f"An error occurred while testing XAIWrapper predict method: {str(e)}")
def test_xai_wrapper_generate_report(sample_data, sample_models, tmp_path):
- X, y = sample_data
- xai = XAIWrapper()
- xai.fit(sample_models, X, y)
- xai.analyze()
-
- report_path = tmp_path / "test_report.pdf"
- xai.generate_report(filename=str(report_path))
- assert report_path.exists()
- assert os.path.getsize(report_path) > 0 # Check if the file is not empty
+ logger.debug("Testing XAIWrapper generate report...")
+ try:
+ X, y = sample_data
+ xai = XAIWrapper()
+ xai.fit(sample_models, X, y)
+ xai.analyze()
+
+ report_path = tmp_path / "test_report.pdf"
+ xai.generate_report(filename=str(report_path))
+ assert report_path.exists()
+ assert os.path.getsize(report_path) > 0 # Check if the file is not empty
+ logger.info("XAIWrapper generate report test passed.")
+ except Exception as e:
+ logger.error(f"An error occurred while testing XAIWrapper generate report: {str(e)}")
def test_xai_wrapper_perform_eda(sample_data):
- X, y = sample_data
- df = pd.concat([X, y], axis=1)
+ logger.debug("Testing XAIWrapper perform eda...")
try:
- XAIWrapper.perform_eda(df)
+ X, y = sample_data
+ df = pd.concat([X, y], axis=1)
+ try:
+ XAIWrapper.perform_eda(df)
+ except Exception as e:
+ pytest.fail(f"perform_eda raised an exception: {e}")
except Exception as e:
- pytest.fail(f"perform_eda raised an exception: {e}")
+ logger.error(f"An error occurred while testing XAIWrapper perform data: {str(e)}")
def test_xai_wrapper_feature_importance(sample_data, sample_models):
- X, y = sample_data
- xai = XAIWrapper()
- xai.fit(sample_models, X, y)
- results = xai.analyze()
- assert 'feature_importance' in results
- assert len(results['feature_importance']) == X.shape[1]
- assert all(isinstance(importance, (float, np.float64)) for importance in results['feature_importance'].values())
+ logger.debug("Testing XAIWrapper feature importance...")
+ try:
+ X, y = sample_data
+ xai = XAIWrapper()
+ xai.fit(sample_models, X, y)
+ results = xai.analyze()
+ assert 'feature_importance' in results
+ assert len(results['feature_importance']) == X.shape[1]
+ assert all(isinstance(importance, (float, np.float64)) for importance in results['feature_importance'].values())
+ logger.info("XAIWrapper feature importance test passed.")
+ except Exception as e:
+ logger.error(f"An error occurred while testing XAIWrapper feature importance: {str(e)}")
def test_xai_wrapper_cross_validation(sample_data, sample_models):
- X, y = sample_data
- xai = XAIWrapper()
- xai.fit(sample_models, X, y)
- results = xai.analyze()
- assert 'cv_scores' in results
- assert len(results['cv_scores']) == 2 # mean and std
- assert all(isinstance(score, (float, np.float64)) for score in results['cv_scores'])
+ logger.debug("Testing XAIWrapper cross validation...")
+ try:
+ X, y = sample_data
+ xai = XAIWrapper()
+ xai.fit(sample_models, X, y)
+ results = xai.analyze()
+ assert 'cv_scores' in results
+ assert len(results['cv_scores']) == 2 # mean and std
+ assert all(isinstance(score, (float, np.float64)) for score in results['cv_scores'])
+ logger.info("XAIWrapper cross validation test passed.")
+ except Exception as e:
+ logger.error(f"An error occurred while testing XAIWrapper cross validation: {str(e)}")
def test_xai_wrapper_model_comparison(sample_data, sample_models):
- X, y = sample_data
- xai = XAIWrapper()
- xai.fit(sample_models, X, y)
- results = xai.analyze()
- assert 'model_comparison' in results
- assert len(results['model_comparison']) == len(sample_models)
- for model_name, scores in results['model_comparison'].items():
- assert 'cv_score' in scores
- assert 'test_score' in scores
- assert isinstance(scores['cv_score'], (float, np.float64))
- assert isinstance(scores['test_score'], (float, np.float64))
+ logger.debug("Testing XAIWrapper model comparision...")
+ try:
+ X, y = sample_data
+ xai = XAIWrapper()
+ xai.fit(sample_models, X, y)
+ results = xai.analyze()
+ assert 'model_comparison' in results
+ assert len(results['model_comparison']) == len(sample_models)
+ for model_name, scores in results['model_comparison'].items():
+ assert 'cv_score' in scores
+ assert 'test_score' in scores
+ assert isinstance(scores['cv_score'], (float, np.float64))
+ assert isinstance(scores['test_score'], (float, np.float64))
+ logger.info("XAIWrapper model comparision test passed.")
+ except Exception as e:
+ logger.error(f"An error occurred while testing XAIWrapper model comparision: {str(e)}")
@pytest.mark.parametrize("invalid_input", [
{}, # Empty dictionary
@@ -128,8 +184,12 @@ def test_xai_wrapper_model_comparison(sample_data, sample_models):
{f'feature_{i}': 'invalid' for i in range(20)}, # Invalid data type
])
def test_xai_wrapper_predict_invalid_input(sample_data, sample_models, invalid_input):
- X, y = sample_data
- xai = XAIWrapper()
- xai.fit(sample_models, X, y)
- with pytest.raises(Exception):
- xai.explain_prediction(invalid_input)
\ No newline at end of file
+ logger.debug("Testing XAIWrapper predict invalid input...")
+ try:
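+        # Each parametrized invalid input is expected to make explain_prediction raise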
+ X, y = sample_data
+ xai = XAIWrapper()
+ xai.fit(sample_models, X, y)
+ with pytest.raises(Exception):
+ xai.explain_prediction(invalid_input)
+ except Exception as e:
+ logger.error(f"An error occurred while testing XAIWrapper predict invalid input: {str(e)}")
\ No newline at end of file