From 3b3d7cd4a3bae604769317ad17d5496ce924b1a4 Mon Sep 17 00:00:00 2001 From: Rafael-Silva-Oliveira Date: Sun, 3 Mar 2024 20:17:38 +0100 Subject: [PATCH] Update cross-validation strategy logging and launch configuration --- .gitignore | 309 ++++++++++++++++++++ .vscode/launch.json | 30 +- catboost_info/catboost_training.json | 133 +++++++++ preprocessing/src/config.json | 403 +++++++++++++++------------ preprocessing/src/main.py | 35 ++- preprocessing/src/preprocessing.py | 78 +++++- preprocessing/src/train_model.py | 289 ++++++++++++++++--- preprocessing/src/utils.py | 4 +- 8 files changed, 1032 insertions(+), 249 deletions(-) create mode 100644 catboost_info/catboost_training.json diff --git a/.gitignore b/.gitignore index f0a8f2e..c5b302c 100644 --- a/.gitignore +++ b/.gitignore @@ -780,3 +780,312 @@ preprocessing/reports/PipelineRun_2024_02_28-09_50_17_AM/Plots/TrainingCurve_Rid preprocessing/reports/PipelineRun_2024_02_28-09_50_17_AM/Plots/TrainingCurve_SGDClassifier.png preprocessing/reports/PipelineRun_2024_02_28-09_50_17_AM/Plots/TrainingCurve_SVC.png preprocessing/reports/PipelineRun_2024_02_28-09_50_17_AM/Plots/TrainingCurve_XGBClassifier.png +test_na.xlsx +test2.xlsx +test4.xlsx +catboost_info/learn_error.tsv +catboost_info/time_left.tsv +catboost_info/learn/events.out.tfevents +catboost_info/tmp/cat_feature_index.7bab4e20-bfdf59bc-3205bca-68b4cf9a.tmp +catboost_info/tmp/cat_feature_index.b8bae20f-32489242-e592c2bc-899fda70.tmp +catboost_info/tmp/cat_feature_index.b84c19d7-a5052183-80a8f1dd-1feab2c1.tmp +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_AdaBoostClassifier_2024_02_28_07_58_48_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_ComplementNB_2024_02_28_07_14_44_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_DecisionTreeClassifier_2024_02_28_07_15_42_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_HistGradientBoostingClassifier_2024_02_28_07_39_53_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_KNeighborsClassifier_2024_02_28_07_00_06_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_LogisticRegression_2024_02_28_06_58_06_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_MLPClassifier_2024_02_28_08_02_25_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_Perceptron_2024_02_28_06_58_30_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_RandomForestClassifier_2024_02_28_07_20_06_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_RidgeClassifier_2024_02_28_06_59_10_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_SGDClassifier_2024_02_28_06_58_51_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_SVC_2024_02_28_07_14_37_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Files/Test_classification_XGBClassifier_2024_02_28_08_10_40_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Model/Best Optimized Model/HistGradientBoostingClassifier.joblib +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/AUC.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/RankedModelsByMetric.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_AdaBoostClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_ComplementNB.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_HistGradientBoostingClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_MLPClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_Perceptron.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_SVC.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/Test_ConfMatrix_XGBClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_AdaBoostClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_ComplementNB.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_HistGradientBoostingClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_MLPClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_Perceptron.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_SVC.png +preprocessing/reports/PipelineRun_2024_02_28-06_32_18_PM/Plots/TrainingCurve_XGBClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_AdaBoostClassifier_2024_02_29_09_20_45_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_ComplementNB_2024_02_29_05_49_24_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_DecisionTreeClassifier_2024_02_29_05_50_04_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_HistGradientBoostingClassifier_2024_02_29_08_45_11_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_KNeighborsClassifier_2024_02_29_05_40_40_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_LogisticRegression_2024_02_29_05_37_41_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_MLPClassifier_2024_02_29_10_26_59_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_Perceptron_2024_02_29_05_38_03_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_RandomForestClassifier_2024_02_29_05_53_06_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_RidgeClassifier_2024_02_29_05_38_46_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_SGDClassifier_2024_02_29_05_38_25_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_SVC_2024_02_29_05_49_20_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Files/Test_classification_XGBClassifier_2024_02_29_10_38_13_PM.xlsx +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Model/Best Optimized Model/XGBClassifier.joblib +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/AUC.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/RankedModelsByMetric.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_AdaBoostClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_ComplementNB.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_HistGradientBoostingClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_MLPClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_Perceptron.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_SVC.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/Test_ConfMatrix_XGBClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_AdaBoostClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_ComplementNB.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_HistGradientBoostingClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_MLPClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_Perceptron.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_SVC.png +preprocessing/reports/PipelineRun_2024_02_29-05_22_36_PM/Plots/TrainingCurve_XGBClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Files/Test_classification_KNeighborsClassifier_2024_03_01_02_38_18_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Files/Test_classification_LogisticRegression_2024_03_01_02_36_27_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Files/Test_classification_Perceptron_2024_03_01_02_36_40_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Files/Test_classification_RidgeClassifier_2024_03_01_02_37_08_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Files/Test_classification_SGDClassifier_2024_03_01_02_36_56_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Plots/Test_ConfMatrix_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Plots/Test_ConfMatrix_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Plots/Test_ConfMatrix_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Plots/Test_ConfMatrix_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Plots/TrainingCurve_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Plots/TrainingCurve_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Plots/TrainingCurve_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-02_31_34_PM/Plots/TrainingCurve_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-02_39_33_PM/Files/Test_classification_LogisticRegression_2024_03_01_02_43_26_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-02_39_33_PM/Files/Test_classification_Perceptron_2024_03_01_02_43_39_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-02_39_33_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_01-02_39_33_PM/Plots/Test_ConfMatrix_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_01-02_39_33_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_01-02_39_33_PM/Plots/TrainingCurve_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Files/Test_classification_ComplementNB_2024_03_01_06_51_58_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Files/Test_classification_DecisionTreeClassifier_2024_03_01_06_53_18_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Files/Test_classification_HistGradientBoostingClassifier_2024_03_01_07_35_19_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Files/Test_classification_KNeighborsClassifier_2024_03_01_06_48_22_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Files/Test_classification_LogisticRegression_2024_03_01_06_46_37_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Files/Test_classification_Perceptron_2024_03_01_06_46_50_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Files/Test_classification_RandomForestClassifier_2024_03_01_06_56_26_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Files/Test_classification_RidgeClassifier_2024_03_01_06_47_14_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Files/Test_classification_SGDClassifier_2024_03_01_06_47_03_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Files/Test_classification_SVC_2024_03_01_06_51_54_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/Test_ConfMatrix_ComplementNB.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/Test_ConfMatrix_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/Test_ConfMatrix_HistGradientBoostingClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/Test_ConfMatrix_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/Test_ConfMatrix_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/Test_ConfMatrix_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/Test_ConfMatrix_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/Test_ConfMatrix_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/Test_ConfMatrix_SVC.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/TrainingCurve_ComplementNB.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/TrainingCurve_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/TrainingCurve_HistGradientBoostingClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/TrainingCurve_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/TrainingCurve_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/TrainingCurve_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/TrainingCurve_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/TrainingCurve_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_01-06_42_24_PM/Plots/TrainingCurve_SVC.png +preprocessing/reports/PipelineRun_2024_03_02-01_34_49_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_02-01_50_54_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Files/Test_classification_KNeighborsClassifier_2024_03_02_05_23_56_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Files/Test_classification_LogisticRegression_2024_03_02_05_23_25_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Files/Test_classification_Perceptron_2024_03_02_05_23_30_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Files/Test_classification_RidgeClassifier_2024_03_02_05_23_39_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Files/Test_classification_SGDClassifier_2024_03_02_05_23_35_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Plots/Test_ConfMatrix_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Plots/Test_ConfMatrix_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Plots/Test_ConfMatrix_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Plots/Test_ConfMatrix_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Plots/TrainingCurve_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Plots/TrainingCurve_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Plots/TrainingCurve_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_19_58_PM/Plots/TrainingCurve_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Files/Test_classification_KNeighborsClassifier_2024_03_02_05_36_24_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Files/Test_classification_LogisticRegression_2024_03_02_05_36_04_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Files/Test_classification_Perceptron_2024_03_02_05_36_08_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Files/Test_classification_RidgeClassifier_2024_03_02_05_36_16_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Files/Test_classification_SGDClassifier_2024_03_02_05_36_13_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Plots/Test_ConfMatrix_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Plots/Test_ConfMatrix_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Plots/Test_ConfMatrix_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Plots/Test_ConfMatrix_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Plots/TrainingCurve_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Plots/TrainingCurve_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Plots/TrainingCurve_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_31_45_PM/Plots/TrainingCurve_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_AdaBoostClassifier_2024_03_02_06_50_57_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_CatBoostClassifier_2024_03_03_11_47_24_AM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_ComplementNB_2024_03_02_05_45_10_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_DecisionTreeClassifier_2024_03_02_05_46_30_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_HistGradientBoostingClassifier_2024_03_02_06_11_01_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_KNeighborsClassifier_2024_03_02_05_41_51_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_LogisticRegression_2024_03_02_05_41_23_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_MLPClassifier_2024_03_02_07_00_31_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_Perceptron_2024_03_02_05_41_28_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_RandomForestClassifier_2024_03_02_05_50_23_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_RidgeClassifier_2024_03_02_05_41_40_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_SGDClassifier_2024_03_02_05_41_36_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_SVC_2024_03_02_05_45_08_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Files/Test_classification_XGBClassifier_2024_03_02_07_06_38_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_AdaBoostClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_CatBoostClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_ComplementNB.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_HistGradientBoostingClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_MLPClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_SVC.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/Test_ConfMatrix_XGBClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_AdaBoostClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_CatBoostClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_ComplementNB.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_HistGradientBoostingClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_MLPClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_SVC.png +preprocessing/reports/PipelineRun_2024_03_02-05_37_58_PM/Plots/TrainingCurve_XGBClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-05_45_49_PM/Files/Test_classification_LogisticRegression_2024_03_03_06_06_37_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-05_45_49_PM/Model/Best Optimized Model/LogisticRegression.joblib +preprocessing/reports/PipelineRun_2024_03_03-05_45_49_PM/Plots/AUC.png +preprocessing/reports/PipelineRun_2024_03_03-05_45_49_PM/Plots/RankedModelsByMetric.png +preprocessing/reports/PipelineRun_2024_03_03-05_45_49_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_03-05_45_49_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_03-06_08_29_PM/Files/Test_classification_LogisticRegression_2024_03_03_06_28_42_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-06_08_29_PM/Model/Best Optimized Model/LogisticRegression.joblib +preprocessing/reports/PipelineRun_2024_03_03-06_08_29_PM/Plots/AUC.png +preprocessing/reports/PipelineRun_2024_03_03-06_08_29_PM/Plots/RankedModelsByMetric.png +preprocessing/reports/PipelineRun_2024_03_03-06_08_29_PM/Plots/SHAP_Waterfall_Plot.png +preprocessing/reports/PipelineRun_2024_03_03-06_08_29_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_03-06_08_29_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_03-12_21_38_PM/Files/Test_classification_LGBMClassifier_2024_03_03_12_28_20_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_21_38_PM/Plots/Test_ConfMatrix_LGBMClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_21_38_PM/Plots/TrainingCurve_LGBMClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_32_02_PM/Files/Test_classification_LGBMClassifier_2024_03_03_12_36_04_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_32_02_PM/Plots/Test_ConfMatrix_LGBMClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_32_02_PM/Plots/TrainingCurve_LGBMClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_38_16_PM/Files/Test_classification_LGBMClassifier_2024_03_03_12_42_34_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_38_16_PM/Model/Best Optimized Model/LGBMClassifier.joblib +preprocessing/reports/PipelineRun_2024_03_03-12_38_16_PM/Plots/AUC.png +preprocessing/reports/PipelineRun_2024_03_03-12_38_16_PM/Plots/RankedModelsByMetric.png +preprocessing/reports/PipelineRun_2024_03_03-12_38_16_PM/Plots/Test_ConfMatrix_LGBMClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_38_16_PM/Plots/TrainingCurve_LGBMClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Files/Test_classification_ComplementNB_2024_03_03_12_51_38_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Files/Test_classification_DecisionTreeClassifier_2024_03_03_12_52_40_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Files/Test_classification_KNeighborsClassifier_2024_03_03_12_47_26_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Files/Test_classification_LogisticRegression_2024_03_03_12_46_40_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Files/Test_classification_Perceptron_2024_03_03_12_46_45_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Files/Test_classification_RandomForestClassifier_2024_03_03_12_57_15_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Files/Test_classification_RidgeClassifier_2024_03_03_12_47_00_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Files/Test_classification_SGDClassifier_2024_03_03_12_46_56_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Files/Test_classification_SVC_2024_03_03_12_51_35_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/Test_ConfMatrix_ComplementNB.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/Test_ConfMatrix_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/Test_ConfMatrix_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/Test_ConfMatrix_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/Test_ConfMatrix_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/Test_ConfMatrix_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/Test_ConfMatrix_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/Test_ConfMatrix_SVC.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/TrainingCurve_ComplementNB.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/TrainingCurve_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/TrainingCurve_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/TrainingCurve_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/TrainingCurve_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/TrainingCurve_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/TrainingCurve_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_43_22_PM/Plots/TrainingCurve_SVC.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_AdaBoostClassifier_2024_03_03_04_19_16_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_ComplementNB_2024_03_03_01_21_35_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_DecisionTreeClassifier_2024_03_03_01_36_16_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_HistGradientBoostingClassifier_2024_03_03_03_28_24_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_KNeighborsClassifier_2024_03_03_01_05_51_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_LogisticRegression_2024_03_03_01_03_42_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_MLPClassifier_2024_03_03_05_08_45_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_Perceptron_2024_03_03_01_03_56_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_RandomForestClassifier_2024_03_03_02_00_10_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_RidgeClassifier_2024_03_03_01_04_20_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_SGDClassifier_2024_03_03_01_04_11_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_SVC_2024_03_03_01_21_15_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Files/Test_classification_XGBClassifier_2024_03_03_05_34_05_PM.xlsx +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_AdaBoostClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_ComplementNB.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_HistGradientBoostingClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_MLPClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_SVC.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/Test_ConfMatrix_XGBClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_AdaBoostClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_ComplementNB.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_DecisionTreeClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_HistGradientBoostingClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_KNeighborsClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_LogisticRegression.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_MLPClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_Perceptron.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_RandomForestClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_RidgeClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_SGDClassifier.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_SVC.png +preprocessing/reports/PipelineRun_2024_03_03-12_59_52_PM/Plots/TrainingCurve_XGBClassifier.png diff --git a/.vscode/launch.json b/.vscode/launch.json index 10daee9..07a5188 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,16 +1,16 @@ { - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Python: Current File", - "type": "python", - "request": "launch", - "program": "${file}", - "console": "integratedTerminal", - "justMyCode": true - } - ] -} \ No newline at end of file + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "justMyCode": true + } + ] +} diff --git a/catboost_info/catboost_training.json b/catboost_info/catboost_training.json new file mode 100644 index 0000000..0fc7b16 --- /dev/null +++ b/catboost_info/catboost_training.json @@ -0,0 +1,133 @@ +{ +"meta":{"test_sets":[],"test_metrics":[],"learn_metrics":[{"best_value":"Min","name":"Logloss"}],"launch_mode":"Train","parameters":"","iteration_count":1000,"learn_sets":["learn"],"name":"experiment"}, +"iterations":[ +{"learn":[0.6883371991],"iteration":0,"passed_time":0.5897550884,"remaining_time":589.1653333}, +{"learn":[0.6849897039],"iteration":1,"passed_time":1.187687649,"remaining_time":592.6561367}, +{"learn":[0.6813085268],"iteration":2,"passed_time":1.799819596,"remaining_time":598.1400458}, +{"learn":[0.6766436236],"iteration":3,"passed_time":2.394584261,"remaining_time":596.251481}, +{"learn":[0.6728130128],"iteration":4,"passed_time":3.000550225,"remaining_time":597.1094947}, +{"learn":[0.6690699493],"iteration":5,"passed_time":3.593251275,"remaining_time":595.2819611}, +{"learn":[0.6652317087],"iteration":6,"passed_time":4.177751719,"remaining_time":592.6439224}, +{"learn":[0.6619248218],"iteration":7,"passed_time":4.786182625,"remaining_time":593.4866456}, +{"learn":[0.6578150004],"iteration":8,"passed_time":5.394714473,"remaining_time":594.0180048}, +{"learn":[0.6539802907],"iteration":9,"passed_time":6.005211823,"remaining_time":594.5159705}, +{"learn":[0.6505922907],"iteration":10,"passed_time":6.618117736,"remaining_time":595.0289491}, +{"learn":[0.6473904205],"iteration":11,"passed_time":7.205823959,"remaining_time":593.2795059}, +{"learn":[0.644521791],"iteration":12,"passed_time":7.821837614,"remaining_time":593.8579788}, +{"learn":[0.6407377941],"iteration":13,"passed_time":8.418556906,"remaining_time":592.9069364}, +{"learn":[0.6369871366],"iteration":14,"passed_time":9.007965337,"remaining_time":591.5230571}, +{"learn":[0.633097111],"iteration":15,"passed_time":9.587356145,"remaining_time":589.6224029}, +{"learn":[0.6297596283],"iteration":16,"passed_time":10.20146429,"remaining_time":589.8846707}, +{"learn":[0.6264339319],"iteration":17,"passed_time":10.7849311,"remaining_time":588.3779077}, +{"learn":[0.622517565],"iteration":18,"passed_time":11.35832983,"remaining_time":586.4485036}, +{"learn":[0.6189781207],"iteration":19,"passed_time":11.96102383,"remaining_time":586.0901675}, +{"learn":[0.614891844],"iteration":20,"passed_time":12.55943334,"remaining_time":585.508821}, +{"learn":[0.6112319396],"iteration":21,"passed_time":13.12664415,"remaining_time":583.5389991}, +{"learn":[0.6078995127],"iteration":22,"passed_time":13.72043904,"remaining_time":582.8203888}, +{"learn":[0.6043469811],"iteration":23,"passed_time":14.30089813,"remaining_time":581.5698573}, +{"learn":[0.6009359144],"iteration":24,"passed_time":14.86045648,"remaining_time":579.5578027}, +{"learn":[0.597810164],"iteration":25,"passed_time":15.44499837,"remaining_time":578.5934006}, +{"learn":[0.5944324293],"iteration":26,"passed_time":16.03185389,"remaining_time":577.7405123}, +{"learn":[0.5906859395],"iteration":27,"passed_time":16.60986365,"remaining_time":576.5995524}, +{"learn":[0.5870805433],"iteration":28,"passed_time":17.18989798,"remaining_time":575.5652047}, +{"learn":[0.583553606],"iteration":29,"passed_time":17.7855844,"remaining_time":575.0672289}, +{"learn":[0.5804469559],"iteration":30,"passed_time":18.38185485,"remaining_time":574.5812048}, +{"learn":[0.5777068204],"iteration":31,"passed_time":18.99339344,"remaining_time":574.5501515}, +{"learn":[0.5750315315],"iteration":32,"passed_time":19.59031715,"remaining_time":574.0556571}, +{"learn":[0.5721755113],"iteration":33,"passed_time":20.16550665,"remaining_time":572.93763}, +{"learn":[0.5695312201],"iteration":34,"passed_time":20.75657917,"remaining_time":572.2885399}, +{"learn":[0.5666199482],"iteration":35,"passed_time":21.34408789,"remaining_time":571.5472424}, +{"learn":[0.5635877528],"iteration":36,"passed_time":21.92614231,"remaining_time":570.6722984}, +{"learn":[0.5612142249],"iteration":37,"passed_time":22.49868862,"remaining_time":569.5720646}, +{"learn":[0.5581252003],"iteration":38,"passed_time":23.08822871,"remaining_time":568.9176357}, +{"learn":[0.5546548047],"iteration":39,"passed_time":23.67937912,"remaining_time":568.3050989}, +{"learn":[0.5516636951],"iteration":40,"passed_time":24.29208519,"remaining_time":568.1977975}, +{"learn":[0.5490912517],"iteration":41,"passed_time":24.92510334,"remaining_time":568.5297381}, +{"learn":[0.546783947],"iteration":42,"passed_time":25.66325347,"remaining_time":571.1565947}, +{"learn":[0.5439302142],"iteration":43,"passed_time":26.28247857,"remaining_time":571.0465798}, +{"learn":[0.5411590415],"iteration":44,"passed_time":26.95499626,"remaining_time":572.0449206}, +{"learn":[0.5383552222],"iteration":45,"passed_time":27.63202213,"remaining_time":573.0641112}, +{"learn":[0.5349591336],"iteration":46,"passed_time":28.28663996,"remaining_time":573.5567635}, +{"learn":[0.5324778918],"iteration":47,"passed_time":28.89373871,"remaining_time":573.0591512}, +{"learn":[0.5298075007],"iteration":48,"passed_time":29.48112755,"remaining_time":572.1745368}, +{"learn":[0.5271983529],"iteration":49,"passed_time":30.07979139,"remaining_time":571.5160363}, +{"learn":[0.5245196434],"iteration":50,"passed_time":30.74697146,"remaining_time":572.1348219}, +{"learn":[0.522480269],"iteration":51,"passed_time":31.36928571,"remaining_time":571.8862087}, +{"learn":[0.5202081219],"iteration":52,"passed_time":31.99439527,"remaining_time":571.67344}, +{"learn":[0.5176574295],"iteration":53,"passed_time":32.59369419,"remaining_time":570.9932353}, +{"learn":[0.5149622626],"iteration":54,"passed_time":33.18225985,"remaining_time":570.1315556}, +{"learn":[0.5121084264],"iteration":55,"passed_time":33.79355552,"remaining_time":569.662793}, +{"learn":[0.5104709002],"iteration":56,"passed_time":34.40009825,"remaining_time":569.1103974}, +{"learn":[0.5082245484],"iteration":57,"passed_time":35.01813422,"remaining_time":568.7428007}, +{"learn":[0.5063720641],"iteration":58,"passed_time":35.63463605,"remaining_time":568.3422462}, +{"learn":[0.5034715164],"iteration":59,"passed_time":36.2459967,"remaining_time":567.8539483}, +{"learn":[0.5014089854],"iteration":60,"passed_time":36.84431034,"remaining_time":567.1607772}, +{"learn":[0.4990082327],"iteration":61,"passed_time":37.46327598,"remaining_time":566.7831107}, +{"learn":[0.4965486394],"iteration":62,"passed_time":38.06121818,"remaining_time":566.0851022}, +{"learn":[0.493733377],"iteration":63,"passed_time":38.68254915,"remaining_time":565.7322813}, +{"learn":[0.4913317912],"iteration":64,"passed_time":39.30046774,"remaining_time":565.3221128}, +{"learn":[0.4889768819],"iteration":65,"passed_time":39.88614378,"remaining_time":564.449368}, +{"learn":[0.4870395262],"iteration":66,"passed_time":40.54811603,"remaining_time":564.6476457}, +{"learn":[0.4853275731],"iteration":67,"passed_time":41.19816658,"remaining_time":564.6572243}, +{"learn":[0.4829123608],"iteration":68,"passed_time":41.81805237,"remaining_time":564.2406776}, +{"learn":[0.480085898],"iteration":69,"passed_time":42.43752536,"remaining_time":563.8128369}, +{"learn":[0.4773598818],"iteration":70,"passed_time":43.07911795,"remaining_time":563.6690222}, +{"learn":[0.474927284],"iteration":71,"passed_time":43.7036856,"remaining_time":563.2919477}, +{"learn":[0.4727803503],"iteration":72,"passed_time":44.31707578,"remaining_time":562.766154}, +{"learn":[0.470746707],"iteration":73,"passed_time":44.94146827,"remaining_time":562.3756705}, +{"learn":[0.469145809],"iteration":74,"passed_time":45.56078983,"remaining_time":561.9164078}, +{"learn":[0.4673539586],"iteration":75,"passed_time":46.19994484,"remaining_time":561.6940662}, +{"learn":[0.4653330282],"iteration":76,"passed_time":46.8279224,"remaining_time":561.3269139}, +{"learn":[0.4629332073],"iteration":77,"passed_time":47.45549751,"remaining_time":560.9483167}, +{"learn":[0.461455625],"iteration":78,"passed_time":48.08567549,"remaining_time":560.5937611}, +{"learn":[0.4589075141],"iteration":79,"passed_time":48.70650966,"remaining_time":560.1248611}, +{"learn":[0.4572562213],"iteration":80,"passed_time":49.34735463,"remaining_time":559.8792457}, +{"learn":[0.4550896079],"iteration":81,"passed_time":49.97140368,"remaining_time":559.4359582}, +{"learn":[0.4529972409],"iteration":82,"passed_time":50.59338196,"remaining_time":558.9654368}, +{"learn":[0.4508416438],"iteration":83,"passed_time":51.20695941,"remaining_time":558.3997002}, +{"learn":[0.4483993361],"iteration":84,"passed_time":51.87347723,"remaining_time":558.4027254}, +{"learn":[0.4465511643],"iteration":85,"passed_time":52.52178308,"remaining_time":558.1966249}, +{"learn":[0.4447563341],"iteration":86,"passed_time":53.2167855,"remaining_time":558.4704041}, +{"learn":[0.4426378815],"iteration":87,"passed_time":53.97495745,"remaining_time":559.3768317}, +{"learn":[0.4403107091],"iteration":88,"passed_time":54.68105686,"remaining_time":559.7128405}, +{"learn":[0.4386413034],"iteration":89,"passed_time":55.30474307,"remaining_time":559.1924022}, +{"learn":[0.436209122],"iteration":90,"passed_time":55.9357208,"remaining_time":558.7425297}, +{"learn":[0.4339061358],"iteration":91,"passed_time":56.52480875,"remaining_time":557.8752864}, +{"learn":[0.4323550755],"iteration":92,"passed_time":57.13760773,"remaining_time":557.2452711}, +{"learn":[0.4298656851],"iteration":93,"passed_time":57.77018037,"remaining_time":556.8062066}, +{"learn":[0.4277456214],"iteration":94,"passed_time":58.38802047,"remaining_time":556.2227214}, +{"learn":[0.4256634719],"iteration":95,"passed_time":59.01425543,"remaining_time":555.717572}, +{"learn":[0.4234715671],"iteration":96,"passed_time":59.60482696,"remaining_time":554.8779252}, +{"learn":[0.4214558458],"iteration":97,"passed_time":60.2197512,"remaining_time":554.267506}, +{"learn":[0.41920427],"iteration":98,"passed_time":60.8689845,"remaining_time":553.9692428}, +{"learn":[0.4173560088],"iteration":99,"passed_time":61.51418262,"remaining_time":553.6276436}, +{"learn":[0.415247437],"iteration":100,"passed_time":62.18599253,"remaining_time":553.5169038}, +{"learn":[0.4135330358],"iteration":101,"passed_time":62.82241164,"remaining_time":553.0835848}, +{"learn":[0.4116642926],"iteration":102,"passed_time":63.49063654,"remaining_time":552.9233105}, +{"learn":[0.4100296579],"iteration":103,"passed_time":64.10720745,"remaining_time":552.3082488}, +{"learn":[0.408585013],"iteration":104,"passed_time":64.71153045,"remaining_time":551.5887595}, +{"learn":[0.4069770162],"iteration":105,"passed_time":65.29487328,"remaining_time":550.6944973}, +{"learn":[0.405438881],"iteration":106,"passed_time":65.90525614,"remaining_time":550.0317171}, +{"learn":[0.4037825713],"iteration":107,"passed_time":66.51263033,"remaining_time":549.3450579}, +{"learn":[0.4023672899],"iteration":108,"passed_time":67.10965729,"remaining_time":548.5752719}, +{"learn":[0.400733606],"iteration":109,"passed_time":67.71025538,"remaining_time":547.8375208}, +{"learn":[0.3991515476],"iteration":110,"passed_time":68.35400905,"remaining_time":547.4478743}, +{"learn":[0.3973838822],"iteration":111,"passed_time":69.00053318,"remaining_time":547.0756559}, +{"learn":[0.3954570656],"iteration":112,"passed_time":69.61302302,"remaining_time":546.4314285}, +{"learn":[0.3935102679],"iteration":113,"passed_time":70.24954863,"remaining_time":545.9745622}, +{"learn":[0.3913648742],"iteration":114,"passed_time":70.88590953,"remaining_time":545.5133038}, +{"learn":[0.3890903913],"iteration":115,"passed_time":71.52397284,"remaining_time":545.0619999}, +{"learn":[0.3878660192],"iteration":116,"passed_time":72.18511074,"remaining_time":544.7816477}, +{"learn":[0.3859520437],"iteration":117,"passed_time":72.85663239,"remaining_time":544.5724556}, +{"learn":[0.3844859767],"iteration":118,"passed_time":73.51858511,"remaining_time":544.2846511}, +{"learn":[0.3823509789],"iteration":119,"passed_time":74.14153168,"remaining_time":543.7045656}, +{"learn":[0.3808779491],"iteration":120,"passed_time":74.75052938,"remaining_time":543.0224407}, +{"learn":[0.379343205],"iteration":121,"passed_time":75.34032947,"remaining_time":542.2033547}, +{"learn":[0.3778756983],"iteration":122,"passed_time":75.93713484,"remaining_time":541.4379452}, +{"learn":[0.3759205283],"iteration":123,"passed_time":76.50914277,"remaining_time":540.5000731}, +{"learn":[0.3743925077],"iteration":124,"passed_time":77.10508808,"remaining_time":539.7356166}, +{"learn":[0.3728343177],"iteration":125,"passed_time":77.70636566,"remaining_time":539.0108221}, +{"learn":[0.37147335],"iteration":126,"passed_time":78.33222552,"remaining_time":538.4569518}, +{"learn":[0.3696634524],"iteration":127,"passed_time":78.91101072,"remaining_time":537.5812605}, +{"learn":[0.3685633831],"iteration":128,"passed_time":79.49336659,"remaining_time":536.7342814}, +{ \ No newline at end of file diff --git a/preprocessing/src/config.json b/preprocessing/src/config.json index bd93bc1..4f3d8a8 100644 --- a/preprocessing/src/config.json +++ b/preprocessing/src/config.json @@ -1,6 +1,6 @@ { "path_backbone": "C://Users//rafaelo//OneDrive - NTNU//Documents//Projects//preprocessing//preprocessing//preprocessing", - "data": "data//raw//METABRIC_RNA_Mutation.csv", + "data": "data//raw//full_METABRIC_RNA_Mutation.csv", "preprocessing": { "numerical_preprocessing": { "StandardScaler": { @@ -535,7 +535,180 @@ "transformed__tulp4", "transformed__ugt2b15", "transformed__ugt2b17", - "transformed__ugt2b7" + "transformed__ugt2b7", + "transformed__pik3ca_mut", + "transformed__tp53_mut", + "transformed__muc16_mut", + "transformed__ahnak2_mut", + "transformed__kmt2c_mut", + "transformed__syne1_mut", + "transformed__gata3_mut", + "transformed__map3k1_mut", + "transformed__ahnak_mut", + "transformed__dnah11_mut", + "transformed__cdh1_mut", + "transformed__dnah2_mut", + "transformed__kmt2d_mut", + "transformed__ush2a_mut", + "transformed__ryr2_mut", + "transformed__dnah5_mut", + "transformed__herc2_mut", + "transformed__pde4dip_mut", + "transformed__akap9_mut", + "transformed__tg_mut", + "transformed__birc6_mut", + "transformed__utrn_mut", + "transformed__tbx3_mut", + "transformed__col6a3_mut", + "transformed__arid1a_mut", + "transformed__lama2_mut", + "transformed__notch1_mut", + "transformed__cbfb_mut", + "transformed__ncor2_mut", + "transformed__col12a1_mut", + "transformed__col22a1_mut", + "transformed__pten_mut", + "transformed__akt1_mut", + "transformed__atr_mut", + "transformed__thada_mut", + "transformed__ncor1_mut", + "transformed__stab2_mut", + "transformed__myh9_mut", + "transformed__runx1_mut", + "transformed__nf1_mut", + "transformed__map2k4_mut", + "transformed__ros1_mut", + "transformed__lamb3_mut", + "transformed__arid1b_mut", + "transformed__erbb2_mut", + "transformed__sf3b1_mut", + "transformed__shank2_mut", + "transformed__ep300_mut", + "transformed__ptprd_mut", + "transformed__usp9x_mut", + "transformed__setd2_mut", + "transformed__setd1a_mut", + "transformed__thsd7a_mut", + "transformed__afdn_mut", + "transformed__erbb3_mut", + "transformed__rb1_mut", + "transformed__myo1a_mut", + "transformed__alk_mut", + "transformed__fanca_mut", + "transformed__adgra2_mut", + "transformed__ubr5_mut", + "transformed__pik3r1_mut", + "transformed__myo3a_mut", + "transformed__asxl2_mut", + "transformed__apc_mut", + "transformed__ctcf_mut", + "transformed__asxl1_mut", + "transformed__fancd2_mut", + "transformed__taf1_mut", + "transformed__kdm6a_mut", + "transformed__ctnna3_mut", + "transformed__brca1_mut", + "transformed__ptprm_mut", + "transformed__foxo3_mut", + "transformed__usp28_mut", + "transformed__gldc_mut", + "transformed__brca2_mut", + "transformed__cacna2d3_mut", + "transformed__arid2_mut", + "transformed__aff2_mut", + "transformed__lifr_mut", + "transformed__sbno1_mut", + "transformed__kdm3a_mut", + "transformed__ncoa3_mut", + "transformed__bap1_mut", + "transformed__l1cam_mut", + "transformed__pbrm1_mut", + "transformed__chd1_mut", + "transformed__jak1_mut", + "transformed__setdb1_mut", + "transformed__fam20c_mut", + "transformed__arid5b_mut", + "transformed__egfr_mut", + "transformed__map3k10_mut", + "transformed__smarcc2_mut", + "transformed__erbb4_mut", + "transformed__npnt_mut", + "transformed__nek1_mut", + "transformed__agmo_mut", + "transformed__zfp36l1_mut", + "transformed__smad4_mut", + "transformed__sik1_mut", + "transformed__casp8_mut", + "transformed__prkcq_mut", + "transformed__smarcc1_mut", + "transformed__palld_mut", + "transformed__dcaf4l2_mut", + "transformed__bcas3_mut", + "transformed__cdkn1b_mut", + "transformed__gps2_mut", + "transformed__men1_mut", + "transformed__stk11_mut", + "transformed__sik2_mut", + "transformed__ptpn22_mut", + "transformed__brip1_mut", + "transformed__flt3_mut", + "transformed__nrg3_mut", + "transformed__fbxw7_mut", + "transformed__ttyh1_mut", + "transformed__taf4b_mut", + "transformed__or6a2_mut", + "transformed__map3k13_mut", + "transformed__hdac9_mut", + "transformed__prkacg_mut", + "transformed__rpgr_mut", + "transformed__large1_mut", + "transformed__foxp1_mut", + "transformed__clk3_mut", + "transformed__prkcz_mut", + "transformed__lipi_mut", + "transformed__ppp2r2a_mut", + "transformed__prkce_mut", + "transformed__gh1_mut", + "transformed__gpr32_mut", + "transformed__kras_mut", + "transformed__nf2_mut", + "transformed__chek2_mut", + "transformed__ldlrap1_mut", + "transformed__clrn2_mut", + "transformed__acvrl1_mut", + "transformed__agtr2_mut", + "transformed__cdkn2a_mut", + "transformed__ctnna1_mut", + "transformed__magea8_mut", + "transformed__prr16_mut", + "transformed__dtwd2_mut", + "transformed__akt2_mut", + "transformed__braf_mut", + "transformed__foxo1_mut", + "transformed__nt5e_mut", + "transformed__ccnd3_mut", + "transformed__nr3c1_mut", + "transformed__prkg1_mut", + "transformed__tbl1xr1_mut", + "transformed__frmd3_mut", + "transformed__smad2_mut", + "transformed__sgcd_mut", + "transformed__spaca1_mut", + "transformed__rasgef1b_mut", + "transformed__hist1h2bc_mut", + "transformed__nr2f1_mut", + "transformed__klrg1_mut", + "transformed__mbl2_mut", + "transformed__mtap_mut", + "transformed__ppp2cb_mut", + "transformed__smarcd1_mut", + "transformed__nras_mut", + "transformed__ndfip1_mut", + "transformed__hras_mut", + "transformed__prps2_mut", + "transformed__smarcb1_mut", + "transformed__stmn2_mut", + "transformed__siah1_mut" ] }, "feature_selection": { @@ -557,7 +730,7 @@ "RFECV": { "usage": true, "params": { - "min_features_to_select": 40, + "min_features_to_select": 50, "scoring": "accuracy", "step": 0.1, "cv": { @@ -573,7 +746,6 @@ } } }, - "feature_engineering": {}, "classification": { "usage": 1, "train_test_split": { @@ -585,24 +757,28 @@ "DummyClassifier": { "strategy": "most_frequent" }, + "cv_settings": { + "n_iter": 1, + "scoring": ["accuracy"], + "cv": { + "StratifiedKFold": { + "n_splits": 10, + "shuffle": true, + "random_state": 0 + } + }, + "n_jobs": 2 + }, + "optimization": { + "optuna": { "usage": true }, + "RandomizedSearchCV": { "usage": false } + }, "models": { "linear_model": { "LogisticRegression": { "usage": 1, "run_feature_selection": true, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "penalty": ["l1", "l2"], "C": "np.logspace(-3, 2, 6)", @@ -612,21 +788,9 @@ } }, "Perceptron": { - "usage": 1, + "usage": 0, "run_feature_selection": true, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "penalty": ["l2", "l1", "elasticnet"], "alpha": [0.0001, 0.001, 0.01], @@ -639,21 +803,9 @@ } }, "SGDClassifier": { - "usage": 1, + "usage": 0, "run_feature_selection": true, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "loss": [ "hinge", @@ -677,21 +829,9 @@ } }, "RidgeClassifier": { - "usage": 1, + "usage": 0, "run_feature_selection": true, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "alpha": [0.1, 1.0, 10.0], "solver": [ @@ -710,21 +850,9 @@ }, "neighbors": { "KNeighborsClassifier": { - "usage": 1, + "usage": 0, "run_feature_selection": false, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "n_neighbors": [3, 5, 7, 9], "weights": ["uniform", "distance"], @@ -737,21 +865,9 @@ }, "svm": { "SVC": { - "usage": 1, + "usage": 0, "run_feature_selection": false, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "C": [0.1, 1, 10], "kernel": ["linear", "rbf", "poly"], @@ -764,21 +880,9 @@ }, "naive_bayes": { "ComplementNB": { - "usage": 1, + "usage": 0, "run_feature_selection": false, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "alpha": [ 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 0.001, 0.01, 0.1, 0.5, 1 @@ -789,21 +893,9 @@ }, "tree": { "DecisionTreeClassifier": { - "usage": 1, + "usage": 0, "run_feature_selection": false, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "criterion": ["gini", "entropy"], "splitter": ["best", "random"], @@ -816,21 +908,9 @@ }, "ensemble": { "RandomForestClassifier": { - "usage": 1, + "usage": 0, "run_feature_selection": false, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "n_estimators": [10, 50, 100, 200, 300], "criterion": ["gini", "entropy"], @@ -842,21 +922,9 @@ } }, "HistGradientBoostingClassifier": { - "usage": 1, + "usage": 0, "run_feature_selection": false, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "max_leaf_nodes": [15, 31, 63, 127], "max_depth": [10, 20, 30], @@ -866,21 +934,9 @@ } }, "AdaBoostClassifier": { - "usage": 1, + "usage": 0, "run_feature_selection": false, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "n_estimators": [10, 50, 100, 200, 300], "algorithm": ["SAMME.R"], @@ -892,21 +948,9 @@ }, "neural_network": { "MLPClassifier": { - "usage": 1, + "usage": 0, "run_feature_selection": false, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "activation": ["identity", "logistic", "tanh", "relu"], "solver": ["lbfgs", "sgd", "adam"], @@ -921,21 +965,9 @@ }, "XGBoost": { "XGBClassifier": { - "usage": 1, + "usage": 0, "run_feature_selection": false, "hyperparameters": { - "hyperparameter_settings": { - "n_iter": 3, - "scoring": ["accuracy"], - "cv": { - "RepeatedStratifiedKFold": { - "n_splits": 7, - "n_repeats": 3, - "random_state": 0 - } - }, - "n_jobs": 2 - }, "param_distribution": { "n_estimators": [10, 50, 100, 200, 300], "max_depth": [3, 5, 7, 9, 12], @@ -943,6 +975,33 @@ } } } + }, + "CatBoost": { + "CatBoostClassifier": { + "usage": 0, + "run_feature_selection": false, + "hyperparameters": { + "param_distribution": { + "depth": [4, 6, 8, 10], + "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], + "min_child_samples": [1, 5, 10, 20, 30] + } + } + } + }, + "LightGBM": { + "LGBMClassifier": { + "usage": 0, + "run_feature_selection": false, + "hyperparameters": { + "param_distribution": { + "n_estimators": [10, 50, 100, 200, 300], + "max_depth": [3, 5, 7, 9, 12], + "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3], + "num_leaves": [15, 31, 63, 127, 255] + } + } + } } } } diff --git a/preprocessing/src/main.py b/preprocessing/src/main.py index 4479e16..d5d905b 100644 --- a/preprocessing/src/main.py +++ b/preprocessing/src/main.py @@ -57,27 +57,35 @@ class Orchestrator(object): - def __init__(self, config, path_backbone, data_dict, saving_path): + def __init__(self, config, path_backbone, data_dict, saving_path, encoder_dict): self.config = config self.path_backbone = path_backbone self.data_dict = data_dict self.saving_path = saving_path + self.encoder_dict = encoder_dict def run_PreProcessor(self) -> dict: logger.warning( "Note: It is important that numerical columns are pre-processed first, before categorical ones. This is to avoid OneHotEncoded columns (binary 0 and 1) to be seen as numerical during numerical pre-processing." ) config = self.config["preprocessing"] - PreProcessorPipeline = PreProcessor(config=config) + PreProcessorPipeline = PreProcessor( + config=config, encoder_dict=self.encoder_dict + ) data = self.data_dict["raw_data"].copy() + data = PreProcessorPipeline.NA_solver(data=data) + # Start by pre-processing selected numerical columns - data = PreProcessorPipeline.encoders(data=data, dtype="numerical") + data, encoder_dict = PreProcessorPipeline.encoders(data=data, dtype="numerical") # Finally, process categorical columns - data = PreProcessorPipeline.encoders(data=data, dtype="categorical") + data, encoder_dict = PreProcessorPipeline.encoders( + data=data, dtype="categorical" + ) + self.encoder_dict = encoder_dict self.data_dict.setdefault("preprocessed_data", data) def run_ModelTraining(self) -> dict: @@ -100,15 +108,6 @@ def run_ModelTraining(self) -> dict: model_settings = config[modelling_problem_type] - # Load full dataset - # TODO: add NA_solver here - data.fillna(0, inplace=True) - - # Drop all rows that contain NA - # logger.info(f"Data shape before dropping NAs {data.shape}") - # data.dropna(inplace=True) - # logger.info(f"Data shape after dropping NAs {data.shape}") - if "all" in features: features = data.columns.tolist() features.remove(target) @@ -131,6 +130,9 @@ def run_ModelTraining(self) -> dict: X_train, X_test, y_train, y_test = ModelTrainingPipeline.tabular_data_split( X, y, modelling_problem_type ) + self.data_dict["original_X_test"] = X_test.copy() + self.data_dict["original_X_train"] = X_train.copy() + # Loop through each model, optimize them, find the model that generalizes best and get the predictions from this best model. if modelling_problem_type == "classification": best_clf, cv_results_dict, param_distribution, label_encoder, X_test_new = ( @@ -147,6 +149,11 @@ def run_ModelTraining(self) -> dict: # Save the best model ModelTrainingPipeline.save_best_model(best_clf) + # Shap values and feature importance analysis + ModelTrainingPipeline.shap_analysis( + best_clf, X_test_new, self.data_dict["original_X_test"], y_test + ) + elif modelling_problem_type == "regression": baseline_model = ModelTrainingPipeline.find_best_reg( model_settings, X_train, X_test, y_train, y_test, modelling_problem_type @@ -190,11 +197,13 @@ def main(CONFIG_PATH: str): logger.info(f"Directory where outputs will be saved: {directory}") data_dict = {"raw_data": data} + encoder_dict = {"numerical_encoder": {}, "categorical_encoder": {}} ORCHESTRATOR = Orchestrator( config=config, path_backbone=path_backbone, data_dict=data_dict, saving_path=directory, + encoder_dict=encoder_dict, ) ORCHESTRATOR.run_PreProcessor() ORCHESTRATOR.run_ModelTraining() diff --git a/preprocessing/src/preprocessing.py b/preprocessing/src/preprocessing.py index b0cff2c..689df9f 100644 --- a/preprocessing/src/preprocessing.py +++ b/preprocessing/src/preprocessing.py @@ -16,14 +16,22 @@ from loguru import logger import pandas as pd import warnings +from sklearn.linear_model import LinearRegression +from typing import List +import pandas as pd +import numpy as np +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import OrdinalEncoder +from sklearn.ensemble import HistGradientBoostingClassifier warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning) class PreProcessor: - def __init__(self, config: dict): + def __init__(self, config: dict, encoder_dict: dict): self.config = config + self.encoder_dict = encoder_dict def encoders(self, data: pd.DataFrame, dtype: str): @@ -116,6 +124,7 @@ def encoders(self, data: pd.DataFrame, dtype: str): remainder="drop", verbose_feature_names_out=True, ) + # Create a new dataframe with just the new transformed columns new_data = pd.DataFrame( preprocessor.fit_transform(data_cp), @@ -139,6 +148,69 @@ def encoders(self, data: pd.DataFrame, dtype: str): ], axis=1, ) - return data_cp - def NA_solver(self): ... + if dtype == "categorical": + self.encoder_dict["categorical_encoder"][ + preprocessor_name + ] = preprocessor + elif dtype == "numerical": + self.encoder_dict["numerical_encoder"][preprocessor_name] = preprocessor + + return data_cp, self.encoder_dict + + def NA_solver(self, data: pd.DataFrame): + + logger.info(f"Running NaN solver") + from sklearn.ensemble import HistGradientBoostingRegressor + import re + + # Create a copy of the original dataframe for regression + data_copy = data.copy() + + # Convert object/string predictors in X to OrdinalEncoded integers + for column in data.columns: + if column != "mutation_count" and "_mut" in column: + data[column] = data_copy[column].apply( + lambda x: 1 if isinstance(x, str) else x + ) + data[column] = data[column].astype(object) + else: + encoder = OrdinalEncoder() + data[column] = encoder.fit_transform( + data_copy[column].values.reshape(-1, 1) + ) + for column in data.columns: + if data[column].dtype == "object": + # Fill NAs with majority voting + majority_vote = data[column].mode().iloc[0] + data[column].fillna(majority_vote, inplace=True) + elif data[column].dtype == "float" or data[column].dtype == "int": + # Fill NAs with regression imputation + missing_indices = data[column].isnull() + non_missing_indices = ~missing_indices + + if missing_indices.sum() > 0: + # Create a regression model + model = HistGradientBoostingRegressor() + + # Prepare the input data for regression + X = data[non_missing_indices].drop(column, axis=1) + y = data[column][non_missing_indices] + + # Fit the model using non-missing values + model.fit(X, y) + + # Predict the missing values + X_missing = data[missing_indices].drop(column, axis=1) + predicted_values = model.predict(X_missing) + + # Adjust the result if the column is count data + if not data[column].apply(float.is_integer).all(): + predicted_values = np.round(predicted_values) + + # Fill the missing values with the predicted values + data[column][missing_indices] = predicted_values + else: + # Skip invalid column type + continue + return data diff --git a/preprocessing/src/train_model.py b/preprocessing/src/train_model.py index 02ebf6e..bb09ba7 100644 --- a/preprocessing/src/train_model.py +++ b/preprocessing/src/train_model.py @@ -26,6 +26,9 @@ ) from sklearn.neighbors import KNeighborsClassifier import xgboost +import catboost +import lightgbm +import optuna # from xgboost import XGBClassifier from sklearn.neural_network import MLPClassifier @@ -45,6 +48,10 @@ HistGradientBoostingRegressor, AdaBoostClassifier, ) + +from catboost import CatBoostClassifier +from lightgbm import LGBMClassifier + from sklearn.model_selection import train_test_split, RandomizedSearchCV from sklearn.feature_selection import ( SequentialFeatureSelector, @@ -196,6 +203,29 @@ def train_and_optimize_clf( y_train = y_train.T.to_numpy()[0] y_test = y_test.T.to_numpy()[0] + # Gather CV settings + cv_settings = model_settings["cv_settings"] + eval_score = cv_settings["scoring"][0] + + # Load cross validation strategy dynamically + cv = _return_cross_validation(cv_settings) + + # Get optimization type + optimization = model_settings["optimization"] + + optimization_type_list = [ + key + for key, value in model_settings["optimization"].items() + if value["usage"] + ] + + if len(optimization_type_list) > 1: + raise ValueError( + f"Only one optimization type can be used. You currently have {len(optimization_type_list)} types selected. Please, adjust the settings." + ) + + optimization_type = optimization_type_list[0] + for model_type, model_config in models.items(): for model_name, model_utils in model_config.items(): if model_utils["usage"] == 1: @@ -207,6 +237,10 @@ def train_and_optimize_clf( if "XGB" in model_name: model = eval(f"xgboost.{model_name}()") + elif "CatBoost" in model_name: + model = eval(f"catboost.{model_name}()") + elif "LGBMClassifier" in model_name: + model = eval(f"lightgbm.{model_name}()") else: model = getattr( importlib.import_module(f"sklearn.{model_type}"), model_name @@ -244,12 +278,6 @@ def train_and_optimize_clf( pipeline = make_pipeline(model) - hyperparameter_settings = model_utils["hyperparameters"][ - "hyperparameter_settings" - ] - - logger.info(f"Running CV for {model_name}.") - # Transform strings in the param_distribution to actual ranges param_distribution_final = { key: ( @@ -264,45 +292,64 @@ def train_and_optimize_clf( # Optimizing the best baseline model on training data logger.info( - f"Optimizing baseline model using RandomizedSearchCV. \n Baseline model is {model_name} and it will be optimized with the following parameters: \n{param_distribution_final}" + f"Optimizing baseline model using '{optimization_type}'. \n Baseline model is {model_name} and it will be optimized with the following parameters: \n{param_distribution_final}" ) - # Load cross validation strategy dynamically - cv = _return_cross_validation(hyperparameter_settings) - - # Optimize hyperparameters - randomized_search = RandomizedSearchCV( - model, # Choose just the model and not the imputer - param_distributions=param_distribution_final, - n_iter=hyperparameter_settings["n_iter"], - scoring=hyperparameter_settings["scoring"], - cv=cv, - random_state=0, - n_jobs=hyperparameter_settings["n_jobs"], - verbose=3, - refit=hyperparameter_settings["scoring"][0], - return_train_score=True, - ).fit(X_train_cp, y_train_cp) + if optimization_type == "optuna": + study = self.optimize_clf( + param_distribution_final, + model, + X_train_cp, + y_train_cp, + cv, + cv_settings, + ) + + best_trial = study.best_trial + best_params = best_trial.params + best_cv_sore = best_trial.value + + print(f"Best hyperparameters for {model_name}: {best_params}") + print(f"Best score: {best_cv_sore}") + + optimized_clf = model.set_params(**best_params) + + optimized_clf.fit(X_train_cp, y_train_cp) + + elif optimization_type == "RandomizedSearchCV": + # Optimize hyperparameters + optimized_clf = RandomizedSearchCV( + model, # Choose just the model and not the imputer + param_distributions=param_distribution_final, + n_iter=cv_settings["n_iter"], + scoring=cv_settings["scoring"], + cv=cv, + random_state=0, + n_jobs=cv_settings["n_jobs"], + verbose=3, + refit=cv_settings["scoring"][0], + return_train_score=True, + ).fit(X_train_cp, y_train_cp) # Apply cross validation to get the best model - if hasattr(randomized_search, "best_estimator_"): + if hasattr(optimized_clf, "best_estimator_"): cv_results = cross_validate( - randomized_search.best_estimator_, + optimized_clf.best_estimator_, X_train_cp, y_train_cp, cv=cv, - scoring=hyperparameter_settings["scoring"], + scoring=cv_settings["scoring"], return_estimator=True, return_train_score=True, error_score="raise", ) else: cv_results = cross_validate( - randomized_search, + optimized_clf, X_train_cp, y_train_cp, cv=cv, - scoring=hyperparameter_settings["scoring"], + scoring=cv_settings["scoring"], return_estimator=True, return_train_score=True, error_score="raise", @@ -311,7 +358,6 @@ def train_and_optimize_clf( cv_results_dict[model_name] = cv_results # This "test_scoring_type" is actually the validation set. The actual test set will only be used to calculate the final classification report to avoid data leakage.Do note that if more scorings are added to the list, only the first one will be used to evaluate the best model score - eval_score = hyperparameter_settings["scoring"][0] cv_results_test = cv_results[f"test_{eval_score}"] cv_results_train = cv_results[f"train_{eval_score}"] current_model_score = cv_results_test.mean() @@ -323,7 +369,7 @@ def train_and_optimize_clf( # NOTE: it doesn't make sense to add the Confusion Matrix for the traning data since the model has already been trained and seen the training data. If we fit the model again on the training data, it will provide much better results in the confusion matrix than the results that were seen in the training and validation scores. Confusion Matrix is only used on the test data self.plot_training_curves( - randomized_search, + optimized_clf, eval_score=eval_score, test_label_name="val", model_name=model_name, @@ -333,7 +379,7 @@ def train_and_optimize_clf( # Use the optimized model to predict on the actual test set (unseen data) test_results, pred_metrics_dict, y_pred_decoded = self.predict_clf( - randomized_search, + optimized_clf, model_settings, X_test_cp, y_test, @@ -354,7 +400,7 @@ def train_and_optimize_clf( ][eval_score] logger.info( - f"\n\n- Score being used to attain the best model is '{eval_score}'.\n\n- The scores for the optimized version with RandomGridSearchCV of the '{model_name}' model are:\n\n- Validation score:\n" + f"\n\n- Score being used to attain the best model is '{eval_score}'.\n\n- The scores for the optimized version with '{optimization_type}' of the '{model_name}' model are:\n\n- Validation score:\n" f"{cv_results_test.mean():.3f} ± {cv_results_test.std():.3f}.\n\n- Training score:\n" f"{cv_results_train.mean():.3f} ± {cv_results_train.std():.3f}. \n\n- Test score:\n{test_model_score:.3f}" ) @@ -362,7 +408,7 @@ def train_and_optimize_clf( # Check if test score is better than the previous best score. If so, save the current parameters and results as well as the best classifier (best classifier = the classifier that performed best on the unseen test data) if test_model_score > best_score: best_score = test_model_score - best_baseline_model = randomized_search + best_baseline_model = optimized_clf param_distribution = model_utils["hyperparameters"][ "param_distribution" ] @@ -370,9 +416,14 @@ def train_and_optimize_clf( best_model_test_results = pred_metrics_dict best_model_X_test = X_test_cp - logger.info( - f"Best model is: {best_baseline_model.best_estimator_.__class__.__name__} and it has the following optimized parameters:\n\n {best_baseline_model.best_params_}\n\n... with a test '{eval_score}' of {best_score:.3f}" - ) + if hasattr(best_baseline_model, "best_estimator_"): + logger.info( + f"Best model is: {best_baseline_model.best_estimator_.__class__.__name__} and it has the following optimized parameters:\n\n {best_baseline_model.best_params_}\n\n... with a test '{eval_score}' of {best_score:.3f}" + ) + else: + logger.info( + f"Best model is: {best_baseline_model.__class__.__name__} and it has the following optimized parameters:\n\n {best_baseline_model.get_params}\n\n... with a test '{eval_score}' of {best_score:.3f}" + ) # Plot the training metrics self.plot_CV_results( @@ -398,6 +449,63 @@ def train_and_optimize_clf( best_model_X_test, ) + # Define the objective function + + def optimize_clf( + self, + param_distribution, + model, + X_train_cp, + y_train_cp, + cv, + cv_settings, + ): + from sklearn.model_selection import cross_val_score + + def objective( + trial, + param_distribution, + model, + X_train_cp, + y_train_cp, + cv, + cv_settings, + ): + from sklearn.metrics import accuracy_score + + params = { + k: trial.suggest_categorical(k, v) + for k, v in param_distribution.items() + } + model.set_params(**params) + return cross_val_score( + model, + X_train_cp, + y_train_cp, + cv=cv, + scoring=cv_settings["scoring"][0], + ).mean() + + return cross_val_score + + # Create a study object and optimize it + study = optuna.create_study(direction="maximize") + + study.optimize( + lambda trial: objective( + trial=trial, + param_distribution=param_distribution, + model=model, + X_train_cp=X_train_cp, + y_train_cp=y_train_cp, + cv=cv, + cv_settings=cv_settings, + ), + n_trials=25, + ) + + return study + def predict_clf( self, best_baseline_model, @@ -410,7 +518,10 @@ def predict_clf( ): # Get information for the best optimized model - best_model = best_baseline_model.best_estimator_ + if hasattr(best_baseline_model, "best_estimator_"): + best_model = best_baseline_model.best_estimator_ + else: + best_model = best_baseline_model best_model_name = best_model.__class__.__name__ # Get test score (no need for this as its already in the classifiction report down below) @@ -476,7 +587,6 @@ def save_best_model(self, best_model): f"{self.saving_path}/Model/Best Optimized Model/{best_model_name}.joblib", ) - def shapley_values(self): ... def plot_conf_matrix(self, clf, X, y, model_name, prefix): from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay @@ -503,9 +613,12 @@ def plot_training_curves( cv_results, use_from_cross_val=False, ): - - val_scores = clf.cv_results_[f"mean_test_{eval_score}"] - train_scores = clf.cv_results_[f"mean_train_{eval_score}"] + if hasattr(clf, "cv_results_"): + val_scores = clf.cv_results_[f"mean_test_{eval_score}"] + train_scores = clf.cv_results_[f"mean_train_{eval_score}"] + else: + val_scores = cv_results[f"test_{eval_score}"] + train_scores = cv_results[f"train_{eval_score}"] plt.plot(val_scores, label=test_label_name) plt.plot(train_scores, label="train") @@ -553,10 +666,12 @@ def feature_selection(self, selector_type, model, config, X_train, y_train): importance_getter=local_config["importance_getter"], ) - selector_fitted = selector.fit(X_train, y_train) - selected_features = [ - col for col in selector_fitted.get_feature_names_out(X_train.columns) - ] + selector_fitted = selector.fit(X_train, y_train) + selected_features = ( + selector_fitted.get_feature_names_out(data.columns) + if selector_type == "SequentialFeatureSelector" + else selector_fitted.get_feature_names_out(X_train.columns) + ) return selector_fitted, selected_features @@ -697,3 +812,87 @@ def plot_clfs( with plt.rc_context(): # Use this to set figure params like size and dpi plt.savefig(f"{saving_path}\\Plots\\AUC.png", bbox_inches="tight") # plt.close() + + def shap_analysis(self, model, X_test, X_test_original, y_test): + import shap + + shap.initjs() + + # Check the type of the model and return the appropriate explainer + X_test_original_cp = X_test_original.copy() + + for col_type, preprocessor_dict in self.encoder_dict.items(): + if col_type == "categorical_encoder": + for col, encoder in preprocessor_dict.items(): + X_test_original_cp = encoder.inverse_transform(X_test_original_cp) + if col_type == "numerical_encoder": + for col, encoder in preprocessor_dict.items(): + X_test_original_cp = encoder.inverse_transform(X_test_original_cp) + if isinstance( + model, (LogisticRegression, Perceptron, SGDClassifier, RidgeClassifierCV) + ): + explainer = shap.LinearExplainer(model, X_test_original_cp) + elif isinstance( + model, + ( + RandomForestClassifier, + DecisionTreeClassifier, + HistGradientBoostingClassifier, + AdaBoostClassifier, + XGBClassifier, + CatBoostClassifier, + LGBMClassifier, + ), + ): + explainer = shap.TreeExplainer(model) + elif isinstance( + model, (KNeighborsClassifier, SVC, ComplementNB, MLPClassifier) + ): + explainer = shap.KernelExplainer(model.predict_proba, X_test_original_cp) + else: + raise ValueError(f"Unsupported model: {type(model)}") + + explainer_shap = explainer(X_test_original_cp) + shap_values = explainer.shap_values(X_test_original_cp) + shap.plots.bar(explainer_shap) + with plt.rc_context(): # Use this to set figure params like size and dpi + plt.savefig( + f"{self.saving_path}\\Plots\\SHAP_Bar_Plot.png", bbox_inches="tight" + ) + plt.close() + + shap.summary_plot(explainer_shap, X_test_original_cp) + with plt.rc_context(): # Use this to set figure params like size and dpi + plt.savefig( + f"{self.saving_path}\\Plots\\SHAP_Summary_Plot.png", bbox_inches="tight" + ) + plt.close() + + # Create a SHAP decision plot for the first instance + shap.decision_plot( + explainer.expected_value, shap_values[0, :], X_test_original_cp.iloc[0, :] + ) + with plt.rc_context(): + # Save the plots to the reports folder + plt.savefig( + f"{self.saving_path}\\Plots\\SHAP_Decision_Plot.png", + bbox_inches="tight", + ) + plt.close() + + # add a force plot + shap.force_plot( + explainer.expected_value, + shap_values[0, :], + X_test_original_cp.iloc[0, :], + matplotlib=True, + ) + # save the force plot with rc + with plt.rc_context(): + plt.savefig( + f"{self.saving_path}\\Plots\\SHAP_Force_Plot.png", + bbox_inches="tight", + ) + plt.close() + + return shap_values, explainer diff --git a/preprocessing/src/utils.py b/preprocessing/src/utils.py index 748fbac..c90d4e5 100644 --- a/preprocessing/src/utils.py +++ b/preprocessing/src/utils.py @@ -23,7 +23,9 @@ def _return_cross_validation(local_config: dict): cv_class_name = str(list(local_config["cv"].keys())[0]) cv_params = local_config["cv"][cv_class_name] - logger.info(f"Loading cross-validation strategy: {cv_class_name}. ") + logger.info( + f"\n\nLoading cross-validation strategy '{cv_class_name}' with the following parameters:\n{cv_params}. " + ) cv = getattr( importlib.import_module(f"sklearn.model_selection"), str(list(local_config["cv"].keys())[0]),