diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
index 7863392..2497188 100644
--- a/geochemistrypi/data_mining/constants.py
+++ b/geochemistrypi/data_mining/constants.py
@@ -56,6 +56,7 @@
     "Gradient Boosting",
     "K-Nearest Neighbors",
     "Stochastic Gradient Descent",
+    "AdaBoost",
     # "Bagging Classification",
     # "Decision Tree",
     # Histogram-based Gradient Boosting,
diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py
index 3a97f27..fc75c9d 100644
--- a/geochemistrypi/data_mining/model/classification.py
+++ b/geochemistrypi/data_mining/model/classification.py
@@ -10,7 +10,7 @@
 from flaml import AutoML
 from multipledispatch import dispatch
 from rich import print
-from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
+from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
 from sklearn.linear_model import LogisticRegression, SGDClassifier
 from sklearn.metrics import classification_report
 from sklearn.neighbors import KNeighborsClassifier
@@ -23,6 +23,7 @@
 from ..plot.statistic_plot import basic_statistic
 from ..utils.base import clear_output, save_data, save_data_without_data_identifier, save_fig, save_text
 from ._base import LinearWorkflowMixin, TreeWorkflowMixin, WorkflowBase
+from .func.algo_classification._adaboost import adaboost_manual_hyper_parameters
 from .func.algo_classification._common import (
     cross_validation,
     plot_2d_decision_boundary,
@@ -44,6 +45,7 @@
+    AdaboostSpecialFunction,
     MLPSpecialFunction,
     RandomForestSpecialFunction,
     XGBoostSpecialFunction,
 )
 from .func.algo_classification._extra_trees import extra_trees_manual_hyper_parameters
 from .func.algo_classification._gradient_boosting import gradient_boosting_manual_hyper_parameters
@@ -2988,6 +2990,176 @@ def special_components(self, is_automl: bool, **kwargs) -> None:
         )
 
 
+class AdaBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
+    """The automation workflow of using AdaBoost algorithm to make insightful products."""
+
+    name = "AdaBoost"
+    special_function = [func.value for func in AdaboostSpecialFunction]
+
+    def __init__(
+        self,
+        estimator: object = None,
+        *,
+        n_estimators: int = 50,
+        learning_rate: float = 1.0,
+        random_state: Union[int, None] = None,
+        max_depth: int = 3,
+        # algorithm: str = "SAMME",  # deprecated since scikit-learn 1.6
+    ) -> None:
+        """
+        Parameters
+        ----------
+        estimator : object, default=None
+            The base estimator from which the boosted ensemble is built.
+            Support for sample weighting is required, as well as proper
+            ``classes_`` and ``n_classes_`` attributes. If ``None``, then
+            the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier`
+            initialized with `max_depth=1`.
+
+            .. versionadded:: 1.2
+               `base_estimator` was renamed to `estimator`.
+
+        n_estimators : int, default=50
+            The maximum number of estimators at which boosting is terminated.
+            In case of perfect fit, the learning procedure is stopped early.
+            Values must be in the range `[1, inf)`.
+
+        learning_rate : float, default=1.0
+            Weight applied to each classifier at each boosting iteration. A higher
+            learning rate increases the contribution of each classifier. There is
+            a trade-off between the `learning_rate` and `n_estimators` parameters.
+            Values must be in the range `(0.0, inf)`.
+
+        algorithm : {'SAMME'}, default='SAMME'
+            Use the SAMME discrete boosting algorithm.
+
+            .. deprecated:: 1.6
+               `algorithm` is deprecated and will be removed in version 1.8. This
+               estimator only implements the 'SAMME' algorithm.
+
+        random_state : int, RandomState instance or None, default=None
+            Controls the random seed given at each `estimator` at each
+            boosting iteration.
+            Thus, it is only used when `estimator` exposes a `random_state`.
+            Pass an int for reproducible output across multiple function calls.
+            See :term:`Glossary <random_state>`.
+
+        max_depth : int, default=3
+            The maximum depth of the default decision-tree base estimator.
+
+        References
+        ----------
+        Scikit-learn API: sklearn.ensemble.AdaBoostClassifier
+        https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
+        """
+        super().__init__()
+        # Honor a user-supplied base estimator; otherwise build the default shallow decision tree.
+        self.estimator = estimator if estimator is not None else DecisionTreeClassifier(max_depth=max_depth)
+        self.n_estimators = n_estimators
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+
+        # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'.
+        if random_state is not None:
+            self.random_state = random_state
+
+        self.model = AdaBoostClassifier(
+            estimator=self.estimator,  # 'base_estimator' was renamed to 'estimator' in scikit-learn 1.2
+            n_estimators=self.n_estimators,
+            learning_rate=self.learning_rate,
+            # algorithm=self.algorithm,  # deprecated since scikit-learn 1.6
+            random_state=self.random_state,
+        )
+
+        self.naming = AdaBoostClassification.name
+        self.customized = True
+        self.customized_name = "AdaBoost"
+
+    @property
+    def settings(self) -> Dict:
+        """The configuration of AdaBoost to implement AutoML by FLAML framework."""
+        configuration = {
+            "time_budget": 10,  # total running time in seconds
+            "metric": "accuracy",
+            "estimator_list": [self.customized_name],  # list of ML learners
+            "task": "classification",  # task type
+        }
+        return configuration
+
+    @property
+    def customization(self) -> object:
+        """The customized AdaBoost of FLAML framework."""
+        from flaml import tune
+        from flaml.data import CLASSIFICATION
+        from flaml.model import SKLearnEstimator
+        from sklearn.ensemble import AdaBoostClassifier
+
+        class MyAdaBoostClassification(SKLearnEstimator):
+            def __init__(self, task="classification", n_jobs=None, **config):
+                super().__init__(task, **config)
+                if task in CLASSIFICATION:
+                    self.estimator_class = AdaBoostClassifier
+
+            @classmethod
+            def search_space(cls, data_size, task):
+                # FLAML tunes only these two hyperparameters.
+                space = {
+                    "n_estimators": {
+                        "domain": tune.lograndint(lower=4, upper=512),
+                        "init_value": 50,
+                    },
+                    "learning_rate": {
+                        "domain": tune.loguniform(lower=0.001, upper=1.0),
+                        "init_value": 0.1,
+                    },
+                }
+                return space
+
+        return MyAdaBoostClassification
+
+    @classmethod
+    def manual_hyper_parameters(cls) -> Dict:
+        """Manual hyper-parameters specification."""
+        print(f"-*-* {cls.name} - Hyper-parameters Specification -*-*")
+        hyper_parameters = adaboost_manual_hyper_parameters()
+        clear_output()
+        return hyper_parameters
+
+    @dispatch()
+    def special_components(self, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by the Scikit-learn framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        self._plot_feature_importance(
+            X_train=AdaBoostClassification.X_train,
+            name_column=AdaBoostClassification.name_train,
+            trained_model=self.model,
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+        )
+        self._plot_tree(
+            trained_model=self.model.estimators_[0],
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+        )
+
+    @dispatch(bool)
+    def special_components(self, is_automl: bool, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by the FLAML framework."""
+        GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        self._plot_feature_importance(
+            X_train=AdaBoostClassification.X_train,
+            name_column=AdaBoostClassification.name_train,
+            trained_model=self.auto_model,
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+        )
+        self._plot_tree(
+            trained_model=self.auto_model.estimators_[0],
+            image_config=self.image_config,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+        )
+
+
 class KNNClassification(ClassificationWorkflowBase):
     """The automation workflow of using KNN algorithm to make insightful products."""
 
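Note: the `settings` and `customization` properties above are consumed by the AutoML path inherited from the workflow base classes; that wiring is outside this diff. As a rough, illustrative sketch of how FLAML picks up such a customized learner (using FLAML's documented `AutoML.add_learner` hook; the toy data and standalone registration here are assumptions for demonstration, not code from this PR, and `MyAdaBoostClassification` is the `SKLearnEstimator` subclass defined in `customization` above):

```python
# Illustrative sketch only -- assumes MyAdaBoostClassification is in scope.
from flaml import AutoML
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=8, random_state=42)

automl = AutoML()
# Register the customized learner under the same name used in `settings`.
automl.add_learner(learner_name="AdaBoost", learner_class=MyAdaBoostClassification)
automl.fit(
    X_train=X,
    y_train=y,
    time_budget=10,               # total running time in seconds, as in `settings`
    metric="accuracy",
    estimator_list=["AdaBoost"],  # restrict the search to the customized learner
    task="classification",
)
print(automl.best_config)         # the tuned n_estimators / learning_rate
```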
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_adaboost.py b/geochemistrypi/data_mining/model/func/algo_classification/_adaboost.py
new file mode 100644
index 0000000..2b16353
--- /dev/null
+++ b/geochemistrypi/data_mining/model/func/algo_classification/_adaboost.py
@@ -0,0 +1,32 @@
+from typing import Dict
+
+from rich import print
+
+from ....constants import SECTION
+from ....data.data_readiness import float_input, num_input
+
+
+def adaboost_manual_hyper_parameters() -> Dict:
+    """
+    Manually set hyperparameters.
+
+    Returns
+    -------
+    hyper_parameters : dict
+    """
+    print("N Estimators: The number of boosting stages (trees) in the AdaBoost ensemble.")
+    print("Please specify the number of trees in the ensemble. A good starting range could be between 50 and 500, such as 100.")
+    n_estimators = num_input(SECTION[2], "@N Estimators: ")
+
+    print("Learning Rate: It controls the step size in updating the weights. It shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators.")
+    print("Please specify the initial learning rate of AdaBoost, such as 0.1.")
+    learning_rate = float_input(0.01, SECTION[2], "@Learning Rate: ")
+
+    print("Max Depth: The maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit.")
+    print("Please specify the maximum depth of a tree. A good starting value could be between 1 and 20, such as 3.")
+    max_depth = num_input(SECTION[2], "@Max Depth: ")
+
+    hyper_parameters = {
+        "n_estimators": n_estimators,
+        "learning_rate": learning_rate,
+        "max_depth": max_depth,
+    }
+    return hyper_parameters
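Note: the prompts above mention the `learning_rate` / `n_estimators` trade-off twice. A minimal sketch of why the two should be tuned together (plain scikit-learn on toy data; illustrative only, not part of this PR):

```python
# Smaller learning rates shrink each tree's contribution, so they usually
# need more boosting rounds to reach comparable accuracy.
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, n_features=10, random_state=0)

for learning_rate, n_estimators in [(1.0, 50), (0.1, 50), (0.1, 500)]:
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    accuracy = cross_val_score(model, X, y, cv=5).mean()
    print(f"learning_rate={learning_rate:<4} n_estimators={n_estimators:<4} accuracy={accuracy:.3f}")
```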
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_enum.py b/geochemistrypi/data_mining/model/func/algo_classification/_enum.py
index 0d3d5e3..ff2b87e 100644
--- a/geochemistrypi/data_mining/model/func/algo_classification/_enum.py
+++ b/geochemistrypi/data_mining/model/func/algo_classification/_enum.py
@@ -45,3 +45,7 @@ class ExtraTreesSpecialFunction(Enum):
 class GradientBoostingSpecialFunction(Enum):
     FEATURE_IMPORTANCE = "Feature Importance"
     TREE_DIAGRAM = "Tree Diagram"
+
+
+class AdaboostSpecialFunction(Enum):
+    FEATURE_IMPORTANCE = "Feature Importance Diagram"
+    SINGLE_TREE_DIAGRAM = "Single Tree Diagram"
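Note: the enum is reshaped here to one member per special function, matching `GradientBoostingSpecialFunction` above, because `classification.py` builds its `special_function` list from the member values. A quick check of what that comprehension yields with each shape:

```python
from enum import Enum

class AdaboostSpecialFunction(Enum):
    FEATURE_IMPORTANCE = "Feature Importance Diagram"
    SINGLE_TREE_DIAGRAM = "Single Tree Diagram"

# Mirrors `special_function = [func.value for func in AdaboostSpecialFunction]`
# in classification.py.
print([func.value for func in AdaboostSpecialFunction])
# ['Feature Importance Diagram', 'Single Tree Diagram']
# A single list-valued member would instead produce a nested list:
# [['Feature Importance Diagram', 'Single Tree Diagram']]
```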
diff --git a/geochemistrypi/data_mining/process/classify.py b/geochemistrypi/data_mining/process/classify.py
index 07ab6b9..0300e3c 100644
--- a/geochemistrypi/data_mining/process/classify.py
+++ b/geochemistrypi/data_mining/process/classify.py
@@ -6,6 +6,7 @@
 from ..constants import MLFLOW_ARTIFACT_DATA_PATH
 from ..model.classification import (
+    AdaBoostClassification,
     ClassificationWorkflowBase,
     DecisionTreeClassification,
     ExtraTreesClassification,
@@ -139,6 +140,13 @@ def activate(
                 subsample=hyper_parameters["subsample"],
                 loss=hyper_parameters["loss"],
             )
+        elif self.model_name == "AdaBoost":
+            hyper_parameters = AdaBoostClassification.manual_hyper_parameters()
+            self.clf_workflow = AdaBoostClassification(
+                n_estimators=hyper_parameters["n_estimators"],
+                learning_rate=hyper_parameters["learning_rate"],
+                max_depth=hyper_parameters["max_depth"],
+            )
         elif self.model_name == "K-Nearest Neighbors":
             hyper_parameters = KNNClassification.manual_hyper_parameters()
             self.clf_workflow = KNNClassification(
@@ -240,6 +248,8 @@ def activate(
             self.clf_workflow = ExtraTreesClassification()
         elif self.model_name == "Gradient Boosting":
             self.clf_workflow = GradientBoostingClassification()
+        elif self.model_name == "AdaBoost":
+            self.clf_workflow = AdaBoostClassification()
         elif self.model_name == "K-Nearest Neighbors":
             self.clf_workflow = KNNClassification()
         elif self.model_name == "Stochastic Gradient Descent":
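Note: the diff carries comments about both the `base_estimator` → `estimator` rename (introduced in scikit-learn 1.2, with `base_estimator` removed in 1.4) and the `algorithm` deprecation (1.6). If the project ever needs to span releases on both sides of the rename, a hedged sketch of a version-tolerant constructor (a hypothetical helper, not part of this PR):

```python
# Hypothetical helper: choose the right keyword for the installed scikit-learn.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


def make_adaboost(max_depth=3, n_estimators=50, learning_rate=1.0, random_state=None):
    base = DecisionTreeClassifier(max_depth=max_depth)
    params = dict(n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)
    try:
        # scikit-learn >= 1.2 accepts 'estimator'
        return AdaBoostClassifier(estimator=base, **params)
    except TypeError:
        # scikit-learn < 1.2 only accepts 'base_estimator'
        return AdaBoostClassifier(base_estimator=base, **params)
```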