Merge pull request #421 from ZJUEarthData/dev/Haibin_Lai
feat: Adding AdaBoost Classification
SanyHe authored Jan 20, 2025
2 parents 8a14bad + d759cf0 commit e68ce44
Showing 5 changed files with 220 additions and 1 deletion.
1 change: 1 addition & 0 deletions geochemistrypi/data_mining/constants.py
@@ -56,6 +56,7 @@
"Gradient Boosting",
"K-Nearest Neighbors",
"Stochastic Gradient Descent",
"AdaBoost",
# "Bagging Classification",
# "Decision Tree",
# Histogram-based Gradient Boosting,
174 changes: 173 additions & 1 deletion geochemistrypi/data_mining/model/classification.py
@@ -10,7 +10,7 @@
from flaml import AutoML
from multipledispatch import dispatch
from rich import print
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
@@ -23,6 +23,7 @@
from ..plot.statistic_plot import basic_statistic
from ..utils.base import clear_output, save_data, save_data_without_data_identifier, save_fig, save_text
from ._base import LinearWorkflowMixin, TreeWorkflowMixin, WorkflowBase
from .func.algo_classification._adaboost import adaboost_manual_hyper_parameters
from .func.algo_classification._common import (
cross_validation,
plot_2d_decision_boundary,
@@ -44,6 +45,7 @@
MLPSpecialFunction,
RandomForestSpecialFunction,
XGBoostSpecialFunction,
AdaboostSpecialFunction,
)
from .func.algo_classification._extra_trees import extra_trees_manual_hyper_parameters
from .func.algo_classification._gradient_boosting import gradient_boosting_manual_hyper_parameters
@@ -2988,6 +2990,176 @@ def special_components(self, is_automl: bool, **kwargs) -> None:
)


class AdaBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
"""The automation workflow of using AdaBoosting algorithm to make insightful products."""

name = "AdaBoost"
special_function = [func.value for func in AdaboostSpecialFunction]

def __init__(
self,
estimator: object = None,
*,
n_estimators: int = 50,
learning_rate: float = 1.0,
random_state: Optional[int] = None,
max_depth: int = 3,
# algorithm: str = "SAMME",  # deprecated in scikit-learn 1.6; this estimator only implements 'SAMME'
) -> None:
"""
Parameters
----------
estimator : object, default=None
The base estimator from which the boosted ensemble is built.
Support for sample weighting is required, as well as proper
``classes_`` and ``n_classes_`` attributes. If ``None``, then
the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier`
initialized with `max_depth=1`.
.. versionadded:: 1.2
`base_estimator` was renamed to `estimator`.
n_estimators : int, default=50
The maximum number of estimators at which boosting is terminated.
In case of perfect fit, the learning procedure is stopped early.
Values must be in the range `[1, inf)`.
learning_rate : float, default=1.0
Weight applied to each classifier at each boosting iteration. A higher
learning rate increases the contribution of each classifier. There is
a trade-off between the `learning_rate` and `n_estimators` parameters.
Values must be in the range `(0.0, inf)`.
algorithm : {'SAMME'}, default='SAMME'
Use the SAMME discrete boosting algorithm.
.. deprecated:: 1.6
`algorithm` is deprecated and will be removed in version 1.8. This
estimator only implements the 'SAMME' algorithm.
random_state : int, RandomState instance or None, default=None
Controls the random seed given at each `estimator` at each
boosting iteration.
Thus, it is only used when `estimator` exposes a `random_state`.
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.
max_depth : int, default=3
The maximum depth of the default :class:`~sklearn.tree.DecisionTreeClassifier`
weak learner built when `estimator` is ``None``.
References
----------
Scikit-learn API: sklearn.ensemble.AdaBoostClassifier
https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
"""
super().__init__()
# Honor a user-supplied base estimator; otherwise fall back to a
# depth-limited decision tree as the weak learner.
if estimator is not None:
    self.estimator = estimator
else:
    self.estimator = DecisionTreeClassifier(max_depth=max_depth)
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.max_depth = max_depth

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'.
# Use an explicit None check so that random_state=0 is not silently ignored.
if random_state is not None:
    self.random_state = random_state

self.model = AdaBoostClassifier(
estimator=self.estimator,  # 'base_estimator' was renamed to 'estimator' in scikit-learn 1.2
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
# algorithm=self.algorithm,  # deprecated in scikit-learn 1.6, removed in 1.8
random_state=self.random_state,
)

self.naming = AdaBoostClassification.name
self.customized = True
self.customized_name = "AdaBoost"

@property
def settings(self) -> Dict:
"""The configuration of AdaBoosting to implenment AutoML by FLAML framework."""
configuration = {
"time_budget": 10, # total running time in seconds
"metric": "accuracy",
"estimator_list": [self.customized_name], # list of ML learners
"task": "classification", # task type
}
return configuration

@property
def customization(self) -> object:
"""The customized AdaBoosting of FLAML framework"""
from flaml import tune
from flaml.data import CLASSIFICATION
from flaml.model import SKLearnEstimator
from sklearn.ensemble import AdaBoostClassifier

class MyAdaBoostClassification(SKLearnEstimator):
def __init__(self, task="classification", n_jobs=None, **config):
super().__init__(task, **config)
if task in CLASSIFICATION:
self.estimator_class = AdaBoostClassifier

@classmethod
def search_space(cls, data_size, task):
space = {  # FLAML can tune only these two hyperparameters
"n_estimators": {
"domain": tune.lograndint(lower=4, upper=512),
"init_value": 50,
},
"learning_rate": {
"domain": tune.loguniform(lower=0.001, upper=1.0),
"init_value": 0.1,
},
}
return space

return MyAdaBoostClassification

@classmethod
def manual_hyper_parameters(cls) -> Dict:
"""Manual hyper-parameters specification."""
print(f"-*-* {cls.name} - Hyper-parameters Specification -*-*")
hyper_parameters = adaboost_manual_hyper_parameters()
clear_output()
return hyper_parameters

@dispatch()
def special_components(self, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
X_train=AdaBoostClassification.X_train,
name_column=AdaBoostClassification.name_train,
trained_model=self.model,
image_config=self.image_config,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
self._plot_tree(
trained_model=self.model.estimators_[0],
image_config=self.image_config,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)

@dispatch(bool)
def special_components(self, is_automl: bool, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by FLAML frameworks"""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
X_train=AdaBoostClassification.X_train,
name_column=AdaBoostClassification.name_train,
trained_model=self.auto_model,
image_config=self.image_config,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
self._plot_tree(
trained_model=self.auto_model.estimators_[0],
image_config=self.image_config,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
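The two `special_components` overloads above are selected by `multipledispatch`: the zero-argument form serves the scikit-learn path and the `bool` form the FLAML path. A minimal standalone sketch of that selection mechanism (the `Demo` class is hypothetical and not part of this PR):

from multipledispatch import dispatch

class Demo:
    @dispatch()
    def run(self) -> str:
        # Chosen when called with no arguments, mirroring special_components().
        return "scikit-learn path"

    @dispatch(bool)
    def run(self, is_automl: bool) -> str:
        # Chosen when called with a single bool, mirroring special_components(True).
        return "FLAML path" if is_automl else "scikit-learn path"

demo = Demo()
print(demo.run())      # scikit-learn path
print(demo.run(True))  # FLAML path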


class KNNClassification(ClassificationWorkflowBase):
"""The automation workflow of using KNN algorithm to make insightful products."""

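For context, FLAML consumes the class returned by `customization`, not an instance: it instantiates the learner afresh for each trial with hyperparameters sampled from `search_space`. A minimal sketch of the registration step the workflow performs internally, on synthetic data (`add_learner` is FLAML's public API for custom learners; `MyAdaBoostClassification` is assumed to be importable here):

from flaml import AutoML
from sklearn.datasets import make_classification

X_train, y_train = make_classification(n_samples=200, n_features=8, random_state=0)

automl = AutoML()
# Register the customized learner under the same name used in the `settings` property.
automl.add_learner(learner_name="AdaBoost", learner_class=MyAdaBoostClassification)
automl.fit(
    X_train,
    y_train,
    time_budget=10,  # seconds, mirroring the `settings` property
    metric="accuracy",
    estimator_list=["AdaBoost"],
    task="classification",
)
print(automl.best_config)  # the tuned n_estimators and learning_rate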
32 changes: 32 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_classification/_adaboost.py
@@ -0,0 +1,32 @@
from typing import Dict

from rich import print

from ....constants import SECTION
from ....data.data_readiness import float_input, num_input


def adaboost_manual_hyper_parameters() -> Dict:
"""
Manually set hyperparameters.
Returns
-------
hyper_parameters : dict
"""
print("N Estimators: The number of trees in the AdaBoost.")
print("Please specify the number of trees in the forest. A good starting range could be between 50 and 500, such as 100.")
n_estimators = num_input(SECTION[2], "@N Estimators: ")

print("Learning Rate: It controls the step-size in updating the weights. It shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators.")
print("Please specify the initial learning rate of Xgboost, such as 0.1.")
learning_rate = float_input(0.01, SECTION[2], "@Learning Rate: ")
print("Max Depth: The maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit.")
print("Please specify the maximum depth of a tree. A good starting value could be between 1 and 20, such as 3.")
max_depth = num_input(SECTION[2], "@Max Depth: ")

hyper_parameters = {
"n_estimators": n_estimators,
"learning_rate": learning_rate,
"max_depth": max_depth,
}
return hyper_parameters
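A hypothetical interactive run of the function above (the typed values are illustrative only):

hyper_parameters = adaboost_manual_hyper_parameters()
# The user enters 100, 0.1, and 3 at the three prompts, so the function returns:
# {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3}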
@@ -45,3 +45,7 @@ class ExtraTreesSpecialFunction(Enum):
class GradientBoostingSpecialFunction(Enum):
FEATURE_IMPORTANCE = "Feature Importance"
TREE_DIAGRAM = "Tree Diagram"


class AdaboostSpecialFunction(Enum):
FEATURE_IMPORTANCE = "Feature Importance Diagram"
TREE_DIAGRAM = "Single Tree Diagram"
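With the two-member form above, the `special_function` lookup in `classification.py` resolves to a flat list of labels (sketch):

special_function = [func.value for func in AdaboostSpecialFunction]
# -> ["Feature Importance Diagram", "Single Tree Diagram"]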
10 changes: 10 additions & 0 deletions geochemistrypi/data_mining/process/classify.py
@@ -6,6 +6,7 @@

from ..constants import MLFLOW_ARTIFACT_DATA_PATH
from ..model.classification import (
AdaBoostClassification,
ClassificationWorkflowBase,
DecisionTreeClassification,
ExtraTreesClassification,
@@ -139,6 +140,13 @@ def activate(
subsample=hyper_parameters["subsample"],
loss=hyper_parameters["loss"],
)
elif self.model_name == "AdaBoost":
hyper_parameters = AdaBoostClassification.manual_hyper_parameters()
self.clf_workflow = AdaBoostClassification(
n_estimators=hyper_parameters["n_estimators"],
learning_rate=hyper_parameters["learning_rate"],
max_depth=hyper_parameters["max_depth"],
)
elif self.model_name == "K-Nearest Neighbors":
hyper_parameters = KNNClassification.manual_hyper_parameters()
self.clf_workflow = KNNClassification(
@@ -240,6 +248,8 @@ def activate(
self.clf_workflow = ExtraTreesClassification()
elif self.model_name == "Gradient Boosting":
self.clf_workflow = GradientBoostingClassification()
elif self.model_name == "AdaBoost":
self.clf_workflow = AdaBoostClassification()
elif self.model_name == "K-Nearest Neighbors":
self.clf_workflow = KNNClassification()
elif self.model_name == "Stochastic Gradient Descent":
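Taken together, the manual branch above builds what amounts to a plain scikit-learn `AdaBoostClassifier` over a depth-limited decision tree. A self-contained sketch on synthetic data (assumes scikit-learn >= 1.2, where `base_estimator` became `estimator`):

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=8, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),  # the weak learner, as in the workflow
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))  # test-set accuracy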
