Merge pull request #421 from ZJUEarthData/dev/Haibin_Lai
feat: Adding AdaBoost Classification
SanyHe authored Jan 20, 2025
2 parents 8a14bad + d759cf0 commit e68ce44
Showing 5 changed files with 220 additions and 1 deletion.
1 change: 1 addition & 0 deletions geochemistrypi/data_mining/constants.py
@@ -56,6 +56,7 @@
"Gradient Boosting",
"K-Nearest Neighbors",
"Stochastic Gradient Descent",
"AdaBoost",
# "Bagging Classification",
# "Decision Tree",
# Histogram-based Gradient Boosting,
174 changes: 173 additions & 1 deletion geochemistrypi/data_mining/model/classification.py
@@ -10,7 +10,7 @@
from flaml import AutoML
from multipledispatch import dispatch
from rich import print
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
@@ -23,6 +23,7 @@
from ..plot.statistic_plot import basic_statistic
from ..utils.base import clear_output, save_data, save_data_without_data_identifier, save_fig, save_text
from ._base import LinearWorkflowMixin, TreeWorkflowMixin, WorkflowBase
from .func.algo_classification._adaboost import adaboost_manual_hyper_parameters
from .func.algo_classification._common import (
cross_validation,
plot_2d_decision_boundary,
@@ -44,6 +45,7 @@
MLPSpecialFunction,
RandomForestSpecialFunction,
XGBoostSpecialFunction,
AdaboostSpecialFunction,
)
from .func.algo_classification._extra_trees import extra_trees_manual_hyper_parameters
from .func.algo_classification._gradient_boosting import gradient_boosting_manual_hyper_parameters
@@ -2988,6 +2990,176 @@ def special_components(self, is_automl: bool, **kwargs) -> None:
)


class AdaBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
"""The automation workflow of using AdaBoosting algorithm to make insightful products."""

name = "AdaBoost"
special_function = [func.value for func in AdaboostSpecialFunction]

def __init__(
self,
estimator: object = None,
*,
n_estimators: int = 50,
learning_rate: float = 1.0,
random_state: Optional[int] = None,
max_depth: int = 3,
# algorithm: str = "SAMME",  # deprecated in scikit-learn 1.6; this estimator only implements 'SAMME'
) -> None:
"""
Parameters
----------
estimator : object, default=None
The base estimator from which the boosted ensemble is built.
Support for sample weighting is required, as well as proper
``classes_`` and ``n_classes_`` attributes. If ``None``, then
the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier`
initialized with `max_depth=1`.
.. versionadded:: 1.2
`base_estimator` was renamed to `estimator`.
n_estimators : int, default=50
The maximum number of estimators at which boosting is terminated.
In case of perfect fit, the learning procedure is stopped early.
Values must be in the range `[1, inf)`.
learning_rate : float, default=1.0
Weight applied to each classifier at each boosting iteration. A higher
learning rate increases the contribution of each classifier. There is
a trade-off between the `learning_rate` and `n_estimators` parameters.
Values must be in the range `(0.0, inf)`.
algorithm : {'SAMME'}, default='SAMME'
Use the SAMME discrete boosting algorithm.
.. deprecated:: 1.6
`algorithm` is deprecated and will be removed in version 1.8. This
estimator only implements the 'SAMME' algorithm.
random_state : int, RandomState instance or None, default=None
Controls the random seed given at each `estimator` at each
boosting iteration.
Thus, it is only used when `estimator` exposes a `random_state`.
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.
max_depth : int, default=3
The maximum depth of the default :class:`~sklearn.tree.DecisionTreeClassifier`
weak learner built when `estimator` is ``None``.
References
----------
Scikit-learn API: sklearn.ensemble.AdaBoostClassifier
https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
"""
super().__init__()
# Honor a user-supplied base estimator; otherwise fall back to a
# depth-limited decision tree as the weak learner.
if estimator is not None:
    self.estimator = estimator
else:
    self.estimator = DecisionTreeClassifier(max_depth=max_depth)
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.max_depth = max_depth

# If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'.
# Use an explicit None check so that random_state=0 is not silently ignored.
if random_state is not None:
    self.random_state = random_state

self.model = AdaBoostClassifier(
estimator=self.estimator,  # 'base_estimator' was renamed to 'estimator' in scikit-learn 1.2
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
# algorithm=self.algorithm,  # deprecated in scikit-learn 1.6, removed in 1.8
random_state=self.random_state,
)

self.naming = AdaBoostClassification.name
self.customized = True
self.customized_name = "AdaBoost"

@property
def settings(self) -> Dict:
"""The configuration of AdaBoosting to implenment AutoML by FLAML framework."""
configuration = {
"time_budget": 10, # total running time in seconds
"metric": "accuracy",
"estimator_list": [self.customized_name], # list of ML learners
"task": "classification", # task type
}
return configuration

@property
def customization(self) -> object:
"""The customized AdaBoosting of FLAML framework"""
from flaml import tune
from flaml.data import CLASSIFICATION
from flaml.model import SKLearnEstimator
from sklearn.ensemble import AdaBoostClassifier

class MyAdaBoostClassification(SKLearnEstimator):
def __init__(self, task="classification", n_jobs=None, **config):
super().__init__(task, **config)
if task in CLASSIFICATION:
self.estimator_class = AdaBoostClassifier

@classmethod
def search_space(cls, data_size, task):
space = {  # FLAML can tune only these two hyperparameters
"n_estimators": {
"domain": tune.lograndint(lower=4, upper=512),
"init_value": 50,
},
"learning_rate": {
"domain": tune.loguniform(lower=0.001, upper=1.0),
"init_value": 0.1,
},
}
return space

return MyAdaBoostClassification

@classmethod
def manual_hyper_parameters(cls) -> Dict:
"""Manual hyper-parameters specification."""
print(f"-*-* {cls.name} - Hyper-parameters Specification -*-*")
hyper_parameters = adaboost_manual_hyper_parameters()
clear_output()
return hyper_parameters

@dispatch()
def special_components(self, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
X_train=AdaBoostClassification.X_train,
name_column=AdaBoostClassification.name_train,
trained_model=self.model,
image_config=self.image_config,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
self._plot_tree(
trained_model=self.model.estimators_[0],
image_config=self.image_config,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)

@dispatch(bool)
def special_components(self, is_automl: bool, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by FLAML frameworks"""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
X_train=AdaBoostClassification.X_train,
name_column=AdaBoostClassification.name_train,
trained_model=self.auto_model,
image_config=self.image_config,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
self._plot_tree(
trained_model=self.auto_model.estimators_[0],
image_config=self.image_config,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
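The two `special_components` overloads above are selected by `multipledispatch`: the zero-argument form serves the scikit-learn path and the `bool` form the FLAML path. A minimal standalone sketch of that selection mechanism (the `Demo` class is hypothetical and not part of this PR):

from multipledispatch import dispatch

class Demo:
    @dispatch()
    def run(self) -> str:
        # Chosen when called with no arguments, mirroring special_components().
        return "scikit-learn path"

    @dispatch(bool)
    def run(self, is_automl: bool) -> str:
        # Chosen when called with a single bool, mirroring special_components(True).
        return "FLAML path" if is_automl else "scikit-learn path"

demo = Demo()
print(demo.run())      # scikit-learn path
print(demo.run(True))  # FLAML path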


class KNNClassification(ClassificationWorkflowBase):
"""The automation workflow of using KNN algorithm to make insightful products."""

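For context, FLAML consumes the class returned by `customization`, not an instance: it instantiates the learner afresh for each trial with hyperparameters sampled from `search_space`. A minimal sketch of the registration step the workflow performs internally, on synthetic data (`add_learner` is FLAML's public API for custom learners; `MyAdaBoostClassification` is assumed to be importable here):

from flaml import AutoML
from sklearn.datasets import make_classification

X_train, y_train = make_classification(n_samples=200, n_features=8, random_state=0)

automl = AutoML()
# Register the customized learner under the same name used in the `settings` property.
automl.add_learner(learner_name="AdaBoost", learner_class=MyAdaBoostClassification)
automl.fit(
    X_train,
    y_train,
    time_budget=10,  # seconds, mirroring the `settings` property
    metric="accuracy",
    estimator_list=["AdaBoost"],
    task="classification",
)
print(automl.best_config)  # the tuned n_estimators and learning_rate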
32 changes: 32 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_classification/_adaboost.py
@@ -0,0 +1,32 @@
from typing import Dict

from rich import print

from ....constants import SECTION
from ....data.data_readiness import float_input, num_input


def adaboost_manual_hyper_parameters() -> Dict:
"""
Manually set hyperparameters.
Returns
-------
hyper_parameters : dict
"""
print("N Estimators: The number of trees in the AdaBoost.")
print("Please specify the number of trees in the forest. A good starting range could be between 50 and 500, such as 100.")
n_estimators = num_input(SECTION[2], "@N Estimators: ")

print("Learning Rate: It controls the step-size in updating the weights. It shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators.")
print("Please specify the initial learning rate of Xgboost, such as 0.1.")
learning_rate = float_input(0.01, SECTION[2], "@Learning Rate: ")
print("Max Depth: The maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit.")
print("Please specify the maximum depth of a tree. A good starting value could be between 1 and 20, such as 3.")
max_depth = num_input(SECTION[2], "@Max Depth: ")

hyper_parameters = {
"n_estimators": n_estimators,
"learning_rate": learning_rate,
"max_depth": max_depth,
}
return hyper_parameters
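A hypothetical interactive run of the function above (the typed values are illustrative only):

hyper_parameters = adaboost_manual_hyper_parameters()
# The user enters 100, 0.1, and 3 at the three prompts, so the function returns:
# {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3}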
@@ -45,3 +45,7 @@ class ExtraTreesSpecialFunction(Enum):
class GradientBoostingSpecialFunction(Enum):
FEATURE_IMPORTANCE = "Feature Importance"
TREE_DIAGRAM = "Tree Diagram"


class AdaboostSpecialFunction(Enum):
FEATURE_IMPORTANCE = "Feature Importance Diagram"
TREE_DIAGRAM = "Single Tree Diagram"
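With the two-member form above, the `special_function` lookup in `classification.py` resolves to a flat list of labels (sketch):

special_function = [func.value for func in AdaboostSpecialFunction]
# -> ["Feature Importance Diagram", "Single Tree Diagram"]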
10 changes: 10 additions & 0 deletions geochemistrypi/data_mining/process/classify.py
@@ -6,6 +6,7 @@

from ..constants import MLFLOW_ARTIFACT_DATA_PATH
from ..model.classification import (
AdaBoostClassification,
ClassificationWorkflowBase,
DecisionTreeClassification,
ExtraTreesClassification,
@@ -139,6 +140,13 @@ def activate(
subsample=hyper_parameters["subsample"],
loss=hyper_parameters["loss"],
)
elif self.model_name == "AdaBoost":
hyper_parameters = AdaBoostClassification.manual_hyper_parameters()
self.clf_workflow = AdaBoostClassification(
n_estimators=hyper_parameters["n_estimators"],
learning_rate=hyper_parameters["learning_rate"],
max_depth=hyper_parameters["max_depth"],
)
elif self.model_name == "K-Nearest Neighbors":
hyper_parameters = KNNClassification.manual_hyper_parameters()
self.clf_workflow = KNNClassification(
@@ -240,6 +248,8 @@ def activate(
self.clf_workflow = ExtraTreesClassification()
elif self.model_name == "Gradient Boosting":
self.clf_workflow = GradientBoostingClassification()
elif self.model_name == "AdaBoost":
self.clf_workflow = AdaBoostClassification()
elif self.model_name == "K-Nearest Neighbors":
self.clf_workflow = KNNClassification()
elif self.model_name == "Stochastic Gradient Descent":
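Taken together, the manual branch above builds what amounts to a plain scikit-learn `AdaBoostClassifier` over a depth-limited decision tree. A self-contained sketch on synthetic data (assumes scikit-learn >= 1.2, where `base_estimator` became `estimator`):

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=8, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),  # the weak learner, as in the workflow
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))  # test-set accuracy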
