From c038fbca074740a2dd3ed29b190f0da941c397b2 Mon Sep 17 00:00:00 2001 From: Daniel Grindrod Date: Wed, 18 Dec 2024 02:06:58 +0000 Subject: [PATCH] fix: KeyError no longer occurs when using groupfolds for regression tasks. (#1385) * fix: Now resetting indexes for regression datasets when using group folds * refactor: Simplified if statement to include all fold types * docs: Updated docs to make it clear that group folds can be used for regression tasks --------- Co-authored-by: Daniel Grindrod Co-authored-by: Li Jiang --- flaml/automl/automl.py | 6 +++--- flaml/automl/task/generic_task.py | 4 ++-- flaml/automl/task/task.py | 2 +- test/automl/test_split.py | 36 ++++++++++++++++++++++++++++--- 4 files changed, 39 insertions(+), 9 deletions(-) diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index 115f9748d0..d2d9f98956 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -203,7 +203,7 @@ def custom_metric( * Valid str options depend on different tasks. For classification tasks, valid choices are ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified. - For regression tasks, valid choices are ["auto", 'uniform', 'time']. + For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group']. "auto" -> uniform. For time series forecast tasks, must be "auto" or 'time'. For ranking task, must be "auto" or 'group'. @@ -739,7 +739,7 @@ def retrain_from_log( * Valid str options depend on different tasks. For classification tasks, valid choices are ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified. - For regression tasks, valid choices are ["auto", 'uniform', 'time']. + For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group']. "auto" -> uniform. For time series forecast tasks, must be "auto" or 'time'. For ranking task, must be "auto" or 'group'. @@ -1358,7 +1358,7 @@ def custom_metric( * Valid str options depend on different tasks. For classification tasks, valid choices are ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified. - For regression tasks, valid choices are ["auto", 'uniform', 'time']. + For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group']. "auto" -> uniform. For time series forecast tasks, must be "auto" or 'time'. For ranking task, must be "auto" or 'group'. diff --git a/flaml/automl/task/generic_task.py b/flaml/automl/task/generic_task.py index c9273cb6eb..12a2792693 100644 --- a/flaml/automl/task/generic_task.py +++ b/flaml/automl/task/generic_task.py @@ -442,8 +442,8 @@ def prepare_data( X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED) if data_is_df: X_train_all.reset_index(drop=True, inplace=True) - if isinstance(y_train_all, pd.Series): - y_train_all.reset_index(drop=True, inplace=True) + if isinstance(y_train_all, pd.Series): + y_train_all.reset_index(drop=True, inplace=True) X_train, y_train = X_train_all, y_train_all state.groups_all = state.groups diff --git a/flaml/automl/task/task.py b/flaml/automl/task/task.py index 6b321c8b25..540f13fe81 100644 --- a/flaml/automl/task/task.py +++ b/flaml/automl/task/task.py @@ -192,7 +192,7 @@ def prepare_data( * Valid str options depend on different tasks. For classification tasks, valid choices are ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified. - For regression tasks, valid choices are ["auto", 'uniform', 'time']. + For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group']. "auto" -> uniform. For time series forecast tasks, must be "auto" or 'time'. For ranking task, must be "auto" or 'group'. diff --git a/test/automl/test_split.py b/test/automl/test_split.py index 6c8e678b27..2e7d2e1970 100644 --- a/test/automl/test_split.py +++ b/test/automl/test_split.py @@ -1,4 +1,5 @@ -from sklearn.datasets import fetch_openml +import numpy as np +from sklearn.datasets import fetch_openml, load_iris from sklearn.metrics import accuracy_score from sklearn.model_selection import GroupKFold, KFold, train_test_split @@ -48,7 +49,7 @@ def test_time(): _test(split_type="time") -def test_groups(): +def test_groups_for_classification_task(): from sklearn.externals._arff import ArffException try: @@ -88,6 +89,35 @@ def test_groups(): automl.fit(X, y, **automl_settings) +def test_groups_for_regression_task(): + """Append nonsensical groups to iris dataset and use it to test that GroupKFold works for regression tasks""" + iris_dict_data = load_iris(as_frame=True) # numpy arrays + iris_data = iris_dict_data["frame"] # pandas dataframe data + target + + rng = np.random.default_rng(42) + iris_data["cluster"] = rng.integers( + low=0, high=5, size=iris_data.shape[0] + ) # np.random.randint(0, 5, iris_data.shape[0]) + + automl = AutoML() + X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy() + y = iris_data["petal width (cm)"] + X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split( + X, y, iris_data["cluster"], random_state=42 + ) + automl_settings = { + "max_iter": 5, + "time_budget": -1, + "metric": "r2", + "task": "regression", + "estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"], + "eval_method": "cv", + "split_type": "uniform", + "groups": groups_train, + } + automl.fit(X_train, y_train, **automl_settings) + + def test_stratified_groupkfold(): from minio.error import ServerError from sklearn.model_selection import StratifiedGroupKFold @@ -204,4 +234,4 @@ def get_n_splits(self, X=None, y=None, groups=None): if __name__ == "__main__": - test_groups() + test_groups_for_classification_task()