Skip to content

Commit

Permalink
add math_features and other featuretools
Browse files Browse the repository at this point in the history
  • Loading branch information
EdenWuyifan committed Jun 2, 2024
1 parent 8c3ae48 commit 96ce4a1
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 4 deletions.
5 changes: 4 additions & 1 deletion alpha_automl/data_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@


def profile_data(X):
metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False}
metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'catagorical_columns': []}
mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER', DATETIME_COLUMN: 'DATETIME_ENCODER',
TEXT_COLUMN: 'TEXT_ENCODER', IMAGE_COLUMN: 'IMAGE_ENCODER'}

Expand Down Expand Up @@ -43,6 +43,9 @@ def profile_data(X):
if 'missing_values_ratio' in profiled_column:
metadata['missing_values'] = True

metadata['numeric_columns'] = list(X.select_dtypes(include=['int64', 'float64']).columns)
metadata['catagorical_columns'] = list(X.select_dtypes(include=['object', 'category']).columns)

logger.debug(f'Results of profiling data: non-numeric features = {str(metadata["nonnumeric_columns"].keys())}, '
f'useless columns = {str(metadata["useless_columns"])}, '
f'missing values = {str(metadata["missing_values"])}')
Expand Down
16 changes: 16 additions & 0 deletions alpha_automl/pipeline_synthesis/pipeline_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from alpha_automl.utils import create_object, COLUMN_TRANSFORMER_ID, COLUMN_SELECTOR_ID, NATIVE_PRIMITIVE, \
ADDED_PRIMITIVE
from alpha_automl.primitive_loader import PRIMITIVE_TYPES
from feature_engine.creation import MathFeatures

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -37,6 +38,17 @@ def change_default_hyperparams(primitive_object):
primitive_object.set_params(algorithm='SAMME')


def create_math_features(primitive_type, columns):
    """Build a ``MathFeatures`` transformer for the requested aggregation.

    :param primitive_type: Name of the aggregation; the recognised names are
        ``"sum"``, ``"mean"``, ``"std"`` and ``"prod"``.
    :param columns: Forwarded unchanged as the ``variables`` argument of
        ``MathFeatures``.
    :return: A ``MathFeatures`` object whose ``func`` is the matching
        aggregation name, or ``None`` when ``primitive_type`` is not one of
        the recognised names.
    """
    # Checked in the same order as the recognised names are listed above.
    for func_name in ('sum', 'mean', 'std', 'prod'):
        if primitive_type == func_name:
            return MathFeatures(variables=columns, func=func_name)

    # Unrecognised aggregation: no transformer is created.
    return None


def extract_estimators(pipeline_primitives, all_primitives):
estimators = []
estimator_name, estimator_obj = pipeline_primitives.pop()
Expand Down Expand Up @@ -87,6 +99,7 @@ def make_primitive_objects(self, primitives):
transformers = []
nonnumeric_columns = self.metadata['nonnumeric_columns']
useless_columns = self.metadata['useless_columns']
numeric_columns = self.metadata['numeric_columns']

if len(useless_columns) > 0 and len(nonnumeric_columns) == 0: # Add the transformer to the first step
selector = (COLUMN_SELECTOR_ID, 'drop', [col_index for col_index, _ in useless_columns])
Expand All @@ -105,6 +118,9 @@ def make_primitive_objects(self, primitives):
elif primitive_type == 'CLASSIFICATION_MULTI_ENSEMBLER' or primitive_type == 'REGRESSION_MULTI_ENSEMBLER':
estimators = extract_estimators(pipeline_primitives, self.all_primitives)
primitive_object = create_object(primitive_name, {'estimators': estimators})
elif "feature_engine.creation" in primitive_name:
primitive_name, primitive_name_type = primitive_name.split('-')
primitive_object = create_math_features(primitive_name_type, numeric_columns)
elif self.all_primitives[primitive_name]['origin'] == NATIVE_PRIMITIVE: # It's an installed primitive
primitive_object = create_object(primitive_name, EXTRA_PARAMS.get(primitive_name, None))
else:
Expand Down
10 changes: 8 additions & 2 deletions alpha_automl/resource/primitives_hierarchy.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@
"alpha_automl.builtin_primitives.image_encoder.HogTransformer"
],
"FEATURE_GENERATOR": [
"sklearn.preprocessing.PolynomialFeatures"
"sklearn.preprocessing.PolynomialFeatures",
"feature_engine.creation.MathFeatures-sum",
"feature_engine.creation.MathFeatures-mean",
"feature_engine.creation.MathFeatures-prod",
"feature_engine.creation.MathFeatures-std"
],
"FEATURE_SCALER": [
"sklearn.preprocessing.MaxAbsScaler",
Expand All @@ -30,7 +34,9 @@
"FEATURE_SELECTOR": [
"sklearn.feature_selection.GenericUnivariateSelect",
"sklearn.feature_selection.SelectPercentile",
"sklearn.feature_selection.SelectKBest"
"sklearn.feature_selection.SelectKBest",
"feature_engine.selection.SmartCorrelatedSelection",
"feature_engine.selection.DropHighPSIFeatures"
],
"COLUMN_TRANSFORMER": [
"sklearn.compose.ColumnTransformer"
Expand Down
4 changes: 3 additions & 1 deletion tests/test_data_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ def test_profile_data():
(5, 'country'), (8, 'duration'), (9, 'listed_in'),
(10, 'description')],
'DATETIME_ENCODER': [(6, 'date_added')]},
'useless_columns': [], 'missing_values': True}
'useless_columns': [], 'missing_values': True,
'numeric_columns': ['show_id', 'release_year'],
'catagorical_columns': ['type', 'title', 'director', 'cast', 'country', 'date_added', 'duration', 'listed_in', 'description']}

assert actual_metadata == expected_metadata

0 comments on commit 96ce4a1

Please sign in to comment.