diff --git a/alpha_automl/automl_manager.py b/alpha_automl/automl_manager.py index b3fe9bb..c950c29 100644 --- a/alpha_automl/automl_manager.py +++ b/alpha_automl/automl_manager.py @@ -52,8 +52,8 @@ def search_pipelines(self, X, y, scoring, splitting_strategy, automl_hyperparams def _search_pipelines(self, automl_hyperparams): search_start_time = time.time() automl_hyperparams = self.check_automl_hyperparams(automl_hyperparams) + metadata = profile_data(self.X) X, y, is_sample = sample_dataset(self.X, self.y, SAMPLE_SIZE, self.task) - metadata = profile_data(X) internal_splitting_strategy = make_splitter(SPLITTING_STRATEGY) self.found_pipelines = 0 need_rescoring = True diff --git a/alpha_automl/data_profiler.py b/alpha_automl/data_profiler.py index 5cf50ac..61825d5 100644 --- a/alpha_automl/data_profiler.py +++ b/alpha_automl/data_profiler.py @@ -12,11 +12,11 @@ def profile_data(X): - metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'categorical_columns': []} + metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': X.isnull().values.any(), 'numeric_columns': [], 'categorical_columns': []} mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER', DATETIME_COLUMN: 'DATETIME_ENCODER', TEXT_COLUMN: 'TEXT_ENCODER', IMAGE_COLUMN: 'IMAGE_ENCODER'} - profiled_data = datamart_profiler.process_dataset(X, coverage=False, indexes=False) + profiled_data = datamart_profiler.process_dataset(X.sample(n=100, replace=True, random_state=1), coverage=False, indexes=False) for index_column, profiled_column in enumerate(profiled_data['columns']): column_name = profiled_column['name'] @@ -33,16 +33,13 @@ def profile_data(X): add_nonnumeric_column(column_type, metadata, index_column, column_name) elif TEXT_COLUMN == profiled_column['structural_type']: - samples = X[column_name].dropna().sample(5) + samples = X[column_name].dropna().sample(5).astype('string') if samples.apply(lambda x: x.endswith(('jpg', 'png', 'jpeg', 'gif'))).all(): column_type = mapping_encoders[IMAGE_COLUMN] else: column_type = mapping_encoders[TEXT_COLUMN] add_nonnumeric_column(column_type, metadata, index_column, column_name) - if 'missing_values_ratio' in profiled_column: - metadata['missing_values'] = True - metadata['numeric_columns'] = list(X.select_dtypes(include=['int64', 'float64']).columns) metadata['categorical_columns'] = list(X.select_dtypes(include=['object', 'category']).columns)