Skip to content

Commit

Permalink
Bug fix in dataset profiling
Browse files: browse the repository at this point in the history
  • Loading branch information
EdenWuyifan committed Jul 19, 2024
1 parent b8c07ab commit fb7e334
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 7 deletions.
2 changes: 1 addition & 1 deletion alpha_automl/automl_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def search_pipelines(self, X, y, scoring, splitting_strategy, automl_hyperparams
def _search_pipelines(self, automl_hyperparams):
search_start_time = time.time()
automl_hyperparams = self.check_automl_hyperparams(automl_hyperparams)
metadata = profile_data(self.X)
X, y, is_sample = sample_dataset(self.X, self.y, SAMPLE_SIZE, self.task)
metadata = profile_data(X)
internal_splitting_strategy = make_splitter(SPLITTING_STRATEGY)
self.found_pipelines = 0
need_rescoring = True
Expand Down
9 changes: 3 additions & 6 deletions alpha_automl/data_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@


def profile_data(X):
metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'categorical_columns': []}
metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': X.isnull().values.any(), 'numeric_columns': [], 'categorical_columns': []}
mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER', DATETIME_COLUMN: 'DATETIME_ENCODER',
TEXT_COLUMN: 'TEXT_ENCODER', IMAGE_COLUMN: 'IMAGE_ENCODER'}

profiled_data = datamart_profiler.process_dataset(X, coverage=False, indexes=False)
profiled_data = datamart_profiler.process_dataset(X.sample(n=100, replace=True, random_state=1), coverage=False, indexes=False)

for index_column, profiled_column in enumerate(profiled_data['columns']):
column_name = profiled_column['name']
Expand All @@ -33,16 +33,13 @@ def profile_data(X):
add_nonnumeric_column(column_type, metadata, index_column, column_name)

elif TEXT_COLUMN == profiled_column['structural_type']:
samples = X[column_name].dropna().sample(5)
samples = X[column_name].dropna().sample(5).astype('string')
if samples.apply(lambda x: x.endswith(('jpg', 'png', 'jpeg', 'gif'))).all():
column_type = mapping_encoders[IMAGE_COLUMN]
else:
column_type = mapping_encoders[TEXT_COLUMN]
add_nonnumeric_column(column_type, metadata, index_column, column_name)

if 'missing_values_ratio' in profiled_column:
metadata['missing_values'] = True

metadata['numeric_columns'] = list(X.select_dtypes(include=['int64', 'float64']).columns)
metadata['categorical_columns'] = list(X.select_dtypes(include=['object', 'category']).columns)

Expand Down

0 comments on commit fb7e334

Please sign in to comment.