Bugfix in dataset profiling

VIDA-NYU · Jul 19, 2024 · fb7e334 · fb7e334
1 parent b8c07ab
commit fb7e334
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 7 deletions.
diff --git a/alpha_automl/automl_manager.py b/alpha_automl/automl_manager.py
@@ -52,8 +52,8 @@ def search_pipelines(self, X, y, scoring, splitting_strategy, automl_hyperparams
     def _search_pipelines(self, automl_hyperparams):
         search_start_time = time.time()
         automl_hyperparams = self.check_automl_hyperparams(automl_hyperparams)
+        metadata = profile_data(self.X)
         X, y, is_sample = sample_dataset(self.X, self.y, SAMPLE_SIZE, self.task)
-        metadata = profile_data(X)
         internal_splitting_strategy = make_splitter(SPLITTING_STRATEGY)
         self.found_pipelines = 0
         need_rescoring = True

diff --git a/alpha_automl/data_profiler.py b/alpha_automl/data_profiler.py
@@ -12,11 +12,11 @@
 
 
 def profile_data(X):
-    metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'categorical_columns': []}
+    metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': X.isnull().values.any(), 'numeric_columns': [], 'categorical_columns': []}
     mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER', DATETIME_COLUMN: 'DATETIME_ENCODER',
                         TEXT_COLUMN: 'TEXT_ENCODER', IMAGE_COLUMN: 'IMAGE_ENCODER'}
 
-    profiled_data = datamart_profiler.process_dataset(X, coverage=False, indexes=False)
+    profiled_data = datamart_profiler.process_dataset(X.sample(n=100, replace=True, random_state=1), coverage=False, indexes=False)
 
     for index_column, profiled_column in enumerate(profiled_data['columns']):
         column_name = profiled_column['name']
@@ -33,16 +33,13 @@ def profile_data(X):
             add_nonnumeric_column(column_type, metadata, index_column, column_name)
 
         elif TEXT_COLUMN == profiled_column['structural_type']:
-            samples = X[column_name].dropna().sample(5)
+            samples = X[column_name].dropna().sample(5).astype('string')
             if samples.apply(lambda x: x.endswith(('jpg', 'png', 'jpeg', 'gif'))).all():
                 column_type = mapping_encoders[IMAGE_COLUMN]
             else:
                 column_type = mapping_encoders[TEXT_COLUMN]
             add_nonnumeric_column(column_type, metadata, index_column, column_name)
 
-        if 'missing_values_ratio' in profiled_column:
-            metadata['missing_values'] = True
-
     metadata['numeric_columns'] = list(X.select_dtypes(include=['int64', 'float64']).columns)
     metadata['categorical_columns'] = list(X.select_dtypes(include=['object', 'category']).columns)