Merge pull request #82 from VIDA-NYU/streamlit
Add streamlit app
roquelopez authored Nov 9, 2023
2 parents 6e3c463 + 6c3155f commit 8af98ca
Showing 9 changed files with 414 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -59,3 +59,4 @@ lightning_logs
**/lightning_logs
.cache
**/smac3_output
/venv
50 changes: 32 additions & 18 deletions alpha_automl/automl_api.py
@@ -4,12 +4,13 @@
import pandas as pd
from multiprocessing import set_start_method
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_is_fitted
from alpha_automl.automl_manager import AutoMLManager
from alpha_automl.scorer import make_scorer, make_splitter, make_str_metric, get_sign_sorting
from alpha_automl.utils import make_d3m_pipelines, hide_logs, get_start_method, check_input_for_multiprocessing, \
    setup_output_folder, SemiSupervisedSplitter, SemiSupervisedLabelEncoder
from alpha_automl.visualization import plot_comparison_pipelines

from alpha_automl.pipeline_serializer import PipelineSerializer

logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
logger = logging.getLogger(__name__)
@@ -62,6 +63,7 @@ def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bo
        set_start_method(self._start_method, force=True)
        check_input_for_multiprocessing(self._start_method, self.scorer._score_func, 'metric')
        check_input_for_multiprocessing(self._start_method, self.splitter, 'split strategy')
        self.label_encoder = None

    def fit(self, X, y):
        """
@@ -115,7 +117,6 @@ def fit(self, X, y):
                )
            else:
                leaderboard_data.append([index, pipeline.get_summary(), pipeline.get_score()])


        self.leaderboard = pd.DataFrame(leaderboard_data, columns=['ranking', 'pipeline', self.metric])

@@ -283,6 +284,16 @@ def _score(self, X, y, pipeline_id):

        return {'metric': self.metric, 'score': score}

    def get_serialized_pipeline(self, pipeline_id=None):
        pipeline = self.get_pipeline(pipeline_id)
        try:
            check_is_fitted(pipeline)
        except:  # Not fitted yet, so fit it first.
            pipeline.fit(self.X, self.y)
        serialized_pipeline = PipelineSerializer(pipeline, self.label_encoder)

        return serialized_pipeline


class AutoMLClassifier(BaseAutoML):

@@ -306,22 +317,23 @@ def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdo
        :param verbose: Whether or not to show additional logs.
        """

        self.label_enconder = LabelEncoder()
        task = 'CLASSIFICATION'
        super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
                         split_strategy_kwargs, output_folder, start_mode, verbose)

        self.label_encoder = LabelEncoder()

    def fit(self, X, y):
        y = self.label_enconder.fit_transform(y)
        y = self.label_encoder.fit_transform(y)
        super().fit(X, y)

    def predict(self, X):
        predictions = super().predict(X)

        return self.label_enconder.inverse_transform(predictions)
        return self.label_encoder.inverse_transform(predictions)

    def score(self, X, y):
        y = self.label_enconder.transform(y)
        y = self.label_encoder.transform(y)

        return super().score(X, y)

@@ -331,10 +343,10 @@ def fit_pipeline(self, pipeline_id):
    def predict_pipeline(self, X, pipeline_id):
        predictions = super().predict_pipeline(X, pipeline_id)

        return self.label_enconder.inverse_transform(predictions)
        return self.label_encoder.inverse_transform(predictions)

    def score_pipeline(self, X, y, pipeline_id):
        y = self.label_enconder.transform(y)
        y = self.label_encoder.transform(y)

        return super().score_pipeline(X, y, pipeline_id)

@@ -365,7 +377,7 @@ def __init__(self, time_bound=15, metric='mean_absolute_error', split_strategy='
        super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
                         split_strategy_kwargs, output_folder, start_mode, verbose)


class AutoMLTimeSeries(BaseAutoML):
    def __init__(self, time_bound=15, metric='mean_squared_error', split_strategy='timeseries', time_bound_run=5,
                 score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None,
@@ -390,17 +402,18 @@ def __init__(self, time_bound=15, metric='mean_squared_error', split_strategy='t
        task = 'TIME_SERIES_FORECAST'
        self.date_column = date_column
        self.target_column = target_column

        super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
                         split_strategy_kwargs, output_folder, start_mode, verbose)

    def _column_parser(self, X):
        cols = list(X.columns.values)
        cols.remove(self.date_column)
        cols.remove(self.target_column)
        X = X[[self.date_column, self.target_column] + cols]
        y = X[[self.target_column]]
        return X, y

    def fit(self, X, y=None):
        X, y = self._column_parser(X)
        super().fit(X, y)
@@ -413,7 +426,7 @@ def __init__(self, time_bound=15, metric='f1_score', split_strategy='holdout', t
                 output_folder=None, start_mode='auto', verbose=logging.INFO):
        """
        Create/instantiate an AutoMLSemiSupervisedClassifier object.
        :param time_bound: Limit time in minutes to perform the search.
        :param metric: A str (see in the documentation the list of available metrics) or a callable object/function.
        :param split_strategy: Method to score the pipeline: `holdout`, `cross_validation` or an instance of
@@ -428,7 +441,7 @@ def __init__(self, time_bound=15, metric='f1_score', split_strategy='holdout', t
        :param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`.
        :param verbose: Whether or not to show additional logs.
        """
        self.label_enconder = SemiSupervisedLabelEncoder()

        task = 'SEMISUPERVISED'
        super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
                         split_strategy_kwargs, output_folder, start_mode, verbose)
@@ -437,18 +450,19 @@ def __init__(self, time_bound=15, metric='f1_score', split_strategy='holdout', t
            split_strategy_kwargs = {'test_size': 0.2}

        self.splitter = SemiSupervisedSplitter(**split_strategy_kwargs)
        self.label_encoder = SemiSupervisedLabelEncoder()

    def fit(self, X, y):
        y = self.label_enconder.fit_transform(y)
        y = self.label_encoder.fit_transform(y)
        super().fit(X, y)

    def predict(self, X):
        predictions = super().predict(X)

        return self.label_enconder.inverse_transform(predictions)
        return self.label_encoder.inverse_transform(predictions)

    def score(self, X, y):
        y = self.label_enconder.transform(y)
        y = self.label_encoder.transform(y)

        return super().score(X, y)

@@ -458,9 +472,9 @@ def fit_pipeline(self, pipeline_id):
    def predict_pipeline(self, X, pipeline_id):
        predictions = super().predict_pipeline(X, pipeline_id)

        return self.label_enconder.inverse_transform(predictions)
        return self.label_encoder.inverse_transform(predictions)

    def score_pipeline(self, X, y, pipeline_id):
        y = self.label_enconder.transform(y)
        y = self.label_encoder.transform(y)

        return super().score_pipeline(X, y, pipeline_id)
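Taken together, these API changes let a fitted AutoML object hand back a self-contained, reusable pipeline. Below is a minimal usage sketch, not part of this commit: it assumes `AutoMLClassifier` is importable from the `alpha_automl` package, and `train.csv` with a `label` column is a placeholder dataset.

```
import pandas as pd
from alpha_automl import AutoMLClassifier

# Hypothetical dataset; any tabular classification data works.
train = pd.read_csv('train.csv')
X_train, y_train = train.drop(columns=['label']), train['label']

automl = AutoMLClassifier(time_bound=15)  # search for pipelines for up to 15 minutes
automl.fit(X_train, y_train)

# Wrap the best pipeline together with its label encoder so it can be used
# on its own, e.g. by the Streamlit app.
pipeline = automl.get_serialized_pipeline()
print(pipeline.predict(X_train)[:5])
```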
24 changes: 24 additions & 0 deletions alpha_automl/pipeline_serializer.py
@@ -0,0 +1,24 @@
from sklearn.pipeline import Pipeline as Pipeline


class PipelineSerializer(Pipeline):

    def __init__(self, pipeline_sk, label_encoder=None):
        self.label_encoder = label_encoder
        super().__init__(pipeline_sk.steps)

    def fit(self, X, y):
        if self.label_encoder is not None:
            y = self.label_encoder.fit_transform(y)
        super().fit(X, y)

    def predict(self, X):
        predictions = super().predict(X)

        if self.label_encoder is not None:
            return self.label_encoder.inverse_transform(predictions)
        else:
            return predictions

    def predict_proba(self, X):
        return super().predict_proba(X)
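Because `PipelineSerializer` subclasses scikit-learn's `Pipeline`, the wrapped model can be persisted with standard tools and reloaded elsewhere, for example by the Streamlit app. The sketch below makes some assumptions: `pipeline` is a fitted object such as the one returned by `get_serialized_pipeline()`, and `model.pkl` / `new_data.csv` are placeholder file names.

```
import pickle

import pandas as pd

# Save the fitted, wrapped pipeline to disk.
with open('model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

# Later, reload it and predict on new, unlabeled data.
with open('model.pkl', 'rb') as f:
    loaded_pipeline = pickle.load(f)

new_data = pd.read_csv('new_data.csv')
print(loaded_pipeline.predict(new_data))
```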
3 changes: 3 additions & 0 deletions streamlit/.streamlit/config.toml
@@ -0,0 +1,3 @@
[server]
maxUploadSize = 2000

32 changes: 32 additions & 0 deletions streamlit/Home.py
@@ -0,0 +1,32 @@
import streamlit as st
from sklearn import set_config


set_config(display="diagram")  # valid options for sklearn's set_config display are 'text' and 'diagram'

st.set_page_config(
    page_title="Alpha-AutoML App",
)


with st.columns(3)[1]:
    st.image("https://github.com/VIDA-NYU/alpha-automl/raw/devel/Alpha-AutoML_logo.png")

st.markdown(
    "<p style='text-align: center;'>An extensible open-source AutoML system that supports multiple ML tasks</p>",
    unsafe_allow_html=True,
)

st.divider()

st.markdown(
    "<p style='text-align: justify;'>Alpha-AutoML leverages reinforcement learning and neural network components, "
    "and it relies on standard, open-source infrastructure to specify and run pipelines. It is compatible with "
    "state-of-the-art ML techniques: by using the Sklearn pipeline infrastructure, Alpha-AutoML is fully compatible "
    "with other standard libraries such as XGBoost, Hugging Face, Keras, and PyTorch. In addition, primitives can be "
    "added on the fly through the standard Sklearn fit/predict API, making it possible for Alpha-AutoML to leverage "
    "new developments in ML and keep up with the fast pace of the field.</p>",
    unsafe_allow_html=True,
)

st.divider()
36 changes: 36 additions & 0 deletions streamlit/README.md
@@ -0,0 +1,36 @@
# Alpha-AutoML Streamlit App

This directory contains a Streamlit app that implements model training and prediction using the `alpha-automl` system.

## Required dependencies

Make sure to install the following dependencies:
- alpha-automl[image]
- streamlit

We recommend creating a virtual environment to avoid dependency conflicts:
```
python -m venv venv
source venv/bin/activate
pip install alpha-automl[image]@"git+https://github.com/VIDA-NYU/alpha-automl@devel"
pip install streamlit
```

In the commands above, we install alpha-automl from the `devel` branch, including the optional `image` dependencies that are required to build models for image classification.

## Running the app

To run the app, execute the following command:

```
streamlit run Home.py
```

The command will print a message like this:
```
You can now view your Streamlit app in your browser.
Local URL: http://localhost:8501
Network URL: http://192.168.1.150:8501
```
Now you can open the app in your browser using one of the addresses shown above, e.g., http://localhost:8501.
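
For reference, here is a rough sketch of the training-and-prediction flow such an app can implement. The widget layout, target-column handling, and `alpha_automl` import are assumptions made for illustration, not code taken from this commit.

```
import pandas as pd
import streamlit as st
from alpha_automl import AutoMLClassifier

uploaded_file = st.file_uploader("Upload a CSV dataset", type="csv")
if uploaded_file is not None:
    data = pd.read_csv(uploaded_file)
    target_column = st.selectbox("Target column", data.columns)
    if st.button("Search pipelines"):
        automl = AutoMLClassifier(time_bound=15)
        automl.fit(data.drop(columns=[target_column]), data[target_column])
        st.dataframe(automl.leaderboard)  # ranked pipelines and their scores
```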
