Merge pull request #82 from VIDA-NYU/streamlit
Add streamlit app
roquelopez authored Nov 9, 2023
2 parents 6e3c463 + 6c3155f commit 8af98ca
Showing 9 changed files with 414 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -59,3 +59,4 @@ lightning_logs
**/lightning_logs
.cache
**/smac3_output
/venv
50 changes: 32 additions & 18 deletions alpha_automl/automl_api.py
@@ -4,12 +4,13 @@
import pandas as pd
from multiprocessing import set_start_method
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_is_fitted
from alpha_automl.automl_manager import AutoMLManager
from alpha_automl.scorer import make_scorer, make_splitter, make_str_metric, get_sign_sorting
from alpha_automl.utils import make_d3m_pipelines, hide_logs, get_start_method, check_input_for_multiprocessing, \
    setup_output_folder, SemiSupervisedSplitter, SemiSupervisedLabelEncoder
from alpha_automl.visualization import plot_comparison_pipelines

from alpha_automl.pipeline_serializer import PipelineSerializer

logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
logger = logging.getLogger(__name__)
@@ -62,6 +63,7 @@ def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bo
        set_start_method(self._start_method, force=True)
        check_input_for_multiprocessing(self._start_method, self.scorer._score_func, 'metric')
        check_input_for_multiprocessing(self._start_method, self.splitter, 'split strategy')
        self.label_encoder = None

    def fit(self, X, y):
        """
@@ -115,7 +117,6 @@ def fit(self, X, y):
                )
            else:
                leaderboard_data.append([index, pipeline.get_summary(), pipeline.get_score()])


        self.leaderboard = pd.DataFrame(leaderboard_data, columns=['ranking', 'pipeline', self.metric])

@@ -283,6 +284,16 @@ def _score(self, X, y, pipeline_id):

        return {'metric': self.metric, 'score': score}

    def get_serialized_pipeline(self, pipeline_id=None):
        pipeline = self.get_pipeline(pipeline_id)
        try:
            check_is_fitted(pipeline)
        except:  # Not fitted yet, so fit it first.
            pipeline.fit(self.X, self.y)
        serialized_pipeline = PipelineSerializer(pipeline, self.label_encoder)

        return serialized_pipeline


class AutoMLClassifier(BaseAutoML):

@@ -306,22 +317,23 @@ def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdo
        :param verbose: Whether or not to show additional logs.
        """

        self.label_enconder = LabelEncoder()
        task = 'CLASSIFICATION'
        super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
                         split_strategy_kwargs, output_folder, start_mode, verbose)

        self.label_encoder = LabelEncoder()

    def fit(self, X, y):
        y = self.label_enconder.fit_transform(y)
        y = self.label_encoder.fit_transform(y)
        super().fit(X, y)

    def predict(self, X):
        predictions = super().predict(X)

        return self.label_enconder.inverse_transform(predictions)
        return self.label_encoder.inverse_transform(predictions)

    def score(self, X, y):
        y = self.label_enconder.transform(y)
        y = self.label_encoder.transform(y)

        return super().score(X, y)

@@ -331,10 +343,10 @@ def fit_pipeline(self, pipeline_id):
    def predict_pipeline(self, X, pipeline_id):
        predictions = super().predict_pipeline(X, pipeline_id)

        return self.label_enconder.inverse_transform(predictions)
        return self.label_encoder.inverse_transform(predictions)

    def score_pipeline(self, X, y, pipeline_id):
        y = self.label_enconder.transform(y)
        y = self.label_encoder.transform(y)

        return super().score_pipeline(X, y, pipeline_id)

@@ -365,7 +377,7 @@ def __init__(self, time_bound=15, metric='mean_absolute_error', split_strategy='
        super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
                         split_strategy_kwargs, output_folder, start_mode, verbose)


class AutoMLTimeSeries(BaseAutoML):
    def __init__(self, time_bound=15, metric='mean_squared_error', split_strategy='timeseries', time_bound_run=5,
                 score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None,
@@ -390,17 +402,18 @@ def __init__(self, time_bound=15, metric='mean_squared_error', split_strategy='t
        task = 'TIME_SERIES_FORECAST'
        self.date_column = date_column
        self.target_column = target_column

        super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
                         split_strategy_kwargs, output_folder, start_mode, verbose)

    def _column_parser(self, X):
        cols = list(X.columns.values)
        cols.remove(self.date_column)
        cols.remove(self.target_column)
        X = X[[self.date_column, self.target_column] + cols]
        y = X[[self.target_column]]
        return X, y

    def fit(self, X, y=None):
        X, y = self._column_parser(X)
        super().fit(X, y)
@@ -413,7 +426,7 @@ def __init__(self, time_bound=15, metric='f1_score', split_strategy='holdout', t
                 output_folder=None, start_mode='auto', verbose=logging.INFO):
        """
        Create/instantiate an AutoMLSemiSupervisedClassifier object.
        :param time_bound: Limit time in minutes to perform the search.
        :param metric: A str (see in the documentation the list of available metrics) or a callable object/function.
        :param split_strategy: Method to score the pipeline: `holdout`, `cross_validation` or an instance of
@@ -428,7 +441,7 @@ def __init__(self, time_bound=15, metric='f1_score', split_strategy='holdout', t
        :param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`.
        :param verbose: Whether or not to show additional logs.
        """
        self.label_enconder = SemiSupervisedLabelEncoder()

        task = 'SEMISUPERVISED'
        super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
                         split_strategy_kwargs, output_folder, start_mode, verbose)
@@ -437,18 +450,19 @@ def __init__(self, time_bound=15, metric='f1_score', split_strategy='holdout', t
            split_strategy_kwargs = {'test_size': 0.2}

        self.splitter = SemiSupervisedSplitter(**split_strategy_kwargs)
        self.label_encoder = SemiSupervisedLabelEncoder()

    def fit(self, X, y):
        y = self.label_enconder.fit_transform(y)
        y = self.label_encoder.fit_transform(y)
        super().fit(X, y)

    def predict(self, X):
        predictions = super().predict(X)

        return self.label_enconder.inverse_transform(predictions)
        return self.label_encoder.inverse_transform(predictions)

    def score(self, X, y):
        y = self.label_enconder.transform(y)
        y = self.label_encoder.transform(y)

        return super().score(X, y)

@@ -458,9 +472,9 @@ def fit_pipeline(self, pipeline_id):
    def predict_pipeline(self, X, pipeline_id):
        predictions = super().predict_pipeline(X, pipeline_id)

        return self.label_enconder.inverse_transform(predictions)
        return self.label_encoder.inverse_transform(predictions)

    def score_pipeline(self, X, y, pipeline_id):
        y = self.label_enconder.transform(y)
        y = self.label_encoder.transform(y)

        return super().score_pipeline(X, y, pipeline_id)
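Taken together, these API changes let a fitted AutoML object hand back a self-contained, reusable pipeline. Below is a minimal usage sketch, not part of this commit: it assumes `AutoMLClassifier` is importable from the `alpha_automl` package, and `train.csv` with a `label` column is a placeholder dataset.

```
import pandas as pd
from alpha_automl import AutoMLClassifier

# Hypothetical dataset; any tabular classification data works.
train = pd.read_csv('train.csv')
X_train, y_train = train.drop(columns=['label']), train['label']

automl = AutoMLClassifier(time_bound=15)  # search for pipelines for up to 15 minutes
automl.fit(X_train, y_train)

# Wrap the best pipeline together with its label encoder so it can be used
# on its own, e.g. by the Streamlit app.
pipeline = automl.get_serialized_pipeline()
print(pipeline.predict(X_train)[:5])
```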
24 changes: 24 additions & 0 deletions alpha_automl/pipeline_serializer.py
@@ -0,0 +1,24 @@
from sklearn.pipeline import Pipeline as Pipeline


class PipelineSerializer(Pipeline):

    def __init__(self, pipeline_sk, label_encoder=None):
        self.label_encoder = label_encoder
        super().__init__(pipeline_sk.steps)

    def fit(self, X, y):
        if self.label_encoder is not None:
            y = self.label_encoder.fit_transform(y)
        super().fit(X, y)

    def predict(self, X):
        predictions = super().predict(X)

        if self.label_encoder is not None:
            return self.label_encoder.inverse_transform(predictions)
        else:
            return predictions

    def predict_proba(self, X):
        return super().predict_proba(X)
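Because `PipelineSerializer` subclasses scikit-learn's `Pipeline`, the wrapped model can be persisted with standard tools and reloaded elsewhere, for example by the Streamlit app. The sketch below makes some assumptions: `pipeline` is a fitted object such as the one returned by `get_serialized_pipeline()`, and `model.pkl` / `new_data.csv` are placeholder file names.

```
import pickle

import pandas as pd

# Save the fitted, wrapped pipeline to disk.
with open('model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

# Later, reload it and predict on new, unlabeled data.
with open('model.pkl', 'rb') as f:
    loaded_pipeline = pickle.load(f)

new_data = pd.read_csv('new_data.csv')
print(loaded_pipeline.predict(new_data))
```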
3 changes: 3 additions & 0 deletions streamlit/.streamlit/config.toml
@@ -0,0 +1,3 @@
[server]
maxUploadSize = 2000

32 changes: 32 additions & 0 deletions streamlit/Home.py
@@ -0,0 +1,32 @@
import streamlit as st
from sklearn import set_config


set_config(display="diagram")  # valid options for sklearn's set_config display are 'text' and 'diagram'

st.set_page_config(
    page_title="Alpha-AutoML App",
)


with st.columns(3)[1]:
    st.image("https://github.com/VIDA-NYU/alpha-automl/raw/devel/Alpha-AutoML_logo.png")

st.markdown(
    "<p style='text-align: center;'>An extensible open-source AutoML system that supports multiple ML tasks</p>",
    unsafe_allow_html=True,
)

st.divider()

st.markdown(
    "<p style='text-align: justify;'>Alpha-AutoML leverages reinforcement learning and neural network components, "
    "and it relies on standard, open-source infrastructure to specify and run pipelines. It is compatible with "
    "state-of-the-art ML techniques: by using the Sklearn pipeline infrastructure, Alpha-AutoML is fully compatible "
    "with other standard libraries such as XGBoost, Hugging Face, Keras, and PyTorch. In addition, primitives can be "
    "added on the fly through the standard Sklearn fit/predict API, making it possible for Alpha-AutoML to leverage "
    "new developments in ML and keep up with the fast pace of the field.</p>",
    unsafe_allow_html=True,
)

st.divider()
36 changes: 36 additions & 0 deletions streamlit/README.md
@@ -0,0 +1,36 @@
# Alpha-AutoML Streamlit App

This directory contains a Streamlit app that implements model training and prediction using the `alpha-automl` system.

## Required dependencies

Make sure to install the following dependencies:
- alpha-automl[image]
- streamlit

We recommend creating a virtual environment to avoid dependency conflicts:
```
python -m venv venv
source venv/bin/activate
pip install alpha-automl[image]@"git+https://github.com/VIDA-NYU/alpha-automl@devel"
pip install streamlit
```

In the commands above, we install alpha-automl from the `devel` branch, including the optional `image` dependencies that are required to build models for image classification.

## Running the app

To run the app, execute the following command:

```
streamlit run Home.py
```

The command will print a message like this:
```
You can now view your Streamlit app in your browser.
Local URL: http://localhost:8501
Network URL: http://192.168.1.150:8501
```
Now you can open the app in your browser using one of the addresses shown above, e.g., http://localhost:8501.
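
For reference, here is a rough sketch of the training-and-prediction flow such an app can implement. The widget layout, target-column handling, and `alpha_automl` import are assumptions made for illustration, not code taken from this commit.

```
import pandas as pd
import streamlit as st
from alpha_automl import AutoMLClassifier

uploaded_file = st.file_uploader("Upload a CSV dataset", type="csv")
if uploaded_file is not None:
    data = pd.read_csv(uploaded_file)
    target_column = st.selectbox("Target column", data.columns)
    if st.button("Search pipelines"):
        automl = AutoMLClassifier(time_bound=15)
        automl.fit(data.drop(columns=[target_column]), data[target_column])
        st.dataframe(automl.leaderboard)  # ranked pipelines and their scores
```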
