diff --git a/.github/workflows/interface_auto.yml b/.github/workflows/interface_auto.yml
new file mode 100644
index 00000000..9abacad4
--- /dev/null
+++ b/.github/workflows/interface_auto.yml
@@ -0,0 +1,90 @@
+name: Deployment on DockerHub and Hugging Face
+
+# Trigger the workflow on push and pull request events to the 'main' or 'master' branch
+on:
+  push:
+    branches:
+      - "master"
+      - "main"
+  pull_request:
+    branches:
+      - "master"
+      - "main"
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  build-and-deploy:
+    # Run the job on the latest Ubuntu runner
+    runs-on: ubuntu-latest
+    name: Push Docker image to Docker Hub and Hugging Face
+    permissions:
+      packages: write
+      contents: read
+      attestations: write
+      id-token: write
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          lfs: true
+
+      - name: Set up Git LFS
+        run: |
+          git lfs install
+          git lfs pull
+
+      - name: Remove binary files from git history
+        run: |
+          git filter-branch --force --index-filter \
+            "git rm --cached --ignore-unmatch DiverseSelector/test/test2/BBB_SECFP6_1024.xlsx DiverseSelector/test/test2/BBB_SECFP6_2048.xlsx" \
+            --prune-empty --tag-name-filter cat -- --all
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      # - name: Extract metadata (tags, labels) for Docker
+      #   id: meta
+      #   uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+      #   with:
+      #     images: my-docker-hub-namespace/my-docker-hub-repository
+
+      - name: Build and push Docker image
+        id: push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          # tags: ${{ steps.meta.outputs.tags }}
+          # labels: ${{ steps.meta.outputs.labels }}
+
+      # - name: Generate artifact attestation
+      #   uses: actions/attest-build-provenance@v1
+      #   with:
+      #     subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+      #     subject-digest: ${{ steps.push.outputs.digest }}
+      #     push-to-registry: true
+
+      # Step 8: Replace the README.md file for Hugging Face
+      - name: Replace README for Hugging Face
+        run: |
+          mv README_hf.md README.md
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "qcdevs@gmail.com"
+          git add README.md
+          git commit -m "Replace README.md with README_hf.md for Hugging Face"
+
+      # Step 9: Push the app to Hugging Face
+      - name: Push to Hugging Face
+        env:
+          APP: ${{ secrets.APP }}
+        run: |
+          git push https://QCDevs:$APP@huggingface.co/spaces/QCDevs/selector HEAD:main --force
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..b8567508
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+# Use the official image as a parent image
+FROM python:3.11-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Install system dependencies required for building packages
+RUN apt-get update && \
+    apt-get install -y build-essential && \
+    apt-get clean
+
+# Copy the requirements file into the container
+COPY requirements.txt requirements.txt
+COPY requirements_dev.txt requirements_dev.txt
+
+# Upgrade pip, setuptools, and wheel
+RUN pip install --upgrade pip setuptools wheel
+
+# Install the dependencies using --use-pep517
+RUN pip install --use-pep517 --no-cache-dir -r requirements.txt
+RUN pip install --use-pep517 --no-cache-dir -r requirements_dev.txt
+RUN pip install --use-pep517 --no-cache-dir streamlit
+
+# Copy the rest of the application code
+COPY . .
+
+# Install the Selector package using PEP 517 standards-based tools
+RUN pip install --use-pep517 .
+
+# Expose the port the app runs on
+EXPOSE 8501
+
+# Command to run the app
+CMD ["streamlit", "run", "streamlit_app/app.py", "--server.enableXsrfProtection=false"]
diff --git a/README_hf.md b/README_hf.md
new file mode 100644
index 00000000..ca4c1027
--- /dev/null
+++ b/README_hf.md
@@ -0,0 +1,8 @@
+---
+title: QC-Selector
+emoji: 🐳
+colorFrom: purple
+colorTo: gray
+sdk: docker
+app_port: 8501
+---
\ No newline at end of file
diff --git a/streamlit_app/app.py b/streamlit_app/app.py
new file mode 100644
index 00000000..dc830c93
--- /dev/null
+++ b/streamlit_app/app.py
@@ -0,0 +1,99 @@
+# The Selector library provides a set of tools for selecting a
+# subset of the dataset and computing diversity.
+#
+# Copyright (C) 2023 The QC-Devs Community
+#
+# This file is part of Selector.
+#
+# Selector is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 3
+# of the License, or (at your option) any later version.
+#
+# Selector is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>
+#
+# --
+
+import streamlit as st
+import os
+
+
+# Get the current directory path
+current_dir = os.path.dirname(os.path.abspath(__file__))
+
+# Construct the path to the assets directory
+assets_dir = os.path.join(current_dir, "assets")
+
+# Set page configuration
+st.set_page_config(
+    page_title="QC-Selector",
+    page_icon=os.path.join(assets_dir, "QC-Devs.png"),
+)
+
+st.image(os.path.join(assets_dir, "selector_logo.png"))
+
+st.write("# Welcome to QC-Selector! 👋")
+
+st.sidebar.success("Select an algorithm to get started.")
+
+st.info("👈 Select an algorithm from the sidebar to see some examples of what QC-Selector can do!")
+
+st.markdown(
+    """
+    [selector](https://github.com/theochem/Selector) is a free, open-source, and cross-platform
+    Python library designed to help you effortlessly identify the most diverse subset of molecules
+    from your dataset.
+    Please use the following citation in any publication using the selector library:
+
+    **“Selector: A Generic Python Package for Subset Selection”**, Fanwang Meng, Alireza Tehrani,
+    Valerii Chuiko, Abigail Broscius, Abdul Hassan, Maximilian van Zyl, Marco Martínez González,
+    Yang, Ramón Alain Miranda-Quintana, Paul W. Ayers, and Farnaz Heidar-Zadeh.
+
+    The selector source code is hosted on [GitHub](https://github.com/theochem/Selector)
+    and is released under the [GNU General Public License v3.0](https://github.com/theochem/Selector/blob/main/LICENSE).
+    We welcome any contributions to the selector library in accordance with our Code of Conduct;
+    please see our [Contributing Guidelines](https://qcdevs.org/guidelines/qcdevs_code_of_conduct/).
+    Please report any issues you encounter while using
+    the selector library on [GitHub Issues](https://github.com/theochem/Selector/issues).
+    For further information and inquiries please contact us at qcdevs@gmail.com.
+
+    ### Why QC-Selector?
+    In the world of chemistry, selecting the right subset of molecules is critical for a wide
+    range of applications, including drug discovery, materials science, and molecular optimization.
+    QC-Selector offers a cutting-edge solution to streamline this process, empowering researchers,
+    scientists, and developers to make smarter decisions faster.
+
+    ### Key Features
+    1. Import Your Dataset: Simply import your molecule dataset in various file formats, including SDF, SMILES, and InChI, to get started.
+
+    2. Define Selection Criteria: Specify the desired level of diversity and other relevant parameters to tailor the subset selection to your unique requirements.
+
+    3. Run the Analysis: Let QC-Selector’s powerful algorithms process your dataset and efficiently select the most diverse molecules.
+
+    4. Export: Explore the diverse subset and export the results for further analysis and integration into your projects.
+"""
+)
+
+st.sidebar.title("About QC-Devs")
+
+st.sidebar.info("QC-Devs develops various free, open-source, and cross-platform libraries for scientific computing, especially theoretical and computational chemistry. Our goal is to make programming accessible to chemists and promote the principles of sustainable software development. For further information and inquiries please contact us at qcdevs@gmail.com.")
+
+# Add icons to the sidebar
+st.sidebar.markdown(
+    """
+
+    <div style="text-align: center;">
+    <a href="https://qcdevs.org/" target="_blank">WEBSITE</a><br>
+    <a href="mailto:qcdevs@gmail.com">EMAIL</a><br>
+    <a href="https://github.com/theochem/Selector" target="_blank">GITHUB</a><br>
+    © 2024 QC-Devs. All rights reserved.
+    </div>
+    """,
+    unsafe_allow_html=True
+)
diff --git a/streamlit_app/assets/QC-Devs.png b/streamlit_app/assets/QC-Devs.png
new file mode 100644
index 00000000..b3c61189
Binary files /dev/null and b/streamlit_app/assets/QC-Devs.png differ
diff --git a/streamlit_app/assets/selector_logo.png b/streamlit_app/assets/selector_logo.png
new file mode 100644
index 00000000..bbc79cc7
Binary files /dev/null and b/streamlit_app/assets/selector_logo.png differ
diff --git a/streamlit_app/pages/page_maxmin.py b/streamlit_app/pages/page_maxmin.py
new file mode 100644
index 00000000..24a33e72
--- /dev/null
+++ b/streamlit_app/pages/page_maxmin.py
@@ -0,0 +1,187 @@
+# The Selector library provides a set of tools for selecting a
+# subset of the dataset and computing diversity.
+#
+# Copyright (C) 2023 The QC-Devs Community
+#
+# This file is part of Selector.
+#
+# Selector is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 3
+# of the License, or (at your option) any later version.
+#
+# Selector is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>
+#
+# --
+
+import streamlit as st
+import numpy as np
+import pandas as pd
+import json
+import os
+
+from sklearn.metrics import pairwise_distances
+from selector.methods.distance import MaxMin
+
+# Get the current directory path
+current_dir = os.path.dirname(os.path.abspath(__file__))
+
+# Construct the path to the assets directory
+assets_dir = os.path.join(current_dir, "..", "assets")
+
+# Set page configuration
+st.set_page_config(
+    page_title="MaxMin",
+    page_icon=os.path.join(assets_dir, "QC-Devs.png")
+)
+
+st.title("Brute Strength - MaxMin")
+
+st.sidebar.header("Brute Strength - MaxMin")
+
+st.sidebar.info(
+    """
+    MaxMin is possibly the most widely used method for dissimilarity-based
+    compound selection. When presented with a dataset of samples, the
+    initial point is chosen as the dataset's medoid center. Next, the second
+    point is chosen to be that which is furthest from this initial point.
+    Subsequently, all following points are selected via the following
+    logic:
+
+    1. Find the minimum distance from every point to the already-selected ones.
+    2. Select the point which has the maximum distance among those calculated
+       in the previous step.
+
+    In the current implementation, this method requires or computes the full pairwise-distance
+    matrix, so it is not recommended for large datasets.
+    """
+)
+
+st.sidebar.title("References")
+
+st.sidebar.info("[1] Ashton, Mark, et al., Identification of diverse database subsets using "
+                "property‐based and fragment‐based molecular descriptions, "
+                "Quantitative Structure‐Activity Relationships 21.6 (2002): 598-604.")
+
+
+# File uploader for feature matrix or distance matrix (required)
+matrix_file = st.file_uploader("Upload a feature matrix or distance matrix (required)", type=["csv", "xlsx", "npz", "npy"], key="matrix_file")
+
+# Clear selected indices if no matrix file is provided
+if matrix_file is None:
+    st.session_state.pop("selected_ids", None)
+# Load data from matrix file
+else:
+    try:
+        header_option = None
+        if matrix_file.name.endswith(".csv") or matrix_file.name.endswith(".xlsx"):
+            header_option = st.checkbox("Does the file have a header?", key="header_option")
+            st.warning("Warning: This will affect the final output if not specified correctly.")
+
+        if matrix_file.name.endswith(".csv") or matrix_file.name.endswith(".xlsx"):
+            # Use the matching pandas reader for CSV and Excel files
+            reader = pd.read_csv if matrix_file.name.endswith(".csv") else pd.read_excel
+            if header_option:
+                # Load the matrix with a header row
+                matrix = reader(matrix_file).values
+            else:
+                # Load the matrix without a header row
+                matrix = reader(matrix_file, header=None).values
+            st.write("Matrix shape:", matrix.shape)
+            st.write(matrix)
+
+        elif matrix_file.name.endswith(".npz"):
+            matrix_data = np.load(matrix_file)
+            # Select the array in the .npz file
+            array_names = matrix_data.files
+            selected_array = st.selectbox("Select the array to use", array_names)
+            matrix = matrix_data[selected_array]
+            st.write("Matrix shape:", matrix.shape)
+            st.write(matrix)
+        elif matrix_file.name.endswith(".npy"):
+            matrix = np.load(matrix_file)
+            st.write("Matrix shape:", matrix.shape)
+            st.write(matrix)
+    except Exception as e:
+        st.error(f'An error occurred while loading the matrix file: {e}')
+        matrix = None
+
+
+    # Input for number of points to select (required)
+    num_points = st.number_input("Number of points to select", min_value=1, step=1, key="num_points")
+
+    # Input for cluster label list (optional)
+    label_file = st.file_uploader("Upload a cluster label list (optional)", type=["csv", "xlsx"], key="label_file")
+    labels = None
+    if label_file:
+        try:
+            label_header_option = None
+            if label_file.name.endswith(".csv") or label_file.name.endswith(".xlsx"):
+                label_header_option = st.checkbox("Does the file have a header?",
+                                                  key="label_header_option")
+                st.warning(
+                    "Warning: This will affect the final output if not specified correctly.")
+
+            if label_file.name.endswith(".csv") or label_file.name.endswith(".xlsx"):
+                # Use the matching pandas reader for CSV and Excel files
+                reader = pd.read_csv if label_file.name.endswith(".csv") else pd.read_excel
+                if label_header_option:
+                    labels = reader(label_file).values.flatten()
+                else:
+                    labels = reader(label_file, header=None).values.flatten()
+                st.write("Cluster labels shape:", labels.shape)
+                st.write(labels)
+        except Exception as e:
+            st.error(f'An error occurred while loading the cluster label file: {e}')
+            labels = None
+
+
+    if st.button("Run MaxMin Algorithm"):
+        try:
+            # Check if the input matrix is a feature matrix or a distance matrix
+            if matrix.shape[0] == matrix.shape[1]:
+                # Distance matrix
+                selector = MaxMin()
+                selected_ids = selector.select(matrix, size=num_points, labels=labels)
+            else:
+                # Feature matrix
+                selector = MaxMin(lambda x: pairwise_distances(x, metric="euclidean"))
+                selected_ids = selector.select(matrix, size=num_points, labels=labels)
+
+            # Convert selected indices to a list of integers
+            selected_ids = [int(i) for i in selected_ids]
+
+            # Save selected indices to session state
+            st.session_state['selected_ids'] = selected_ids
+        except ValueError as ve:
+            st.error(f"An error occurred while running the MaxMin algorithm: {ve}")
+        except Exception as e:
+            st.error(f"An error occurred while running the MaxMin algorithm: {e}")
+
+# Check if the selected indices are stored in the session state
+if 'selected_ids' in st.session_state and matrix_file is not None:
+    selected_ids = st.session_state['selected_ids']
+    st.write("Selected indices:", selected_ids)
+
+    # export format
+    export_format = st.selectbox("Select export format", ["CSV", "JSON"], key="export_format")
+
+    if export_format == "CSV":
+        csv_data = pd.DataFrame(selected_ids, columns=["Selected Indices"])
+        csv = csv_data.to_csv(index=False).encode('utf-8')
+        st.download_button(
+            label="Download as CSV",
+            data=csv,
+            file_name='selected_indices.csv',
+            mime='text/csv',
+        )
+    else:
+        json_data = json.dumps({"Selected Indices": selected_ids})
+        st.download_button(
+            label="Download as JSON",
+            data=json_data,
+            file_name='selected_indices.json',
+            mime='application/json',
+        )
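
The page above drives `selector.methods.distance.MaxMin` through Streamlit widgets. For sanity-checking the same call pattern outside the app, here is a minimal sketch; it assumes the `selector` package is installed and exposes `MaxMin` exactly as used in `page_maxmin.py` (an optional distance callable in the constructor and a `select(matrix, size=..., labels=...)` method), and the random feature matrix is purely illustrative.

```python
import numpy as np
from sklearn.metrics import pairwise_distances
from selector.methods.distance import MaxMin

# Illustrative data only: 100 samples described by 10 features.
rng = np.random.default_rng(0)
features = rng.random((100, 10))

# Feature-matrix branch: MaxMin gets a callable that builds the pairwise-distance
# matrix, the same lambda used in page_maxmin.py.
selector = MaxMin(lambda x: pairwise_distances(x, metric="euclidean"))
selected_ids = selector.select(features, size=10, labels=None)

# Distance-matrix branch: a precomputed square matrix is passed to MaxMin() directly,
# mirroring the matrix.shape[0] == matrix.shape[1] check in the page.
dist = pairwise_distances(features, metric="euclidean")
selected_from_dist = MaxMin().select(dist, size=10, labels=None)

print([int(i) for i in selected_ids])
print([int(i) for i in selected_from_dist])
```

Both calls should return the same indices, since the square matrix is just the distances the lambda would have computed from the features.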