✨ 🔀 Continuous perturbations
Add continuous feature perturbations.

---------

Co-authored-by: Ricardo Hernández Medina <[email protected]>
Co-authored-by: Henry Webel <[email protected]>
Co-authored-by: Marc Pielies Avelli <[email protected]>
4 people authored Aug 14, 2024
1 parent 92bced0 commit 49ee412
Showing 48 changed files with 27,935 additions and 313 deletions.
107 changes: 101 additions & 6 deletions .github/workflows/release.yaml
@@ -1,16 +1,111 @@
name: release on pypi
name: CI
on:
  push:
    branches:
      - main
  pull_request:
    # branches:
    # - main

jobs:
  format:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: psf/black@stable
  lint:
    name: Lint with flake8
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install flake8
        run: pip install flake8 flake8-bugbear
      - name: Lint with flake8
        run: flake8 src
  run-tutorial:
    name: Run tutorial - random_small
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: pip install .
      - name: Prepare tutorial data
        run: |
          cd tutorial
          move-dl data=random_small task=encode_data --cfg job
          move-dl data=random_small task=encode_data
      - name: Train model and analyze latent space
        run: |
          cd tutorial
          move-dl data=random_small task=random_small__latent --cfg job
          move-dl data=random_small task=random_small__latent
      - name: Identify associations - t-test
        run: |
          cd tutorial
          move-dl data=random_small task=random_small__id_assoc_ttest --cfg job
          move-dl data=random_small task=random_small__id_assoc_ttest task.training_loop.num_epochs=30 task.num_refits=4
      - name: Identify associations - bayes factors
        run: |
          cd tutorial
          move-dl data=random_small task=random_small__id_assoc_bayes --cfg job
          move-dl data=random_small task=random_small__id_assoc_bayes task.training_loop.num_epochs=30 task.num_refits=20
  run-tutorial-cont:
    name: Run tutorial - random_continuous
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: pip install .
      - name: Prepare tutorial data
        run: |
          cd tutorial
          move-dl data=random_continuous task=encode_data
      - name: Train model and analyze latent space
        run: |
          cd tutorial
          move-dl data=random_continuous task=random_continuous__latent --cfg job
          move-dl data=random_continuous task=random_continuous__latent
      - name: Identify associations - t-test
        run: |
          cd tutorial
          move-dl data=random_continuous task=random_continuous__id_assoc_ttest --cfg job
          move-dl data=random_continuous task=random_continuous__id_assoc_ttest task.training_loop.num_epochs=30 task.num_refits=4
      - name: Identify associations - bayes factors
        run: |
          cd tutorial
          move-dl data=random_continuous task=random_continuous__id_assoc_bayes --cfg job
          move-dl data=random_continuous task=random_continuous__id_assoc_bayes task.training_loop.num_epochs=30 task.num_refits=4
      - name: Identify associations - KS
        run: |
          cd tutorial
          move-dl data=random_continuous task=random_continuous__id_assoc_ks --cfg job
          move-dl data=random_continuous task=random_continuous__id_assoc_ks task.training_loop.num_epochs=30 task.num_refits=4
  publish:
    name: Publish package
    runs-on: ubuntu-latest
    if: startsWith(github.ref, 'refs/tags')
    needs:
      - format
      - lint
    steps:
      - uses: actions/checkout@v3
      - name: Publish package
        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install twine and build
        run: python -m pip install --upgrade twine build
      - name: Build
        run: python -m build
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          user: __token__
12 changes: 12 additions & 0 deletions .gitignore
@@ -40,6 +40,11 @@ tutorial/*
!tutorial/notebooks/*.ipynb
!tutorial/README.md

# Supplementary files
supplementary_files/*.png
supplementary_files/*.tsv
supplementary_files/*.txt

# Virtual environment
venv/
virtualvenv/
@@ -48,6 +53,12 @@ virtualvenv/
docs/build/
docs/source/_templates/

# VS Code settings
.vscode

# macOS
.DS_Store

# Root folder
/*.*
!/.gitignore
@@ -58,3 +69,4 @@ docs/source/_templates/
!/pyproject.toml
!/requirements.txt
!/setup.cfg
!/.github
8 changes: 4 additions & 4 deletions README.md
@@ -42,7 +42,7 @@ Medication data

## Installing MOVE package

MOVE is written in Python and can therefore be installed using `pip`:
MOVE is written in Python and can be installed using `pip`:

```bash
>>> pip install move-dl
@@ -78,11 +78,11 @@ MOVE has five-six steps:
## How to run MOVE

Please refer to our [**documentation**](https://move-dl.readthedocs.io/) for
examples and [tutorials](https://move-dl.readthedocs.io/tutorial/index.html)
examples and [tutorials](https://move-dl.readthedocs.io/tutorial/index.html)
on how to run MOVE.

Additionally, you can copy
[this notebook](https://colab.research.google.com/drive/1RFWNsuGymCmppPsElBvDuA9zRbGskKmi?usp=sharing)
Additionally, you can copy
[this notebook](https://colab.research.google.com/drive/1RFWNsuGymCmppPsElBvDuA9zRbGskKmi?usp=sharing)
and follow its instructions to get familiar with our pipeline.

# Data sets
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -38,7 +38,7 @@

html_theme = "sphinx_rtd_theme"
html_theme_options = {
"collapse_navigation" : False,
"collapse_navigation": False,
}
html_static_path = []

2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
hydra-core>=1.2.0
numpy>=1.21.5
numpy>=1.21.5,<2
pandas>=1.4.2
torch>=1.11.0
matplotlib>=3.5.2
11 changes: 9 additions & 2 deletions setup.cfg
@@ -1,6 +1,8 @@
[metadata]
name = move-dl
description = Multi-omics variational autoencoder
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/RasmussenLab/MOVE
classifiers =
    Intended Audience :: Healthcare Industry
@@ -15,13 +17,13 @@ version = attr: move.__version__
include_package_data = True
install_requires =
    hydra-core
    numpy
    numpy<2
    pandas
    torch
    matplotlib
    seaborn
    scikit-learn
    scipy
    scipy>=1.10.0

package_dir =
    = src
@@ -34,3 +36,8 @@ where = src
[options.entry_points]
console_scripts =
    move-dl=move.__main__:main

[flake8]
max-line-length = 88
aggressive = 2
extend-ignore = E203
14 changes: 7 additions & 7 deletions src/move/__init__.py
@@ -1,11 +1,11 @@
from __future__ import annotations

__license__ = "MIT"
__version__ = (1, 4, 10)
__all__ = ["conf", "data", "models", "training_loop", "VAE"]

HYDRA_VERSION_BASE = "1.2"

from move import conf, data, models
from move.models.vae import VAE
from move.training.training_loop import training_loop
from move import conf, data, models # noqa:E402
from move.models.vae import VAE # noqa:E402
from move.training.training_loop import training_loop # noqa:E402

__license__ = "MIT"
__version__ = (1, 5, 0)
__all__ = ["conf", "data", "models", "training_loop", "VAE"]
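
A quick sanity check of the re-exported names and the bumped version, assuming the `move-dl` package is installed:

```python
import move

# The version tuple is bumped from (1, 4, 10) to (1, 5, 0) in this commit.
print(move.__version__)  # (1, 5, 0)
print(move.__all__)      # ['conf', 'data', 'models', 'training_loop', 'VAE']
```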
18 changes: 18 additions & 0 deletions src/move/analysis/metrics.py
@@ -81,3 +81,21 @@ def norm(x: np.ma.MaskedArray, axis: int = 1) -> np.ma.MaskedArray:
        1D array with the specified axis removed.
    """
    return np.sqrt(np.sum(x**2, axis=axis))


def get_2nd_order_polynomial(
    x_array: FloatArray, y_array: FloatArray, n_points=100
) -> tuple[FloatArray, FloatArray, tuple[float, float, float]]:
    """
    Given a set of x and y values, find the 2nd-order polynomial that best fits
    the data.

    Returns:
        x_pol: x coordinates for the polynomial function evaluation.
        y_pol: y coordinates for the polynomial function evaluation.
        (a2, a1, a): fitted polynomial coefficients.
    """
    a2, a1, a = np.polyfit(x_array, y_array, deg=2)

    x_pol = np.linspace(np.min(x_array), np.max(x_array), n_points)
    y_pol = np.array([a2 * x * x + a1 * x + a for x in x_pol])

    return x_pol, y_pol, (a2, a1, a)
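
A minimal usage sketch of the new helper; the sample data below is made up for illustration, and only `numpy` plus an installed `move` package are assumed:

```python
import numpy as np

from move.analysis.metrics import get_2nd_order_polynomial

# Hypothetical noisy quadratic data, for illustration only.
rng = np.random.default_rng(seed=0)
x = np.linspace(-5.0, 5.0, 200)
y = 2.0 * x**2 - 3.0 * x + 1.0 + rng.normal(scale=0.5, size=x.size)

# Fit the quadratic and evaluate it on 100 evenly spaced points.
x_pol, y_pol, (a2, a1, a0) = get_2nd_order_polynomial(x, y, n_points=100)
print(f"fitted coefficients: a2={a2:.2f}, a1={a1:.2f}, a0={a0:.2f}")
```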
1 change: 1 addition & 0 deletions src/move/conf/main.yaml
@@ -20,6 +20,7 @@ hydra:
  job:
    config:
      override_dirname:
        item_sep: ";"
        exclude_keys:
          - experiment

28 changes: 28 additions & 0 deletions src/move/conf/schema.py
@@ -28,9 +28,11 @@ class InputConfig:
    name: str
    weight: int = 1


@dataclass
class ContinuousInputConfig(InputConfig):
    scale: bool = True
    log2: bool = False


@dataclass
@@ -185,6 +187,27 @@ class IdentifyAssociationsTTestConfig(IdentifyAssociationsConfig):
    num_latent: list[int] = MISSING


@dataclass
class IdentifyAssociationsKSConfig(IdentifyAssociationsConfig):
    """Configure the Kolmogorov-Smirnov approach to identify associations.
    Args:
        perturbed_feature_names: names of the perturbed features of interest.
        target_feature_names: names of the target features of interest.
    Description:
        For each perturbed feature - target feature pair, we will plot:
        - Input vs. reconstruction correlation plot: to assess reconstruction
          quality of both target and perturbed features.
        - Distribution of reconstruction values for the target feature before
          and after the perturbation of the perturbed feature.
    """

    perturbed_feature_names: list[str] = field(default_factory=list)
    target_feature_names: list[str] = field(default_factory=list)


@dataclass
class MOVEConfig:
    defaults: list[Any] = field(default_factory=lambda: [dict(data="base_data")])
@@ -237,6 +260,11 @@ def extract_names(configs: list[InputConfig]) -> list[str]:
    name="identify_associations_ttest_schema",
    node=IdentifyAssociationsTTestConfig,
)
cs.store(
    group="task",
    name="identify_associations_ks_schema",
    node=IdentifyAssociationsKSConfig,
)

# Register custom resolvers
OmegaConf.register_new_resolver("weights", extract_weights)
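
To illustrate how the new structured config is meant to be used, here is a small sketch built directly with OmegaConf; the feature names are hypothetical, and in practice the fields are filled from a task YAML such as `identify_associations_ks.yaml` below:

```python
from omegaconf import OmegaConf

from move.conf.schema import IdentifyAssociationsKSConfig

# Build a structured config from the new dataclass; parent fields that are not
# set here remain missing ("???") until Hydra fills them from the task YAML.
cfg = OmegaConf.structured(IdentifyAssociationsKSConfig)
cfg.perturbed_feature_names = ["drug_1"]       # hypothetical feature name
cfg.target_feature_names = ["metabolite_7"]    # hypothetical feature name
print(OmegaConf.to_yaml(cfg))
```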
2 changes: 2 additions & 0 deletions src/move/conf/task/identify_associations_bayes.yaml
@@ -32,3 +32,5 @@ training_loop:
    - 25
  early_stopping: false
  patience: 0


26 changes: 26 additions & 0 deletions src/move/conf/task/identify_associations_ks.yaml
@@ -0,0 +1,26 @@
defaults:
  - identify_associations_ks_schema

model:
  categorical_weights: ${weights:${data.categorical_inputs}}
  continuous_weights: ${weights:${data.continuous_inputs}}
  num_hidden:
    - 100
  num_latent: 50
  beta: 0.1
  dropout: 0.1
  cuda: false

training_loop:
  lr: 1e-4
  num_epochs: 200
  batch_dilation_steps:
    - 50
    - 100
    - 150
  kld_warmup_steps:
    - 15
    - 20
    - 25
  early_stopping: false
  patience: 0
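
For reference, this task file can be composed the same way the `move-dl` CLI does it; the sketch below is hypothetical, with the config module path and override name inferred from the files in this commit:

```python
from hydra import compose, initialize_config_module

# Hypothetical sketch: "move.conf" and the "task=identify_associations_ks"
# override are assumptions based on the config files touched in this commit,
# not a documented API.
with initialize_config_module(config_module="move.conf", version_base="1.2"):
    cfg = compose(config_name="main", overrides=["task=identify_associations_ks"])

print(cfg.task.model.num_latent)  # 50, from the YAML above
```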
2 changes: 2 additions & 0 deletions src/move/conf/task/identify_associations_ttest.yaml
@@ -35,3 +35,5 @@ training_loop:
    - 25
  early_stopping: false
  patience: 0

