v0.2.3

gao-lab · Jun 21, 2022 · 0e9c835
1 parent 9f37e40
commit 0e9c835
Show file tree

Hide file tree

Showing 7 changed files with 86 additions and 18 deletions.
diff --git a/docs/release.rst b/docs/release.rst
@@ -1,6 +1,20 @@
 Release notes
 =============
 
+v0.2.3
+------
+
+Minor improvements and bug fixes
+
+Bug fixes:
+
+- Data frame in ``obsm`` no longer triggers an error during model training (Resolves `#32 <https://github.com/gao-lab/GLUE/issues/32>`_).
+
+Enhancements:
+
+- `scglue.data.transfer_labels <api/scglue.data.transfer_labels.rst>`_ uses a new strategy with SNN-based estimation of transfer confidence (Resolves `#23 <https://github.com/gao-lab/GLUE/issues/23>`_).
+- Allow setting custom bedtools path via `scglue.config.BEDTOOLS_PATH <api/scglue.utils.ConfigManager.rst>`_ (Resolves `#22 <https://github.com/gao-lab/GLUE/issues/22>`_).
+
 v0.2.2
 ------
 
@@ -42,7 +56,7 @@ Bug fixes:
 Experimental features:
 
 - A `partially paired GLUE model <api/scglue.models.scglue.PairedSCGLUEModel.rst>`_ for utilizing paired cells whenever available
-- The `CLUE model <api/scglue.models.scclue.SCCLUEModel.rst>`_ that won the `NeurIPS 2020 competition in multimodal integration <https://openproblems.bio/neurips_2021/>`_ is here!
+- The `CLUE model <api/scglue.models.scclue.SCCLUEModel.rst>`_ that won the `NeurIPS 2021 competition in multimodal integration <https://openproblems.bio/neurips_2021/>`_ is here!
 
 
 v0.1.1

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
 
 [project]
 name = "scglue"
-version = "0.2.2"
+version = "0.2.3"
 description = "Graph-linked unified embedding for unpaired single-cell multi-omics data integration"
 readme = "README.md"
 requires-python = ">=3.6"

diff --git a/scglue/check.py b/scglue/check.py
@@ -11,7 +11,7 @@
 from packaging.version import parse
 
 from . import version
-from .utils import run_command
+from .utils import config, run_command
 
 
 class Checker:
@@ -146,7 +146,7 @@ def check(self) -> None:
 
 CHECKERS = dict(
     bedtools=CmdChecker(
-        "bedtools", "bedtools --version", r"v([0-9\.]+)",
+        "bedtools", f"{config.BEDTOOLS_PATH or 'bedtools'} --version", r"v([0-9\.]+)",
         vmin="2.29.2", install_hint=INSTALL_HINTS.bedtools
     ),
     plotly=ModuleChecker(

diff --git a/scglue/data.py b/scglue/data.py
@@ -20,6 +20,7 @@
 import sklearn.feature_extraction.text
 import sklearn.linear_model
 import sklearn.neighbors
+import sklearn.preprocessing
 import sklearn.utils.extmath
 from anndata import AnnData
 from networkx.algorithms.bipartite import biadjacency_matrix
@@ -199,7 +200,7 @@ def aggregate_obs(
 
 def transfer_labels(
         ref: AnnData, query: AnnData, field: str,
-        n_neighbors: int = 5, use_rep: Optional[str] = None,
+        n_neighbors: int = 30, use_rep: Optional[str] = None,
         key_added: Optional[str] = None, **kwargs
 ) -> None:
     r"""
@@ -224,20 +225,46 @@ def transfer_labels(
     **kwargs
         Additional keyword arguments are passed to
         :class:`sklearn.neighbors.NearestNeighbors`
+
+    Note
+    ----
+    First, nearest neighbors between reference and query cells are searched and
+    weighted by Jaccard index of SNN (shared nearest neighbors). The Jaccard
+    indices are then normalized per query cell to form a mapping matrix. To
+    obtain predictions for query cells, we multiply the above mapping matrix by
+    the one-hot matrix of reference labels. The category with the highest score
+    is taken as the final prediction, while its score is interpreted as
+    transfer confidence (stored as "{key_added}_confidence" in ``query.obs``).
     """
-    ref_mat = ref.obsm[use_rep] if use_rep else ref.X
-    query_mat = query.obsm[use_rep] if use_rep else query.X
-    nn = sklearn.neighbors.NearestNeighbors(
+    xrep = ref.obsm[use_rep] if use_rep else ref.X
+    yrep = query.obsm[use_rep] if use_rep else query.X
+    xnn = sklearn.neighbors.NearestNeighbors(
         n_neighbors=n_neighbors, **kwargs
-    ).fit(ref_mat)
-    nni = nn.kneighbors(query_mat, return_distance=False)
-    hits = ref.obs[field].to_numpy()[nni]
-    pred = pd.crosstab(
-        np.repeat(query.obs_names, n_neighbors), hits.ravel()
-    ).idxmax(axis=1).loc[query.obs_names]
-    if pd.api.types.is_categorical_dtype(ref.obs[field]):
-        pred = pd.Categorical(pred, categories=ref.obs[field].cat.categories)
-    query.obs[key_added or field] = pred
+    ).fit(xrep)
+    ynn = sklearn.neighbors.NearestNeighbors(
+        n_neighbors=n_neighbors, **kwargs
+    ).fit(yrep)
+    xx = xnn.kneighbors_graph(xrep)
+    xy = ynn.kneighbors_graph(xrep)
+    yx = xnn.kneighbors_graph(yrep)
+    yy = ynn.kneighbors_graph(yrep)
+    jaccard = (xx @ yx.T) + (xy @ yy.T)
+    jaccard.data /= 4 * n_neighbors - jaccard.data
+    normalized_jaccard = jaccard.multiply(1 / jaccard.sum(axis=0))
+    onehot = sklearn.preprocessing.OneHotEncoder()
+    xtab = onehot.fit_transform(ref.obs[[field]])
+    ytab = normalized_jaccard.T @ xtab
+    pred = pd.Series(
+        onehot.categories_[0][ytab.argmax(axis=1).A1],
+        index=query.obs_names, dtype=ref.obs[field].dtype
+    )
+    conf = pd.Series(
+        ytab.max(axis=1).toarray().ravel(),
+        index=query.obs_names
+    )
+    key_added = key_added or field
+    query.obs[key_added] = pred
+    query.obs[key_added + "_confidence"] = conf
 
 
 def extract_rank_genes_groups(

diff --git a/scglue/models/scglue.py b/scglue/models/scglue.py
@@ -284,7 +284,7 @@ def _extract_xalt(self, adata: AnnData, data_config: DATA_CONFIG) -> AnyArray:
                     f"Configured data representation '{use_rep}' "
                     f"cannot be found in input data!"
                 )
-            xalt = adata.obsm[use_rep].astype(default_dtype)
+            xalt = np.asarray(adata.obsm[use_rep]).astype(default_dtype)
             if xalt.shape[1] != rep_dim:
                 raise ValueError(
                     f"Input representation dimensionality {xalt.shape[1]} "

diff --git a/scglue/utils.py b/scglue/utils.py
@@ -14,6 +14,7 @@
 import numpy as np
 import pandas as pd
 import torch
+from pybedtools.helpers import set_bedtools_path
 
 from .typehint import RandomState, T
 
@@ -195,6 +196,7 @@ def __init__(self) -> None:
         self.PRINT_LOSS_INTERVAL = 10
         self.TENSORBOARD_FLUSH_SECS = 5
         self.ALLOW_TRAINING_INTERRUPTION = True
+        self.BEDTOOLS_PATH = ""
 
     @property
     def TMP_PREFIX(self) -> str:
@@ -426,6 +428,19 @@ def ALLOW_TRAINING_INTERRUPTION(self) -> bool:
     def ALLOW_TRAINING_INTERRUPTION(self, allow_training_interruption: bool) -> None:
         self._ALLOW_TRAINING_INTERRUPTION = allow_training_interruption
 
+    @property
+    def BEDTOOLS_PATH(self) -> str:
+        r"""
+        Path to bedtools executable.
+        Default value is an empty string, in which case ``bedtools`` is looked up on the system ``PATH``.
+        """
+        return self._BEDTOOLS_PATH
+
+    @BEDTOOLS_PATH.setter
+    def BEDTOOLS_PATH(self, bedtools_path: str) -> None:
+        self._BEDTOOLS_PATH = bedtools_path
+        set_bedtools_path(bedtools_path)
+
 
 config = ConfigManager()
 

diff --git a/tests/models/test_scglue.py b/tests/models/test_scglue.py
@@ -360,3 +360,15 @@ def test_fit_SCGLUE(rna_pp, atac_pp, prior):
         compile_kws={"lr": 1e-5},
         fit_kws={"max_epochs": 5}
     )  # NOTE: Smoke test
+
+
+def test_fit_SCGLUE_with_df_obsm(rna_pp, atac_pp, prior):
+    rna_pp.obsm["X_pca"] = pd.DataFrame(rna_pp.obsm["X_pca"], index=rna_pp.obs_names)
+    scglue.models.configure_dataset(rna_pp, "NB", use_highly_variable=True, use_rep="X_pca", use_batch="batch", use_uid="uid")
+    scglue.models.configure_dataset(atac_pp, "NB", use_highly_variable=True, use_cell_type="ct", use_batch="batch", use_uid="uid")
+    scglue.models.fit_SCGLUE(
+        {"rna": rna_pp, "atac": atac_pp}, prior,
+        init_kws={"latent_dim": 2, "shared_batches": True},
+        compile_kws={"lr": 1e-5},
+        fit_kws={"max_epochs": 5}
+    )  # NOTE: Smoke test