Skip to content

Commit

Permalink
v0.2.3
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeff1995 committed Jun 21, 2022
1 parent 9f37e40 commit 0e9c835
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 18 deletions.
16 changes: 15 additions & 1 deletion docs/release.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
Release notes
=============

v0.2.3
------

Minor improvements and bug fixes

Bug fixes:

- A data frame stored in ``obsm`` no longer triggers an error during model training (Resolves `#32 <https://github.com/gao-lab/GLUE/issues/32>`_).

Enhancements:

- `scglue.data.transfer_labels <api/scglue.data.transfer_labels.rst>`_ uses a new strategy with SNN-based estimation of transfer confidence (Resolves `#23 <https://github.com/gao-lab/GLUE/issues/23>`_).
- Allow setting a custom bedtools path via `scglue.config.BEDTOOLS_PATH <api/scglue.utils.ConfigManager.rst>`_ (Resolves `#22 <https://github.com/gao-lab/GLUE/issues/22>`_).

v0.2.2
------

Expand Down Expand Up @@ -42,7 +56,7 @@ Bug fixes:
Experimental features:

- A `partially paired GLUE model <api/scglue.models.scglue.PairedSCGLUEModel.rst>`_ for utilizing paired cells whenever available
- The `CLUE model <api/scglue.models.scclue.SCCLUEModel.rst>`_ that won the `NeurIPS 2020 competition in multimodal integration <https://openproblems.bio/neurips_2021/>`_ is here!
- The `CLUE model <api/scglue.models.scclue.SCCLUEModel.rst>`_ that won the `NeurIPS 2021 competition in multimodal integration <https://openproblems.bio/neurips_2021/>`_ is here!


v0.1.1
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"

[project]
name = "scglue"
version = "0.2.2"
version = "0.2.3"
description = "Graph-linked unified embedding for unpaired single-cell multi-omics data integration"
readme = "README.md"
requires-python = ">=3.6"
Expand Down
4 changes: 2 additions & 2 deletions scglue/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from packaging.version import parse

from . import version
from .utils import run_command
from .utils import config, run_command


class Checker:
Expand Down Expand Up @@ -146,7 +146,7 @@ def check(self) -> None:

CHECKERS = dict(
bedtools=CmdChecker(
"bedtools", "bedtools --version", r"v([0-9\.]+)",
"bedtools", f"{config.BEDTOOLS_PATH or 'bedtools'} --version", r"v([0-9\.]+)",
vmin="2.29.2", install_hint=INSTALL_HINTS.bedtools
),
plotly=ModuleChecker(
Expand Down
53 changes: 40 additions & 13 deletions scglue/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import sklearn.feature_extraction.text
import sklearn.linear_model
import sklearn.neighbors
import sklearn.preprocessing
import sklearn.utils.extmath
from anndata import AnnData
from networkx.algorithms.bipartite import biadjacency_matrix
Expand Down Expand Up @@ -199,7 +200,7 @@ def aggregate_obs(

def transfer_labels(
ref: AnnData, query: AnnData, field: str,
n_neighbors: int = 5, use_rep: Optional[str] = None,
n_neighbors: int = 30, use_rep: Optional[str] = None,
key_added: Optional[str] = None, **kwargs
) -> None:
r"""
Expand All @@ -224,20 +225,46 @@ def transfer_labels(
**kwargs
Additional keyword arguments are passed to
:class:`sklearn.neighbors.NearestNeighbors`
Note
----
First, nearest neighbors between reference and query cells are searched and
weighted by the Jaccard index of their SNN (shared nearest neighbors). The
Jaccard indices are then normalized per query cell to form a mapping matrix.
To obtain predictions for query cells, the mapping matrix is multiplied by
the one-hot matrix of reference labels. The category with the highest score
is taken as the final prediction, and its score is interpreted as the
transfer confidence (stored as "{key_added}_confidence" in ``query.obs``).
"""
ref_mat = ref.obsm[use_rep] if use_rep else ref.X
query_mat = query.obsm[use_rep] if use_rep else query.X
nn = sklearn.neighbors.NearestNeighbors(
xrep = ref.obsm[use_rep] if use_rep else ref.X
yrep = query.obsm[use_rep] if use_rep else query.X
xnn = sklearn.neighbors.NearestNeighbors(
n_neighbors=n_neighbors, **kwargs
).fit(ref_mat)
nni = nn.kneighbors(query_mat, return_distance=False)
hits = ref.obs[field].to_numpy()[nni]
pred = pd.crosstab(
np.repeat(query.obs_names, n_neighbors), hits.ravel()
).idxmax(axis=1).loc[query.obs_names]
if pd.api.types.is_categorical_dtype(ref.obs[field]):
pred = pd.Categorical(pred, categories=ref.obs[field].cat.categories)
query.obs[key_added or field] = pred
).fit(xrep)
ynn = sklearn.neighbors.NearestNeighbors(
n_neighbors=n_neighbors, **kwargs
).fit(yrep)
xx = xnn.kneighbors_graph(xrep)
xy = ynn.kneighbors_graph(xrep)
yx = xnn.kneighbors_graph(yrep)
yy = ynn.kneighbors_graph(yrep)
jaccard = (xx @ yx.T) + (xy @ yy.T)
jaccard.data /= 4 * n_neighbors - jaccard.data
normalized_jaccard = jaccard.multiply(1 / jaccard.sum(axis=0))
onehot = sklearn.preprocessing.OneHotEncoder()
xtab = onehot.fit_transform(ref.obs[[field]])
ytab = normalized_jaccard.T @ xtab
pred = pd.Series(
onehot.categories_[0][ytab.argmax(axis=1).A1],
index=query.obs_names, dtype=ref.obs[field].dtype
)
conf = pd.Series(
ytab.max(axis=1).toarray().ravel(),
index=query.obs_names
)
key_added = key_added or field
query.obs[key_added] = pred
query.obs[key_added + "_confidence"] = conf


def extract_rank_genes_groups(
Expand Down
2 changes: 1 addition & 1 deletion scglue/models/scglue.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def _extract_xalt(self, adata: AnnData, data_config: DATA_CONFIG) -> AnyArray:
f"Configured data representation '{use_rep}' "
f"cannot be found in input data!"
)
xalt = adata.obsm[use_rep].astype(default_dtype)
xalt = np.asarray(adata.obsm[use_rep]).astype(default_dtype)
if xalt.shape[1] != rep_dim:
raise ValueError(
f"Input representation dimensionality {xalt.shape[1]} "
Expand Down
15 changes: 15 additions & 0 deletions scglue/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import numpy as np
import pandas as pd
import torch
from pybedtools.helpers import set_bedtools_path

from .typehint import RandomState, T

Expand Down Expand Up @@ -195,6 +196,7 @@ def __init__(self) -> None:
self.PRINT_LOSS_INTERVAL = 10
self.TENSORBOARD_FLUSH_SECS = 5
self.ALLOW_TRAINING_INTERRUPTION = True
self.BEDTOOLS_PATH = ""

@property
def TMP_PREFIX(self) -> str:
Expand Down Expand Up @@ -426,6 +428,19 @@ def ALLOW_TRAINING_INTERRUPTION(self) -> bool:
def ALLOW_TRAINING_INTERRUPTION(self, allow_training_interruption: bool) -> None:
self._ALLOW_TRAINING_INTERRUPTION = allow_training_interruption

@property
def BEDTOOLS_PATH(self) -> str:
    r"""
    Custom bedtools path.
    Default value is an empty string (as assigned in ``__init__``);
    ``scglue/check.py`` falls back to plain ``bedtools`` when the
    value is empty.

    NOTE(review): pybedtools documents ``set_bedtools_path`` as taking
    the directory that contains the bedtools executables, whereas
    ``scglue/check.py`` runs this value as the executable itself --
    confirm which form is intended.
    """
    return self._BEDTOOLS_PATH

@BEDTOOLS_PATH.setter
def BEDTOOLS_PATH(self, bedtools_path: str) -> None:
    # Keep the value so the getter can return it
    self._BEDTOOLS_PATH = bedtools_path
    # Hand the same value to pybedtools through its ``set_bedtools_path`` helper
    set_bedtools_path(bedtools_path)


config = ConfigManager()

Expand Down
12 changes: 12 additions & 0 deletions tests/models/test_scglue.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,3 +360,15 @@ def test_fit_SCGLUE(rna_pp, atac_pp, prior):
compile_kws={"lr": 1e-5},
fit_kws={"max_epochs": 5}
) # NOTE: Smoke test


def test_fit_SCGLUE_with_df_obsm(rna_pp, atac_pp, prior):
    """
    Smoke test: model fitting must run when the ``obsm`` entry selected
    via ``use_rep`` is a ``pandas.DataFrame``.
    """
    # Replace the PCA representation with a data frame indexed by cell names
    pca_frame = pd.DataFrame(rna_pp.obsm["X_pca"], index=rna_pp.obs_names)
    rna_pp.obsm["X_pca"] = pca_frame

    # RNA uses the data frame representation; ATAC keeps its cell type setup
    scglue.models.configure_dataset(
        rna_pp, "NB",
        use_highly_variable=True, use_rep="X_pca",
        use_batch="batch", use_uid="uid"
    )
    scglue.models.configure_dataset(
        atac_pp, "NB",
        use_highly_variable=True, use_cell_type="ct",
        use_batch="batch", use_uid="uid"
    )

    adatas = {"rna": rna_pp, "atac": atac_pp}
    init_kws = {"latent_dim": 2, "shared_batches": True}
    compile_kws = {"lr": 1e-5}
    fit_kws = {"max_epochs": 5}
    scglue.models.fit_SCGLUE(
        adatas, prior,
        init_kws=init_kws,
        compile_kws=compile_kws,
        fit_kws=fit_kws
    )  # NOTE: Smoke test

0 comments on commit 0e9c835

Please sign in to comment.