-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: DriesSchaumont <[email protected]>
- Loading branch information
1 parent
54fa8e5
commit 55fa49b
Showing
4 changed files
with
303 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
name: tfidf | ||
namespace: "transform" | ||
description: | | ||
Perform TF-IDF normalization of the data (typically, ATAC). | ||
TF-IDF stands for "term frequency - inverse document frequency". It is a technique from natural language processing analysis. | ||
In the context of ATAC data, "terms" are the features (genes) and "documents" are the observations (cells). | ||
TF-IDF normalization is applied to single-cell ATAC-seq data to highlight the importance of specific genomic regions (typically peaks) | ||
across different cells while down-weighting regions that are commonly accessible across many cells. | ||
authors: | ||
- __merge__: /src/authors/vladimir_shitov.yaml | ||
roles: [ maintainer ] | ||
arguments: | ||
# input | ||
- name: "--input" | ||
alternatives: ["-i"] | ||
type: file | ||
description: Input h5mu file | ||
direction: input | ||
required: true | ||
example: input.h5mu | ||
|
||
- name: "--modality" | ||
type: string | ||
default: "atac" | ||
required: false | ||
|
||
- name: "--input_layer" | ||
type: string | ||
required: false | ||
description: "Input layer to use. By default, X is normalized" | ||
|
||
# output | ||
- name: "--output" | ||
alternatives: ["-o"] | ||
type: file | ||
description: Output h5mu file. | ||
direction: output | ||
required: true | ||
|
||
- name: "--output_compression" | ||
type: string | ||
description: The compression format to be used on the output h5mu object. | ||
choices: ["gzip", "lzf"] | ||
required: false | ||
example: "gzip" | ||
|
||
- name: "--output_layer" | ||
type: string | ||
description: Output layer to use. | ||
default: "tfidf" | ||
required: false | ||
|
||
# arguments | ||
- name: "--scale_factor" | ||
type: integer | ||
description: Scale factor to multiply the TF-IDF matrix by. | ||
default: 10000 | ||
min: 1 | ||
|
||
- name: "--log_idf" | ||
description: Whether to log-transform IDF term. | ||
type: boolean | ||
default: true | ||
|
||
- name: "--log_tf" | ||
description: Whether to log-transform TF term. | ||
type: boolean | ||
default: true | ||
|
||
- name: "--log_tfidf" | ||
description: Whether to log-transform TF*IDF term (False by default). Can only be used when log_tf and log_idf are False. | ||
type: boolean | ||
default: false | ||
|
||
resources: | ||
- type: python_script | ||
path: script.py | ||
- path: /src/utils/setup_logger.py | ||
test_resources: | ||
- type: python_script | ||
path: test.py | ||
- path: /resources_test/cellranger_atac_tiny_bcl/counts/ | ||
engines: | ||
- type: docker | ||
image: python:3.10-slim-bullseye | ||
setup: | ||
- type: apt | ||
packages: | ||
- libhdf5-dev | ||
- procps | ||
- pkg-config # Otherwise h5py installation fails, which is required for scanpy | ||
- gcc | ||
- type: python | ||
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] | ||
packages: | ||
- muon~=0.1.5 | ||
test_setup: | ||
- type: python | ||
__merge__: [ /src/base/requirements/viashpy.yaml, .] | ||
runners: | ||
- type: executable | ||
- type: nextflow | ||
directives: | ||
label: [midmem, lowcpu] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import sys | ||
import mudata | ||
import muon | ||
|
||
## VIASH START | ||
par = { | ||
"input": "work/d9/3adbd080e0de618d44b59b1ec81685/run.output.h5mu", | ||
"output": "output.h5mu", | ||
"scale_factor": 10000, | ||
"modality": "atac", | ||
"input_layer": None, | ||
"output_layer": None, | ||
"output_compression": "gzip", | ||
"log_idf": True, | ||
"log_tf": True, | ||
"log_tfidf": False | ||
} | ||
meta = {"name": "tfidf"} | ||
## VIASH END | ||
|
||
sys.path.append(meta["resources_dir"]) | ||
# START TEMPORARY WORKAROUND setup_logger | ||
# reason: resources aren't available when using Nextflow fusion | ||
# from setup_logger import setup_logger | ||
def setup_logger(): | ||
import logging | ||
from sys import stdout | ||
|
||
logger = logging.getLogger() | ||
logger.setLevel(logging.INFO) | ||
console_handler = logging.StreamHandler(stdout) | ||
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") | ||
console_handler.setFormatter(logFormatter) | ||
logger.addHandler(console_handler) | ||
|
||
return logger | ||
# END TEMPORARY WORKAROUND setup_logger | ||
logger = setup_logger() | ||
|
||
logger.info("Reading input mudata") | ||
mdata = mudata.read_h5mu(par["input"]) | ||
|
||
logger.info(par) | ||
|
||
mod = par["modality"] | ||
logger.info("Performing TF-IDF normalization on modality %s", mod) | ||
adata = mdata.mod[mod].copy() | ||
|
||
muon.atac.pp.tfidf( | ||
adata, | ||
log_tf=par["log_tf"], | ||
log_idf=par["log_idf"], | ||
log_tfidf=par["log_tfidf"], | ||
scale_factor=par["scale_factor"], | ||
inplace=True, | ||
copy=False, | ||
from_layer=par["input_layer"], | ||
to_layer=par["output_layer"], | ||
) | ||
|
||
mdata.mod[mod].layers[par["output_layer"]] = adata.layers[par["output_layer"]] | ||
|
||
logger.info("Writing to file") | ||
mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
from pathlib import Path | ||
import pytest | ||
import sys | ||
|
||
import mudata as md | ||
import numpy as np | ||
import scanpy as sc | ||
import muon as mu | ||
|
||
## VIASH START | ||
meta = { | ||
'executable': './target/docker/transform/tfidf/tfidf', | ||
'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/counts/", | ||
'config': './src/transform/tfidf/config.vsh.yaml', | ||
'cpus': 2 | ||
} | ||
## VIASH END | ||
|
||
@pytest.fixture | ||
def synthetic_example(): | ||
atac = sc.AnnData(np.array([ | ||
[0, 0, 0], | ||
[1, 0, 1], | ||
[10, 0, 0], | ||
[100, 0, 1], | ||
[1000, 0, 0] | ||
])) | ||
atac.obs_names = ["A", "B", "C", "D", "E"] | ||
atac.var_names = ["x", "y", "z"] | ||
|
||
return md.MuData({"atac": atac}) | ||
|
||
@pytest.fixture | ||
def example_mudata(tmp_path, synthetic_example): | ||
mdata_path = tmp_path / "example.h5mu" | ||
synthetic_example.write(mdata_path) | ||
|
||
return mdata_path | ||
|
||
@pytest.fixture | ||
def example_mudata_with_layer(tmp_path, synthetic_example): | ||
synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod["atac"].X.copy() | ||
synthetic_example.mod["atac"].X = np.random.normal(size=synthetic_example.mod["atac"].X.shape) | ||
mdata_path = tmp_path / "example.h5mu" | ||
synthetic_example.write(mdata_path) | ||
|
||
return mdata_path | ||
|
||
@pytest.fixture | ||
def neurips_mudata(tmp_path): | ||
"""From the `NeurIPS Multimodal Single-Cell Integration Challenge | ||
<https://www.kaggle.com/competitions/open-problems-multimodal/data>` | ||
Link is taken from the Moscot repository: | ||
https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67 | ||
""" | ||
adata = sc.read("../data/neurips_data.h5ad", backup_url="https://figshare.com/ndownloader/files/37993503") | ||
|
||
mdata = md.MuData({"atac": adata}) | ||
mdata_path = tmp_path / "neurips.h5mu" | ||
mdata.write(mdata_path) | ||
|
||
return mdata_path | ||
|
||
@pytest.fixture | ||
def tiny_atac_mudata(tmp_path): | ||
resources_dir = Path(meta["resources_dir"]) | ||
|
||
mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5") | ||
mdata_path = tmp_path / "tiny_atac.h5mu" | ||
mdata.write(mdata_path) | ||
|
||
return mdata_path | ||
|
||
@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"]) | ||
def test_output_layer(run_component, request, mudata, tmp_path): | ||
input_path = request.getfixturevalue(mudata) | ||
output_path = tmp_path / "foo.h5mu" | ||
|
||
args = [ | ||
"--input", str(input_path), | ||
"--output", str(output_path), | ||
"--modality", "atac", | ||
] | ||
|
||
run_component(args) | ||
assert output_path.is_file() | ||
output_mdata = md.read(output_path) | ||
|
||
assert "tfidf" in output_mdata.mod["atac"].layers.keys() | ||
|
||
@pytest.mark.parametrize("mudata", ["example_mudata"]) | ||
def test_calculations_correctness(request, run_component, mudata, tmp_path): | ||
input_path = request.getfixturevalue(mudata) | ||
output_path = tmp_path / "foo.h5mu" | ||
|
||
args = [ | ||
"--input", str(input_path), | ||
"--output", str(output_path), | ||
"--modality", "atac", | ||
] | ||
|
||
run_component(args + ["--scale_factor", "10000", "--output_layer", "tfidf_10000"]) | ||
assert output_path.is_file() | ||
output_mdata = md.read(output_path) | ||
|
||
assert np.allclose( | ||
output_mdata.mod["atac"].layers["tfidf_10000"].toarray(), | ||
np.array([[ np.nan, np.nan, np.nan], | ||
[0.0382461 , 0. , 10.67027475], | ||
[0.04135813, 0. , 0. ], | ||
[0.04131346, 0. , 5.7693107 ], | ||
[0.04135813, 0. , 0. ]]), | ||
equal_nan=True | ||
) | ||
|
||
run_component(args + ["--scale_factor", "100", "--output_layer", "tfidf_100"]) | ||
output_mdata = md.read(output_path) | ||
|
||
assert np.allclose( | ||
output_mdata.mod["atac"].layers["tfidf_100"].toarray(), | ||
np.array([[ np.nan, np.nan, np.nan], | ||
[0.01765529, 0. , 4.92564555], | ||
[0.02072352, 0. , 0. ], | ||
[0.02067929, 0. , 0.86213192], | ||
[0.02072352, 0. , 0. ]]), | ||
equal_nan=True | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
sys.exit(pytest.main([__file__])) |