From 55fa49b8b55436a89b05ea884eb3eb49c9ff41bb Mon Sep 17 00:00:00 2001 From: Vladimir Shitov <35199218+VladimirShitov@users.noreply.github.com> Date: Thu, 26 Sep 2024 09:01:15 +0200 Subject: [PATCH] Add TF-IDF normalization (#870) Co-authored-by: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com> --- CHANGELOG.md | 2 + src/transform/tfidf/config.vsh.yaml | 105 ++++++++++++++++++++++ src/transform/tfidf/script.py | 64 ++++++++++++++ src/transform/tfidf/test.py | 132 ++++++++++++++++++++++++++++ 4 files changed, 303 insertions(+) create mode 100644 src/transform/tfidf/config.vsh.yaml create mode 100644 src/transform/tfidf/script.py create mode 100644 src/transform/tfidf/test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e705baf80e..fb94e293add 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -117,6 +117,8 @@ * `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863). +* Added `transform/tfidf` component: normalize ATAC data with TF-IDF (PR #870). + * Added `dimred/lsi` component (PR #552). * `metadata/copy_obs` component: Added a component to copy an .obs column from a MuData object to another (PR #874). diff --git a/src/transform/tfidf/config.vsh.yaml b/src/transform/tfidf/config.vsh.yaml new file mode 100644 index 00000000000..e11fa6679b5 --- /dev/null +++ b/src/transform/tfidf/config.vsh.yaml @@ -0,0 +1,105 @@ +name: tfidf +namespace: "transform" +description: | + Perform TF-IDF normalization of the data (typically, ATAC). + + TF-IDF stands for "term frequency - inverse document frequency". It is a technique from natural language processing analysis. + In the context of ATAC data, "terms" are the features (genes) and "documents" are the observations (cells). + TF-IDF normalization is applied to single-cell ATAC-seq data to highlight the importance of specific genomic regions (typically peaks) + across different cells while down-weighting regions that are commonly accessible across many cells. +authors: + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ maintainer ] +arguments: + # input + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + type: string + default: "atac" + required: false + + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. By default, X is normalized" + + # output + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + + - name: "--output_compression" + type: string + description: The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" + + - name: "--output_layer" + type: string + description: Output layer to use. + default: "tfidf" + required: false + + # arguments + - name: "--scale_factor" + type: integer + description: Scale factor to multiply the TF-IDF matrix by. + default: 10000 + min: 1 + + - name: "--log_idf" + description: Whether to log-transform IDF term. + type: boolean + default: true + + - name: "--log_tf" + description: Whether to log-transform TF term. + type: boolean + default: true + + - name: "--log_tfidf" + description: Whether to log-transform TF*IDF term (False by default). Can only be used when log_tf and log_idf are False. + type: boolean + default: false + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_atac_tiny_bcl/counts/ +engines: + - type: docker + image: python:3.10-slim-bullseye + setup: + - type: apt + packages: + - libhdf5-dev + - procps + - pkg-config # Otherwise h5py installation fails, which is required for scanpy + - gcc + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - muon~=0.1.5 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [midmem, lowcpu] diff --git a/src/transform/tfidf/script.py b/src/transform/tfidf/script.py new file mode 100644 index 00000000000..509053a4f12 --- /dev/null +++ b/src/transform/tfidf/script.py @@ -0,0 +1,64 @@ +import sys +import mudata +import muon + +## VIASH START +par = { + "input": "work/d9/3adbd080e0de618d44b59b1ec81685/run.output.h5mu", + "output": "output.h5mu", + "scale_factor": 10000, + "modality": "atac", + "input_layer": None, + "output_layer": None, + "output_compression": "gzip", + "log_idf": True, + "log_tf": True, + "log_tfidf": False +} +meta = {"name": "tfidf"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +# START TEMPORARY WORKAROUND setup_logger +# reason: resources aren't available when using Nextflow fusion +# from setup_logger import setup_logger +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger +# END TEMPORARY WORKAROUND setup_logger +logger = setup_logger() + +logger.info("Reading input mudata") +mdata = mudata.read_h5mu(par["input"]) + +logger.info(par) + +mod = par["modality"] +logger.info("Performing TF-IDF normalization on modality %s", mod) +adata = mdata.mod[mod].copy() + +muon.atac.pp.tfidf( + adata, + log_tf=par["log_tf"], + log_idf=par["log_idf"], + log_tfidf=par["log_tfidf"], + scale_factor=par["scale_factor"], + inplace=True, + copy=False, + from_layer=par["input_layer"], + to_layer=par["output_layer"], +) + +mdata.mod[mod].layers[par["output_layer"]] = adata.layers[par["output_layer"]] + +logger.info("Writing to file") +mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) diff --git a/src/transform/tfidf/test.py b/src/transform/tfidf/test.py new file mode 100644 index 00000000000..843eb5e72f8 --- /dev/null +++ b/src/transform/tfidf/test.py @@ -0,0 +1,132 @@ +from pathlib import Path +import pytest +import sys + +import mudata as md +import numpy as np +import scanpy as sc +import muon as mu + +## VIASH START +meta = { + 'executable': './target/docker/transform/tfidf/tfidf', + 'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/counts/", + 'config': './src/transform/tfidf/config.vsh.yaml', + 'cpus': 2 +} +## VIASH END + +@pytest.fixture +def synthetic_example(): + atac = sc.AnnData(np.array([ + [0, 0, 0], + [1, 0, 1], + [10, 0, 0], + [100, 0, 1], + [1000, 0, 0] + ])) + atac.obs_names = ["A", "B", "C", "D", "E"] + atac.var_names = ["x", "y", "z"] + + return md.MuData({"atac": atac}) + +@pytest.fixture +def example_mudata(tmp_path, synthetic_example): + mdata_path = tmp_path / "example.h5mu" + synthetic_example.write(mdata_path) + + return mdata_path + +@pytest.fixture +def example_mudata_with_layer(tmp_path, synthetic_example): + synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod["atac"].X.copy() + synthetic_example.mod["atac"].X = np.random.normal(size=synthetic_example.mod["atac"].X.shape) + mdata_path = tmp_path / "example.h5mu" + synthetic_example.write(mdata_path) + + return mdata_path + +@pytest.fixture +def neurips_mudata(tmp_path): + """From the `NeurIPS Multimodal Single-Cell Integration Challenge + ` + + Link is taken from the Moscot repository: + https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67 + """ + adata = sc.read("../data/neurips_data.h5ad", backup_url="https://figshare.com/ndownloader/files/37993503") + + mdata = md.MuData({"atac": adata}) + mdata_path = tmp_path / "neurips.h5mu" + mdata.write(mdata_path) + + return mdata_path + +@pytest.fixture +def tiny_atac_mudata(tmp_path): + resources_dir = Path(meta["resources_dir"]) + + mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5") + mdata_path = tmp_path / "tiny_atac.h5mu" + mdata.write(mdata_path) + + return mdata_path + +@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"]) +def test_output_layer(run_component, request, mudata, tmp_path): + input_path = request.getfixturevalue(mudata) + output_path = tmp_path / "foo.h5mu" + + args = [ + "--input", str(input_path), + "--output", str(output_path), + "--modality", "atac", + ] + + run_component(args) + assert output_path.is_file() + output_mdata = md.read(output_path) + + assert "tfidf" in output_mdata.mod["atac"].layers.keys() + +@pytest.mark.parametrize("mudata", ["example_mudata"]) +def test_calculations_correctness(request, run_component, mudata, tmp_path): + input_path = request.getfixturevalue(mudata) + output_path = tmp_path / "foo.h5mu" + + args = [ + "--input", str(input_path), + "--output", str(output_path), + "--modality", "atac", + ] + + run_component(args + ["--scale_factor", "10000", "--output_layer", "tfidf_10000"]) + assert output_path.is_file() + output_mdata = md.read(output_path) + + assert np.allclose( + output_mdata.mod["atac"].layers["tfidf_10000"].toarray(), + np.array([[ np.nan, np.nan, np.nan], + [0.0382461 , 0. , 10.67027475], + [0.04135813, 0. , 0. ], + [0.04131346, 0. , 5.7693107 ], + [0.04135813, 0. , 0. ]]), + equal_nan=True + ) + + run_component(args + ["--scale_factor", "100", "--output_layer", "tfidf_100"]) + output_mdata = md.read(output_path) + + assert np.allclose( + output_mdata.mod["atac"].layers["tfidf_100"].toarray(), + np.array([[ np.nan, np.nan, np.nan], + [0.01765529, 0. , 4.92564555], + [0.02072352, 0. , 0. ], + [0.02067929, 0. , 0.86213192], + [0.02072352, 0. , 0. ]]), + equal_nan=True + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__]))