openpipelines-bio · DriesSchaumont · Sep 26, 2024 · Aug 28, 2024 · Aug 28, 2024 · Aug 30, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -117,6 +117,8 @@
 
 * `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863).
 
+* Added `transform/tfidf` component: normalize ATAC data with TF-IDF (PR #870).
+
 * Added `dimred/lsi` component (PR #552).
 
 * `metadata/copy_obs` component: Added a component to copy an .obs column from a MuData object to another (PR #874).

diff --git a/src/transform/tfidf/config.vsh.yaml b/src/transform/tfidf/config.vsh.yaml
@@ -0,0 +1,105 @@
+name: tfidf
+namespace: "transform"
+description: |
+  Perform TF-IDF normalization of the data (typically, ATAC).
+
+  TF-IDF stands for "term frequency - inverse document frequency". It is a technique from natural language processing analysis.
+  In the context of ATAC data, "terms" are the features (genes) and "documents" are the observations (cells). 
+  TF-IDF normalization is applied to single-cell ATAC-seq data to highlight the importance of specific genomic regions (typically peaks)
+  across different cells while down-weighting regions that are commonly accessible across many cells. 
+authors:
+  - __merge__: /src/authors/vladimir_shitov.yaml
+    roles: [ maintainer ]
+arguments:
+  # input
+  - name: "--input"
+    alternatives: ["-i"]
+    type: file
+    description: Input h5mu file
+    direction: input
+    required: true
+    example: input.h5mu
+
+  - name: "--modality"
+    type: string
+    default: "atac"
+    required: false
+
+  - name: "--input_layer"
+    type: string
+    required: false
+    description: "Input layer to use. By default, X is normalized"
+
+  # output
+  - name: "--output"
+    alternatives: ["-o"]
+    type: file
+    description: Output h5mu file.
+    direction: output
+    required: true
+
+  - name: "--output_compression"
+    type: string
+    description: The compression format to be used on the output h5mu object.
+    choices: ["gzip", "lzf"]
+    required: false
+    example: "gzip"
+
+  - name: "--output_layer"
+    type: string
+    description: Output layer to use.
+    default: "tfidf"
+    required: false
+
+  # arguments
+  - name: "--scale_factor"
+    type: integer
+    description: Scale factor to multiply the TF-IDF matrix by.
+    default: 10000
+    min: 1
+
+  - name: "--log_idf"
+    description: Whether to log-transform IDF term.
+    type: boolean
+    default: true
+
+  - name: "--log_tf"
+    description: Whether to log-transform TF term.
+    type: boolean
+    default: true
+
+  - name: "--log_tfidf"
+    description: Whether to log-transform TF*IDF term (False by default). Can only be used when log_tf and log_idf are False.
+    type: boolean
+    default: false
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/cellranger_atac_tiny_bcl/counts/
+engines:
+  - type: docker
+    image: python:3.10-slim-bullseye
+    setup:
+      - type: apt
+        packages: 
+          - libhdf5-dev
+          - procps
+          - pkg-config  # Otherwise h5py installation fails, which is required for scanpy
+          - gcc
+      - type: python
+        __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
+        packages:
+          - muon~=0.1.5
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/viashpy.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midmem, lowcpu]
diff --git a/src/transform/tfidf/script.py b/src/transform/tfidf/script.py
@@ -0,0 +1,64 @@
+import sys
+import mudata
+import muon
+
+## VIASH START
+par = {
+    "input": "work/d9/3adbd080e0de618d44b59b1ec81685/run.output.h5mu",
+    "output": "output.h5mu",
+    "scale_factor": 10000,
+    "modality": "atac",
+    "input_layer": None,
+    "output_layer": None,
+    "output_compression": "gzip",
+    "log_idf": True,
+    "log_tf": True,
+    "log_tfidf": False
+}
+meta = {"name": "tfidf"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+# reason: resources aren't available when using Nextflow fusion
+# from setup_logger import setup_logger
+def setup_logger():
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+
+    return logger
+# END TEMPORARY WORKAROUND setup_logger
+logger = setup_logger()
+
+logger.info("Reading input mudata")
+mdata = mudata.read_h5mu(par["input"])
+
+logger.info(par)
+
+mod = par["modality"]
+logger.info("Performing TF-IDF normalization on modality %s", mod)
+adata = mdata.mod[mod].copy()
+
+muon.atac.pp.tfidf(
+    adata,
+    log_tf=par["log_tf"],
+    log_idf=par["log_idf"],
+    log_tfidf=par["log_tfidf"],
+    scale_factor=par["scale_factor"],
+    inplace=True,
+    copy=False,
+    from_layer=par["input_layer"],
+    to_layer=par["output_layer"],
+)
+
+mdata.mod[mod].layers[par["output_layer"]] = adata.layers[par["output_layer"]]
+
+logger.info("Writing to file")
+mdata.write_h5mu(filename=par["output"], compression=par["output_compression"])
diff --git a/src/transform/tfidf/test.py b/src/transform/tfidf/test.py
@@ -0,0 +1,132 @@
+from pathlib import Path
+import pytest
+import sys
+
+import mudata as md
+import numpy as np
+import scanpy as sc
+import muon as mu
+
+## VIASH START
+meta = {
+    'executable': './target/docker/transform/tfidf/tfidf',
+    'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/counts/",
+    'config': './src/transform/tfidf/config.vsh.yaml',
+    'cpus': 2
+}
+## VIASH END
+
+@pytest.fixture
+def synthetic_example():
+    atac = sc.AnnData(np.array([
+        [0, 0, 0],
+        [1, 0, 1],
+        [10, 0, 0],
+        [100, 0, 1],
+        [1000, 0, 0]
+    ]))
+    atac.obs_names = ["A", "B", "C", "D", "E"]
+    atac.var_names = ["x", "y", "z"]
+
+    return md.MuData({"atac": atac})
+
+@pytest.fixture
+def example_mudata(tmp_path, synthetic_example):
+    mdata_path = tmp_path / "example.h5mu"
+    synthetic_example.write(mdata_path)
+
+    return mdata_path
+
+@pytest.fixture
+def example_mudata_with_layer(tmp_path, synthetic_example):
+    synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod["atac"].X.copy()
+    synthetic_example.mod["atac"].X = np.random.normal(size=synthetic_example.mod["atac"].X.shape)
+    mdata_path = tmp_path / "example.h5mu"
+    synthetic_example.write(mdata_path)
+
+    return mdata_path
+
+@pytest.fixture
+def neurips_mudata(tmp_path):
+    """From the `NeurIPS Multimodal Single-Cell Integration Challenge
+    <https://www.kaggle.com/competitions/open-problems-multimodal/data>`
+
+    Link is taken from the Moscot repository: 
+    https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67
+    """
+    adata = sc.read("../data/neurips_data.h5ad", backup_url="https://figshare.com/ndownloader/files/37993503")
+
+    mdata = md.MuData({"atac": adata})
+    mdata_path = tmp_path / "neurips.h5mu"
+    mdata.write(mdata_path)
+
+    return mdata_path
+
+@pytest.fixture
+def tiny_atac_mudata(tmp_path):
+    resources_dir = Path(meta["resources_dir"])
+
+    mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5")
+    mdata_path = tmp_path / "tiny_atac.h5mu"
+    mdata.write(mdata_path)
+
+    return mdata_path
+
+@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"])
+def test_output_layer(run_component, request, mudata, tmp_path):
+    input_path = request.getfixturevalue(mudata)
+    output_path = tmp_path / "foo.h5mu"
+
+    args = [
+        "--input", str(input_path),
+        "--output", str(output_path),
+        "--modality", "atac",
+    ]
+
+    run_component(args)
+    assert output_path.is_file()
+    output_mdata = md.read(output_path)
+
+    assert "tfidf" in output_mdata.mod["atac"].layers.keys()
+
+@pytest.mark.parametrize("mudata", ["example_mudata"])
+def test_calculations_correctness(request, run_component, mudata, tmp_path):
+    input_path = request.getfixturevalue(mudata)
+    output_path = tmp_path / "foo.h5mu"
+
+    args = [
+        "--input", str(input_path),
+        "--output", str(output_path),
+        "--modality", "atac",
+    ]
+
+    run_component(args + ["--scale_factor", "10000", "--output_layer", "tfidf_10000"])
+    assert output_path.is_file()
+    output_mdata = md.read(output_path)
+
+    assert np.allclose(
+        output_mdata.mod["atac"].layers["tfidf_10000"].toarray(),
+        np.array([[    np.nan, np.nan,      np.nan],
+                  [0.0382461 , 0.    , 10.67027475],
+                  [0.04135813, 0.    , 0.         ],
+                  [0.04131346, 0.    , 5.7693107  ],
+                  [0.04135813, 0.    , 0.         ]]),
+        equal_nan=True
+        )
+
+    run_component(args + ["--scale_factor", "100", "--output_layer", "tfidf_100"])
+    output_mdata = md.read(output_path)
+
+    assert np.allclose(
+        output_mdata.mod["atac"].layers["tfidf_100"].toarray(),
+        np.array([[    np.nan, np.nan,     np.nan],
+                  [0.01765529, 0.    , 4.92564555],
+                  [0.02072352, 0.    , 0.        ],
+                  [0.02067929, 0.    , 0.86213192],
+                  [0.02072352, 0.    , 0.        ]]),
+        equal_nan=True
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))