Skip to content

Commit

Permalink
Add TF-IDF normalization (#870)
Browse files Browse the repository at this point in the history
Co-authored-by: DriesSchaumont <[email protected]>
  • Loading branch information
VladimirShitov and DriesSchaumont authored Sep 26, 2024
1 parent 54fa8e5 commit 55fa49b
Show file tree
Hide file tree
Showing 4 changed files with 303 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@

* `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863).

* Added `transform/tfidf` component: normalize ATAC data with TF-IDF (PR #870).

* Added `dimred/lsi` component (PR #552).

* `metadata/copy_obs` component: Added a component to copy an .obs column from a MuData object to another (PR #874).
Expand Down
105 changes: 105 additions & 0 deletions src/transform/tfidf/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
name: tfidf
namespace: "transform"
description: |
Perform TF-IDF normalization of the data (typically, ATAC).
TF-IDF stands for "term frequency - inverse document frequency". It is a technique from natural language processing analysis.
In the context of ATAC data, "terms" are the features (genes) and "documents" are the observations (cells).
TF-IDF normalization is applied to single-cell ATAC-seq data to highlight the importance of specific genomic regions (typically peaks)
across different cells while down-weighting regions that are commonly accessible across many cells.
authors:
- __merge__: /src/authors/vladimir_shitov.yaml
roles: [ maintainer ]
arguments:
# input
- name: "--input"
alternatives: ["-i"]
type: file
description: Input h5mu file
direction: input
required: true
example: input.h5mu

- name: "--modality"
type: string
default: "atac"
required: false

- name: "--input_layer"
type: string
required: false
description: "Input layer to use. By default, X is normalized"

# output
- name: "--output"
alternatives: ["-o"]
type: file
description: Output h5mu file.
direction: output
required: true

- name: "--output_compression"
type: string
description: The compression format to be used on the output h5mu object.
choices: ["gzip", "lzf"]
required: false
example: "gzip"

- name: "--output_layer"
type: string
description: Output layer to use.
default: "tfidf"
required: false

# arguments
- name: "--scale_factor"
type: integer
description: Scale factor to multiply the TF-IDF matrix by.
default: 10000
min: 1

- name: "--log_idf"
description: Whether to log-transform IDF term.
type: boolean
default: true

- name: "--log_tf"
description: Whether to log-transform TF term.
type: boolean
default: true

- name: "--log_tfidf"
description: Whether to log-transform TF*IDF term (False by default). Can only be used when log_tf and log_idf are False.
type: boolean
default: false

resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/cellranger_atac_tiny_bcl/counts/
engines:
- type: docker
image: python:3.10-slim-bullseye
setup:
- type: apt
packages:
- libhdf5-dev
- procps
- pkg-config # Otherwise h5py installation fails, which is required for scanpy
- gcc
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
packages:
- muon~=0.1.5
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [midmem, lowcpu]
64 changes: 64 additions & 0 deletions src/transform/tfidf/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import sys
import mudata
import muon

## VIASH START
par = {
"input": "work/d9/3adbd080e0de618d44b59b1ec81685/run.output.h5mu",
"output": "output.h5mu",
"scale_factor": 10000,
"modality": "atac",
"input_layer": None,
"output_layer": None,
"output_compression": "gzip",
"log_idf": True,
"log_tf": True,
"log_tfidf": False
}
meta = {"name": "tfidf"}
## VIASH END

sys.path.append(meta["resources_dir"])
# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
import logging
from sys import stdout

logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)

return logger
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()

logger.info("Reading input mudata")
mdata = mudata.read_h5mu(par["input"])

logger.info(par)

mod = par["modality"]
logger.info("Performing TF-IDF normalization on modality %s", mod)
adata = mdata.mod[mod].copy()

muon.atac.pp.tfidf(
adata,
log_tf=par["log_tf"],
log_idf=par["log_idf"],
log_tfidf=par["log_tfidf"],
scale_factor=par["scale_factor"],
inplace=True,
copy=False,
from_layer=par["input_layer"],
to_layer=par["output_layer"],
)

mdata.mod[mod].layers[par["output_layer"]] = adata.layers[par["output_layer"]]

logger.info("Writing to file")
mdata.write_h5mu(filename=par["output"], compression=par["output_compression"])
132 changes: 132 additions & 0 deletions src/transform/tfidf/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from pathlib import Path
import pytest
import sys

import mudata as md
import numpy as np
import scanpy as sc
import muon as mu

## VIASH START
meta = {
'executable': './target/docker/transform/tfidf/tfidf',
'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/counts/",
'config': './src/transform/tfidf/config.vsh.yaml',
'cpus': 2
}
## VIASH END

@pytest.fixture
def synthetic_example():
atac = sc.AnnData(np.array([
[0, 0, 0],
[1, 0, 1],
[10, 0, 0],
[100, 0, 1],
[1000, 0, 0]
]))
atac.obs_names = ["A", "B", "C", "D", "E"]
atac.var_names = ["x", "y", "z"]

return md.MuData({"atac": atac})

@pytest.fixture
def example_mudata(tmp_path, synthetic_example):
mdata_path = tmp_path / "example.h5mu"
synthetic_example.write(mdata_path)

return mdata_path

@pytest.fixture
def example_mudata_with_layer(tmp_path, synthetic_example):
synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod["atac"].X.copy()
synthetic_example.mod["atac"].X = np.random.normal(size=synthetic_example.mod["atac"].X.shape)
mdata_path = tmp_path / "example.h5mu"
synthetic_example.write(mdata_path)

return mdata_path

@pytest.fixture
def neurips_mudata(tmp_path):
"""From the `NeurIPS Multimodal Single-Cell Integration Challenge
<https://www.kaggle.com/competitions/open-problems-multimodal/data>`
Link is taken from the Moscot repository:
https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67
"""
adata = sc.read("../data/neurips_data.h5ad", backup_url="https://figshare.com/ndownloader/files/37993503")

mdata = md.MuData({"atac": adata})
mdata_path = tmp_path / "neurips.h5mu"
mdata.write(mdata_path)

return mdata_path

@pytest.fixture
def tiny_atac_mudata(tmp_path):
resources_dir = Path(meta["resources_dir"])

mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5")
mdata_path = tmp_path / "tiny_atac.h5mu"
mdata.write(mdata_path)

return mdata_path

@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"])
def test_output_layer(run_component, request, mudata, tmp_path):
input_path = request.getfixturevalue(mudata)
output_path = tmp_path / "foo.h5mu"

args = [
"--input", str(input_path),
"--output", str(output_path),
"--modality", "atac",
]

run_component(args)
assert output_path.is_file()
output_mdata = md.read(output_path)

assert "tfidf" in output_mdata.mod["atac"].layers.keys()

@pytest.mark.parametrize("mudata", ["example_mudata"])
def test_calculations_correctness(request, run_component, mudata, tmp_path):
input_path = request.getfixturevalue(mudata)
output_path = tmp_path / "foo.h5mu"

args = [
"--input", str(input_path),
"--output", str(output_path),
"--modality", "atac",
]

run_component(args + ["--scale_factor", "10000", "--output_layer", "tfidf_10000"])
assert output_path.is_file()
output_mdata = md.read(output_path)

assert np.allclose(
output_mdata.mod["atac"].layers["tfidf_10000"].toarray(),
np.array([[ np.nan, np.nan, np.nan],
[0.0382461 , 0. , 10.67027475],
[0.04135813, 0. , 0. ],
[0.04131346, 0. , 5.7693107 ],
[0.04135813, 0. , 0. ]]),
equal_nan=True
)

run_component(args + ["--scale_factor", "100", "--output_layer", "tfidf_100"])
output_mdata = md.read(output_path)

assert np.allclose(
output_mdata.mod["atac"].layers["tfidf_100"].toarray(),
np.array([[ np.nan, np.nan, np.nan],
[0.01765529, 0. , 4.92564555],
[0.02072352, 0. , 0. ],
[0.02067929, 0. , 0.86213192],
[0.02072352, 0. , 0. ]]),
equal_nan=True
)


if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

0 comments on commit 55fa49b

Please sign in to comment.