Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add TF-IDF normalization #870

Merged
merged 16 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@

* `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863).

* Added `transform/tfidf` component: normalize ATAC data with TF-IDF (PR #870).

* Added `dimred/lsi` component (PR #552).

* `metadata/copy_obs` component: Added a component to copy an .obs column from a MuData object to another (PR #874).
Expand Down
105 changes: 105 additions & 0 deletions src/transform/tfidf/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
name: tfidf
namespace: "transform"
description: |
Perform TF-IDF normalization of the data (typically, ATAC).

TF-IDF stands for "term frequency - inverse document frequency". It is a technique from natural language processing analysis.
In the context of ATAC data, "terms" are the features (genes) and "documents" are the observations (cells).
TF-IDF normalization is applied to single-cell ATAC-seq data to highlight the importance of specific genomic regions (typically peaks)
across different cells while down-weighting regions that are commonly accessible across many cells.
authors:
- __merge__: /src/authors/vladimir_shitov.yaml
roles: [ maintainer ]
arguments:
# input
- name: "--input"
alternatives: ["-i"]
type: file
description: Input h5mu file
direction: input
required: true
example: input.h5mu

- name: "--modality"
type: string
default: "atac"
required: false

- name: "--input_layer"
type: string
required: false
description: "Input layer to use. By default, X is normalized"

# output
- name: "--output"
alternatives: ["-o"]
type: file
description: Output h5mu file.
direction: output
required: true

- name: "--output_compression"
type: string
description: The compression format to be used on the output h5mu object.
choices: ["gzip", "lzf"]
required: false
example: "gzip"

- name: "--output_layer"
type: string
description: Output layer to use.
default: "tfidf"
required: false

# arguments
- name: "--scale_factor"
type: integer
description: Scale factor to multiply the TF-IDF matrix by.
default: 10000
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a min here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, numbers less than 0 make little sense. Anything above can theoretically be used

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I set it to 1. Likely, less then this will not be used

min: 1

- name: "--log_idf"
description: Whether to log-transform IDF term.
type: boolean
default: true

- name: "--log_tf"
description: Whether to log-transform TF term.
type: boolean
default: true

- name: "--log_tfidf"
description: Whether to log-transform TF*IDF term (False by default). Can only be used when log_tf and log_idf are False.
DriesSchaumont marked this conversation as resolved.
Show resolved Hide resolved
type: boolean
default: false

resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/cellranger_atac_tiny_bcl/counts/
engines:
- type: docker
image: python:3.10-slim-bullseye
setup:
- type: apt
packages:
- libhdf5-dev
- procps
- pkg-config # Otherwise h5py installation fails, which is required for scanpy
- gcc
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
packages:
- muon~=0.1.5
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [midmem, lowcpu]
64 changes: 64 additions & 0 deletions src/transform/tfidf/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import sys
import mudata
import muon

## VIASH START
par = {
"input": "work/d9/3adbd080e0de618d44b59b1ec81685/run.output.h5mu",
"output": "output.h5mu",
"scale_factor": 10000,
"modality": "atac",
"input_layer": None,
"output_layer": None,
"output_compression": "gzip",
"log_idf": True,
"log_tf": True,
"log_tfidf": False
}
meta = {"name": "tfidf"}
## VIASH END

sys.path.append(meta["resources_dir"])
# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
import logging
from sys import stdout

logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)

return logger
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()

logger.info("Reading input mudata")
mdata = mudata.read_h5mu(par["input"])

logger.info(par)

mod = par["modality"]
logger.info("Performing TF-IDF normalization on modality %s", mod)
adata = mdata.mod[mod].copy()

muon.atac.pp.tfidf(
DriesSchaumont marked this conversation as resolved.
Show resolved Hide resolved
adata,
log_tf=par["log_tf"],
log_idf=par["log_idf"],
log_tfidf=par["log_tfidf"],
scale_factor=par["scale_factor"],
inplace=True,
copy=False,
from_layer=par["input_layer"],
to_layer=par["output_layer"],
)

mdata.mod[mod].layers[par["output_layer"]] = adata.layers[par["output_layer"]]

logger.info("Writing to file")
mdata.write_h5mu(filename=par["output"], compression=par["output_compression"])
132 changes: 132 additions & 0 deletions src/transform/tfidf/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from pathlib import Path
import pytest
import sys

import mudata as md
import numpy as np
import scanpy as sc
import muon as mu

## VIASH START
meta = {
'executable': './target/docker/transform/tfidf/tfidf',
'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/counts/",
'config': './src/transform/tfidf/config.vsh.yaml',
'cpus': 2
}
## VIASH END

@pytest.fixture
def synthetic_example():
atac = sc.AnnData(np.array([
[0, 0, 0],
[1, 0, 1],
[10, 0, 0],
[100, 0, 1],
[1000, 0, 0]
]))
atac.obs_names = ["A", "B", "C", "D", "E"]
atac.var_names = ["x", "y", "z"]

return md.MuData({"atac": atac})

@pytest.fixture
def example_mudata(tmp_path, synthetic_example):
mdata_path = tmp_path / "example.h5mu"
synthetic_example.write(mdata_path)

return mdata_path

@pytest.fixture
def example_mudata_with_layer(tmp_path, synthetic_example):
synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod["atac"].X.copy()
synthetic_example.mod["atac"].X = np.random.normal(size=synthetic_example.mod["atac"].X.shape)
mdata_path = tmp_path / "example.h5mu"
synthetic_example.write(mdata_path)

return mdata_path

@pytest.fixture
def neurips_mudata(tmp_path):
"""From the `NeurIPS Multimodal Single-Cell Integration Challenge
<https://www.kaggle.com/competitions/open-problems-multimodal/data>`

Link is taken from the Moscot repository:
https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67
"""
adata = sc.read("../data/neurips_data.h5ad", backup_url="https://figshare.com/ndownloader/files/37993503")

mdata = md.MuData({"atac": adata})
mdata_path = tmp_path / "neurips.h5mu"
mdata.write(mdata_path)

return mdata_path

@pytest.fixture
def tiny_atac_mudata(tmp_path):
resources_dir = Path(meta["resources_dir"])

mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5")
mdata_path = tmp_path / "tiny_atac.h5mu"
mdata.write(mdata_path)

return mdata_path

@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"])
def test_output_layer(run_component, request, mudata, tmp_path):
input_path = request.getfixturevalue(mudata)
output_path = tmp_path / "foo.h5mu"

args = [
"--input", str(input_path),
"--output", str(output_path),
"--modality", "atac",
]

run_component(args)
assert output_path.is_file()
output_mdata = md.read(output_path)

assert "tfidf" in output_mdata.mod["atac"].layers.keys()

@pytest.mark.parametrize("mudata", ["example_mudata"])
def test_calculations_correctness(request, run_component, mudata, tmp_path):
input_path = request.getfixturevalue(mudata)
output_path = tmp_path / "foo.h5mu"

args = [
"--input", str(input_path),
"--output", str(output_path),
"--modality", "atac",
]

run_component(args + ["--scale_factor", "10000", "--output_layer", "tfidf_10000"])
assert output_path.is_file()
output_mdata = md.read(output_path)

assert np.allclose(
output_mdata.mod["atac"].layers["tfidf_10000"].toarray(),
np.array([[ np.nan, np.nan, np.nan],
[0.0382461 , 0. , 10.67027475],
[0.04135813, 0. , 0. ],
[0.04131346, 0. , 5.7693107 ],
[0.04135813, 0. , 0. ]]),
equal_nan=True
)

run_component(args + ["--scale_factor", "100", "--output_layer", "tfidf_100"])
output_mdata = md.read(output_path)

assert np.allclose(
output_mdata.mod["atac"].layers["tfidf_100"].toarray(),
np.array([[ np.nan, np.nan, np.nan],
[0.01765529, 0. , 4.92564555],
[0.02072352, 0. , 0. ],
[0.02067929, 0. , 0.86213192],
[0.02072352, 0. , 0. ]]),
equal_nan=True
)


if __name__ == "__main__":
sys.exit(pytest.main([__file__]))