Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add filter_zeros parameter to analysis functions #310

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# calour changelog
## Version 2024.9.29
New features:
* Add filter_zeros parameter to analysis functions (correlation, diff_abundance) to enable working with data containing negative values (such as log-ratios)

## Version 2024.8.31
New features:
Expand Down
2 changes: 1 addition & 1 deletion calour/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


__credits__ = "https://github.com/biocore/calour/graphs/contributors"
__version__ = "2024.8.31"
__version__ = "2024.9.29"

__all__ = ['read', 'read_amplicon', 'read_ms', 'read_qiime2',
'Experiment', 'AmpliconExperiment', 'MS1Experiment','mRNAExperiment',
Expand Down
45 changes: 36 additions & 9 deletions calour/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@

@format_docstring(_CALOUR_PVAL, _CALOUR_QVAL, _CALOUR_STAT, _CALOUR_DIRECTION)
def correlation(exp: Experiment, field, method='spearman', nonzero=False, transform=None,
numperm=1000, alpha=0.1, fdr_method='dsfdr', shuffler=None,random_seed=None) -> Experiment:
numperm=1000, alpha=0.1, fdr_method='dsfdr', filter_zeros=True, shuffler=None,random_seed=None) -> Experiment:
'''Find features with correlation to a numeric metadata field.

The permutation based p-values and multiple hypothesis correction is implemented.
Expand Down Expand Up @@ -84,6 +84,9 @@ def correlation(exp: Experiment, field, method='spearman', nonzero=False, transf
* 'bhfdr': Benjamini-Hochberg FDR method
* 'byfdr' : Benjamini-Yekutieli FDR method
* 'filterBH' : Benjamini-Hochberg FDR method with filtering
filter_zeros : bool, optional
True to remove features with 0 abundance in all samples.
Should be set to False when working with data that contains negative values (such as log-ratio data)
shuffler: function or None, optional
if None, use shuffling on all samples (using the random_seed supplied)
if function, use the supplied function to shuffle the labels for random iteration. Can be used for paired shuffling, etc.
Expand Down Expand Up @@ -111,7 +114,11 @@ def correlation(exp: Experiment, field, method='spearman', nonzero=False, transf
if field not in exp.sample_metadata.columns:
raise ValueError('Field %s not in sample_metadata. Possible fields are: %s' % (field, exp.sample_metadata.columns))

cexp = exp.filter_sum_abundance(0, strict=True)
# remove features with 0 abundance in all samples
if filter_zeros:
if cexp.data.min() < 0:
logger.warning('filter_zeros=True is not recommended for data with negative values. Recommend using filter_zeros=False')
cexp = cexp.filter_sum_abundance(0, strict=True)

data = cexp.get_data(copy=True, sparse=False).transpose()

Expand Down Expand Up @@ -145,7 +152,7 @@ def correlation(exp: Experiment, field, method='spearman', nonzero=False, transf


@format_docstring(_CALOUR_PVAL, _CALOUR_QVAL, _CALOUR_STAT, _CALOUR_DIRECTION)
def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', transform='rankdata', numperm=1000, alpha=0.1, fdr_method='dsfdr', shuffler=None, random_seed=None) -> Experiment:
def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', transform='rankdata', numperm=1000, alpha=0.1, fdr_method='dsfdr', filter_zeros=True, shuffler=None, random_seed=None) -> Experiment:
'''Differential abundance test between 2 groups of samples for all the features.

It uses permutation based nonparametric test and then applies
Expand Down Expand Up @@ -192,6 +199,9 @@ def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', t
alpha (e.g. a feature that appears in only 1 sample can
obtain a minimal p-value of 0.5 and will therefore be
removed when say alpha=0.1)
filter_zeros : bool, optional
True to remove features with 0 abundance in all samples.
Should be set to False when working with data that contains negative values (such as log-ratio data)
shuffler: function or None, optional
if None, use shuffling on all samples (using the random_seed supplied)
if function, use the supplied function to shuffle the labels for random iteration. Can be used for paired shuffling, etc.
Expand Down Expand Up @@ -234,7 +244,10 @@ def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', t
grp2 = 'NOT %s' % grp1

# remove features not present in both groups
cexp = cexp.filter_sum_abundance(0, strict=True)
if filter_zeros:
if cexp.data.min() < 0:
logger.warning('filter_zeros=True is not recommended for data with negative values. Recommend using filter_zeros=False')
cexp = cexp.filter_sum_abundance(0, strict=True)

data = cexp.get_data(copy=True, sparse=False).transpose()
# prepare the labels.
Expand All @@ -257,7 +270,7 @@ def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', t
return newexp


def diff_abundance_kw(exp: Experiment, field, transform='rankdata', numperm=1000, alpha=0.1, fdr_method='dsfdr', random_seed=None) -> Experiment:
def diff_abundance_kw(exp: Experiment, field, transform='rankdata', numperm=1000, alpha=0.1, fdr_method='dsfdr', filter_zeros=True, random_seed=None) -> Experiment:
'''Test the differential abundance between multiple sample groups using the Kruskal Wallis test.

It uses permutation based nonparametric test and then applies
Expand All @@ -276,10 +289,20 @@ def diff_abundance_kw(exp: Experiment, field, transform='rankdata', numperm=1000
* 'normdata' : normalize the data to constant sum per samples
* 'binarydata' : convert to binary absence/presence

numperm : int
number of permutations to perform
alpha : float
the desired FDR control level
numperm : int
number of permutations to perform
fdr_method : str
method to compute FDR. Allowed methods include:

* 'dsfdr': discrete FDR
* 'bhfdr': Benjamini-Hochberg FDR method
* 'byfdr' : Benjamini-Yekutieli FDR method
* 'filterBH' : Benjamini-Hochberg FDR method with filtering
filter_zeros : bool, optional
True to remove features with 0 abundance in all samples.
Should be set to False when working with data that contains negative values (such as log-ratio data)
random_seed : int, np.random.Generator instance or None, optional, default=None
set the random number generator seed for the random permutations
If int, random_seed is the seed used by the random number generator;
Expand All @@ -301,8 +324,11 @@ def diff_abundance_kw(exp: Experiment, field, transform='rankdata', numperm=1000

logger.debug('diff_abundance_kw for field %s' % field)

# remove features with 0 abundance
cexp = exp.filter_sum_abundance(0, strict=True)
# remove features with 0 abundance in all samples
if filter_zeros:
if cexp.data.min() < 0:
logger.warning('filter_zeros=True is not recommended for data with negative values. Recommend using filter_zeros=False')
cexp = cexp.filter_sum_abundance(0, strict=True)

data = cexp.get_data(copy=True, sparse=False).transpose()
# prepare the labels. If correlation method, get the values, otherwise the group
Expand Down Expand Up @@ -342,6 +368,7 @@ def diff_abundance_paired(exp: Experiment, pair_field, field, val1, val2=None, t
Similar to diff_abundance transform parameter. Additional options are:
'pair_rank': for each group of samples (a single value in pair_field), samples are ranked within the group.


Keyword Arguments
-----------------
%(analysis.diff_abundance.parameters)s
Expand Down