diff --git a/CHANGELOG.md b/CHANGELOG.md index c3b64b6a..fc31c37b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,7 @@ # calour changelog +## Version 2024.9.29 +New features: +* Add filter_zeros parameter to analysis functions (correlation, diff_abundance) to enable working with data containing negative values (such as log-ratios) ## Version 2024.8.31 New features: diff --git a/calour/__init__.py b/calour/__init__.py index b74ce940..862122dd 100644 --- a/calour/__init__.py +++ b/calour/__init__.py @@ -19,7 +19,7 @@ __credits__ = "https://github.com/biocore/calour/graphs/contributors" -__version__ = "2024.8.31" +__version__ = "2024.9.29" __all__ = ['read', 'read_amplicon', 'read_ms', 'read_qiime2', 'Experiment', 'AmpliconExperiment', 'MS1Experiment','mRNAExperiment', diff --git a/calour/analysis.py b/calour/analysis.py index 3d594bad..35189ce4 100644 --- a/calour/analysis.py +++ b/calour/analysis.py @@ -46,7 +46,7 @@ @format_docstring(_CALOUR_PVAL, _CALOUR_QVAL, _CALOUR_STAT, _CALOUR_DIRECTION) def correlation(exp: Experiment, field, method='spearman', nonzero=False, transform=None, - numperm=1000, alpha=0.1, fdr_method='dsfdr', shuffler=None,random_seed=None) -> Experiment: + numperm=1000, alpha=0.1, fdr_method='dsfdr', filter_zeros=True, shuffler=None,random_seed=None) -> Experiment: '''Find features with correlation to a numeric metadata field. The permutation based p-values and multiple hypothesis correction is implemented. @@ -84,6 +84,9 @@ def correlation(exp: Experiment, field, method='spearman', nonzero=False, transf * 'bhfdr': Benjamini-Hochberg FDR method * 'byfdr' : Benjamini-Yekutielli FDR method * 'filterBH' : Benjamini-Hochberg FDR method with filtering + filter_zeros : bool, optional + True to remove features with 0 abundance in all samples. + Should be set to False when working with data that contains negative values (such as log-ratio data) shuffler: function or None, optional if None, use shuffling on all samples (using the random_seed supplied) if function, use the supplied function to shuffle to labels for random iteration. Can be used for paired shuffling, etc. @@ -111,7 +114,11 @@ def correlation(exp: Experiment, field, method='spearman', nonzero=False, transf if field not in exp.sample_metadata.columns: raise ValueError('Field %s not in sample_metadata. Possible fields are: %s' % (field, exp.sample_metadata.columns)) - cexp = exp.filter_sum_abundance(0, strict=True) + # remove features not present in both groups + if filter_zeros: + if cexp.data.min() < 0: + logger.warning('filter_zeros=True is not recommended for data with negative values. Recommend using filter_zeros=False') + cexp = cexp.filter_sum_abundance(0, strict=True) data = cexp.get_data(copy=True, sparse=False).transpose() @@ -145,7 +152,7 @@ def correlation(exp: Experiment, field, method='spearman', nonzero=False, transf @format_docstring(_CALOUR_PVAL, _CALOUR_QVAL, _CALOUR_STAT, _CALOUR_DIRECTION) -def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', transform='rankdata', numperm=1000, alpha=0.1, fdr_method='dsfdr', shuffler=None, random_seed=None) -> Experiment: +def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', transform='rankdata', numperm=1000, alpha=0.1, fdr_method='dsfdr', filter_zeros=True, shuffler=None, random_seed=None) -> Experiment: '''Differential abundance test between 2 groups of samples for all the features. It uses permutation based nonparametric test and then applies @@ -192,6 +199,9 @@ def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', t alpha (e.g. a feature that appears in only 1 sample can obtain a minimal p-value of 0.5 and will therefore be removed when say alpha=0.1) + filter_zeros : bool, optional + True to remove features with 0 abundance in all samples. + Should be set to False when working with data that contains negative values (such as log-ratio data) shuffler: function or None, optional if None, use shuffling on all samples (using the random_seed supplied) if function, use thi supplied function to shuffle to labels for random iteration. Can be used for paired shuffling, etc. @@ -234,7 +244,10 @@ def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', t grp2 = 'NOT %s' % grp1 # remove features not present in both groups - cexp = cexp.filter_sum_abundance(0, strict=True) + if filter_zeros: + if cexp.data.min() < 0: + logger.warning('filter_zeros=True is not recommended for data with negative values. Recommend using filter_zeros=False') + cexp = cexp.filter_sum_abundance(0, strict=True) data = cexp.get_data(copy=True, sparse=False).transpose() # prepare the labels. @@ -257,7 +270,7 @@ def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', t return newexp -def diff_abundance_kw(exp: Experiment, field, transform='rankdata', numperm=1000, alpha=0.1, fdr_method='dsfdr', random_seed=None) -> Experiment: +def diff_abundance_kw(exp: Experiment, field, transform='rankdata', numperm=1000, alpha=0.1, fdr_method='dsfdr', filter_zeros=True, random_seed=None) -> Experiment: '''Test the differential abundance between multiple sample groups using the Kruskal Wallis test. It uses permutation based nonparametric test and then applies @@ -276,10 +289,20 @@ def diff_abundance_kw(exp: Experiment, field, transform='rankdata', numperm=1000 * 'normdata' : normalize the data to constant sum per samples * 'binarydata' : convert to binary absence/presence + numperm : int + number of permutations to perform alpha : float the desired FDR control level - numperm : int - number of permutations to perform + fdr_method : str + method to compute FDR. Allowed methods include: + + * 'dsfdr': discrete FDR + * 'bhfdr': Benjamini-Hochberg FDR method + * 'byfdr' : Benjamini-Yekutielli FDR method + * 'filterBH' : Benjamini-Hochberg FDR method with filtering + filter_zeros : bool, optional + True to remove features with 0 abundance in all samples. + Should be set to False when working with data that contains negative values (such as log-ratio data) random_seed : int, np.radnom.Generator instance or None, optional, default=None set the random number generator seed for the random permutations If int, random_seed is the seed used by the random number generator; @@ -301,8 +324,11 @@ def diff_abundance_kw(exp: Experiment, field, transform='rankdata', numperm=1000 logger.debug('diff_abundance_kw for field %s' % field) - # remove features with 0 abundance - cexp = exp.filter_sum_abundance(0, strict=True) + # remove features not present in both groups + if filter_zeros: + if cexp.data.min() < 0: + logger.warning('filter_zeros=True is not recommended for data with negative values. Recommend using filter_zeros=False') + cexp = cexp.filter_sum_abundance(0, strict=True) data = cexp.get_data(copy=True, sparse=False).transpose() # prepare the labels. If correlation method, get the values, otherwise the group @@ -342,6 +368,7 @@ def diff_abundance_paired(exp: Experiment, pair_field, field, val1, val2=None, t Similar to diff_abundance transform parameter. Additional options are: 'pair_rank': for each group of samples (a single value in pair_field), samples are ranked within the group. + Keyword Arguments ----------------- %(analysis.diff_abundance.parameters)s