diff --git a/HISTORY.md b/HISTORY.md index cdba9d32..bb68662f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ * ADD plot Sample Distribution Histogram * ADD paired-ttest option * ENH add option to remove samples in GUI +* ADD pyComBat Batch correction Behdenna A, Haziza J, Azencot CA and Nordor A. (2020) pyComBat, a Python tool for batch effects correction in high-throughput molecular data using empirical Bayes methods. bioRxiv doi: 10.1101/2020.03.17.995431 # 0.4.5 * FIX loading of Data on Windows diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/DataSet_Preprocess.py index 9d0c77b2..7e9d8c69 100644 --- a/alphastats/DataSet_Preprocess.py +++ b/alphastats/DataSet_Preprocess.py @@ -8,6 +8,7 @@ from alphastats.utils import ignore_warning from sklearn.experimental import enable_iterative_imputer import itertools +from combat.pycombat import pycombat class Preprocess: @@ -197,17 +198,27 @@ def _log2_transform(self): self.mat = np.log2(self.mat + 0.1) self.preprocessing_info.update({"Log2-transformed": True}) print("Data has been log2-transformed.") + + def batch_correction(self, batch:str): + """Correct for technical bias/batch effects + Behdenna A, Haziza J, Azencot CA and Nordor A. (2020) pyComBat, a Python tool for batch effects correction in high-throughput molecular data using empirical Bayes methods. bioRxiv doi: 10.1101/2020.03.17.995431 + Args: + batch (str): column name in the metadata describing the different batches + """ + data = self.mat.transpose() + series_of_batches = self.metadata.set_index(self.sample).reindex(data.columns.to_list())[batch] + self.mat = pycombat(data=data, batch=series_of_batches).transpose() @ignore_warning(RuntimeWarning) def preprocess( self, - log2_transform=True, - remove_contaminations=False, - subset=False, - normalization=None, - imputation=None, - remove_samples=None, + log2_transform: bool=True, + remove_contaminations: bool=False, + subset: bool=False, + normalization: str=None, + imputation: str=None, + remove_samples: list=None, ): """Preprocess Protein data diff --git a/alphastats/gui/pages/03_Preprocessing.py b/alphastats/gui/pages/03_Preprocessing.py index 0f55bb91..86d9120f 100644 --- a/alphastats/gui/pages/03_Preprocessing.py +++ b/alphastats/gui/pages/03_Preprocessing.py @@ -12,7 +12,7 @@ def preprocessing(): st.markdown( "Before analyzing your data, consider normalizing and imputing your data as well as the removal of contaminants. " - + "A more detailed description about the preprocessing methods can be found in the AlphaPeptStats" + + "A more detailed description about the preprocessing methods can be found in the AlphaPeptStats " + "[documentation](https://alphapeptstats.readthedocs.io/en/main/data_preprocessing.html)." ) @@ -70,6 +70,25 @@ def preprocessing(): pd.DataFrame.from_dict(preprocessing, orient="index").astype(str), use_container_width=True, ) + + st.markdown("#### Batch correction: correct for technical bias") + + with st.form("Batch correction: correct for technical bias"): + batch = st.selectbox( + "Batch", + options= st.session_state.dataset.metadata.columns.to_list() + ) + submit_batch_correction = st.form_submit_button("Submit") + + if submit_batch_correction: + st.session_state.dataset.batch_correction( + batch=batch + ) + st.info( + "Data has been processed. " + + datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S") + ) + with c2: diff --git a/requirements.txt b/requirements.txt index 442f3c52..6752b8cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,5 @@ numba==0.56.4 numba-stats==0.5.0 swifter==1.2.0 click==8.0.1 -kaleido==0.2.1 \ No newline at end of file +kaleido==0.2.1 +combat==0.3.3 \ No newline at end of file diff --git a/testfiles/maxquant/metadata.xlsx b/testfiles/maxquant/metadata.xlsx index 04d7b580..96456c6d 100644 Binary files a/testfiles/maxquant/metadata.xlsx and b/testfiles/maxquant/metadata.xlsx differ diff --git a/testfiles/maxquant/~$metadata.xlsx b/testfiles/maxquant/~$metadata.xlsx new file mode 100644 index 00000000..815f77b7 Binary files /dev/null and b/testfiles/maxquant/~$metadata.xlsx differ diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py index 94f9573e..17ffac2a 100644 --- a/tests/test_DataSet.py +++ b/tests/test_DataSet.py @@ -678,6 +678,13 @@ def test_plot_intensity_sign_001(self): def test_plot_samplehistograms(self): fig = self.obj.plot_samplehistograms().to_plotly_json() self.assertEqual(312, len(fig["data"])) + + def test_batch_correction(self): + self.obj.preprocess(subset=True, imputation="knn", normalization="quantile") + self.obj.batch_correction(batch="batch_artifical_added") + first_value = self.obj.mat.values[0,0] + self.assertAlmostEqual(0.0111, first_value, places=2) +