From 086dc9893ae7f8e79d66b42665d5d2efe52ad2da Mon Sep 17 00:00:00 2001 From: Qi-Xuan Yue Date: Tue, 29 Oct 2024 17:42:42 +0800 Subject: [PATCH] Add mzid plugin --- .github/workflows/python-app.yml | 10 +- .github/workflows/python-package.yml | 4 +- .github/workflows/python-publish.yml | 4 +- README.md | 1 + pmultiqc/cli.py | 1 + pmultiqc/main.py | 6 + pmultiqc/modules/quantms/quantms.py | 743 +++++++++++++++++++++------ setup.py | 3 +- 8 files changed, 610 insertions(+), 162 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index e02fcaa..e9165b9 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -15,9 +15,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python 3.9 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 - name: Install dependencies @@ -41,7 +41,7 @@ jobs: unzip -d ./lfq lfq.zip python setup.py install multiqc -f ./lfq --config ./lfq/multiqc_config.yml -o ./results_lfq - - uses: actions/upload-artifact@v1 + - uses: actions/upload-artifact@v4 if: always() name: Upload results with: @@ -53,7 +53,7 @@ jobs: unzip -d ./tmt tmt-replicates.zip python setup.py install multiqc -f ./tmt --config ./tmt/multiqc_config.yml -o ./results_tmt - - uses: actions/upload-artifact@v1 + - uses: actions/upload-artifact@v4 if: always() name: Upload results with: @@ -65,7 +65,7 @@ jobs: unzip -d ./dia dia.zip python setup.py install multiqc -f ./dia --config ./dia/multiqc_config.yml -o ./results_dia - - uses: actions/upload-artifact@v1 + - uses: actions/upload-artifact@v4 if: always() name: Upload results with: diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b4f8d70..ef0e1b6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -19,9 +19,9 @@ jobs: python-version: [3.8, 3.9, 3.11] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index ec70354..6c65810 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -21,9 +21,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: '3.x' - name: Install dependencies diff --git a/README.md b/README.md index 97b68d1..e608c2c 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ example: ```multiqc resources/LFQ -o ./``` - --quantification_method: The quantification method for LFQ experiment (default: `feature_intensity`) - --disable_table: Disable protein/peptide table plots for large dataset - --ignored_idxml: ignored idxml files for faster running +- --mzid_plugin: Generate reports based on mzid and mzML/mgf An example report can be found in [multiqc_report.html](http://bigbio.xyz/pmultiqc/shared-peptides-star-align-stricter-pep-protein-FDR/multiqc_report.html) diff --git a/pmultiqc/cli.py b/pmultiqc/cli.py index 42c8b13..75d6999 100644 --- a/pmultiqc/cli.py +++ b/pmultiqc/cli.py @@ -35,3 +35,4 @@ def print_version(ctx, params, value): help='Prefix (default) or suffix') disable_plugin = click.option('--disable_plugin', 'disable_plugin', is_flag=True, help="Disable the pmultiqc plugin on this run") +mzid_plugin = click.option('--mzid_plugin', 'mzid_plugin', is_flag=True, help="Extract mzIdentML") \ No newline at end of file diff --git a/pmultiqc/main.py b/pmultiqc/main.py index ec4cdd9..d029a2e 100644 --- a/pmultiqc/main.py +++ b/pmultiqc/main.py @@ -43,6 +43,12 @@ def pmultiqc_plugin_execution_start(): if 'quantms/mzML' not in config.sp: config.update_dict(config.sp, {'quantms/mzML': {'fn': '*.mzML', 'num_lines': 0}}) + if 'quantms/mgf' not in config.sp: + config.update_dict(config.sp, {'quantms/mgf': {'fn': '*.mgf', 'num_lines': 0}}) + + if 'quantms/mzid' not in config.sp: + config.update_dict(config.sp, {'quantms/mzid': {'fn': '*.mzid', 'num_lines': 0}}) + if 'quantms/ms_info' not in config.sp: config.update_dict(config.sp, {'quantms/ms_info': {'fn': '*_ms_info.parquet', 'num_lines': 0}}) diff --git a/pmultiqc/modules/quantms/quantms.py b/pmultiqc/modules/quantms/quantms.py index f20893a..fac98c9 100755 --- a/pmultiqc/modules/quantms/quantms.py +++ b/pmultiqc/modules/quantms/quantms.py @@ -18,7 +18,7 @@ import pandas as pd from functools import reduce import re -from pyteomics import mztab +from pyteomics import mztab, mzid, mgf from pyopenms import IdXMLFile, MzMLFile, MSExperiment, OpenMSBuildInfo, AASequence import os import sqlite3 @@ -80,167 +80,202 @@ def __init__(self): if config.kwargs.get('disable_plugin', True): return None - self.enable_exp = False - self.enable_sdrf = False - self.msstats_input_valid = False - # TODO what if multiple are found?? - for f in self.find_log_files("quantms/exp_design", filecontents=False): - self.exp_design = os.path.join(f["root"], f['fn']) - self.enable_exp = True - if self.enable_exp == False: - for f in self.find_log_files("quantms/sdrf", filecontents=False): - self.sdrf = os.path.join(f["root"], f['fn']) - OpenMS().openms_convert(self.sdrf, config.kwargs['raw'], - False, True, False, config.kwargs['condition']) - # experimental_design.tsv is the default output name - self.exp_design = os.path.join(f["root"], f['experimental_design.tsv']) - self.enable_sdrf = True - - # TODO in theory the module can work without the design. We just need to remove the according sections! - if self.enable_sdrf == False and self.enable_exp == False: - raise AttributeError( - "Neither exp_design nor sdrf can be found! Please provide or correct your multiqc_config.yml.") - - self.PSM_table = dict() - self.mzml_peptide_map = dict() - self.identified_spectrum = dict() self.ms_with_psm = list() - self.pep_quant_table = dict() - self.read_ms_info = False - self.mzml_table = OrderedDict() - self.search_engine = OrderedDict() - self.XCORR_HIST_RANGE = {'start': 0, 'end': 5, 'step': 0.1} - self.HYPER_HIST_RANGE = {'start': 0, 'end': 5, 'step': 0.1} - self.SPECEVALUE_HIST_RANGE = {'start': 0, 'end': 20, 'step': 0.4} - self.PEP_HIST_RANGE = {'start': 0, 'end': 1, 'step': 0.02} - self.total_ms2_spectra = 0 - self.Total_ms2_Spectral_Identified = 0 - self.Total_Peptide_Count = 0 self.Total_Protein_Identified = 0 - self.Total_Protein_Quantified = 0 - self.out_csv_data = dict() self.cal_num_table_data = dict() - self.mL_spec_ident_final = dict() + self.oversampling = dict() + self.identified_spectrum = dict() self.delta_mass = dict() - self.delta_mass_percent = dict() - self.heatmap_con_score = dict() - self.heatmap_pep_intensity = {} + self.Total_ms2_Spectral_Identified = 0 + self.Total_Peptide_Count = 0 + self.total_ms2_spectra = 0 + self.enable_dia = False + self.is_bruker = False + self.read_ms_info = False self.heatmap_charge_score = dict() self.MissedCleavages_heatmap_score = dict() - self.MissedCleavagesVar_score = dict() self.ID_RT_score = dict() self.HeatmapOverSamplingScore = dict() self.HeatmapPepMissingScore = dict() - self.exp_design_table = dict() + self.MissedCleavagesVar_score = dict() + self.pep_table_exists = False self.ms_info = dict() self.ms_info['charge_distribution'] = dict() self.ms_info['peaks_per_ms2'] = dict() - self.ms_info['peak_distribution'] = dict() - self.oversampling = dict() - self.ms1_tic = dict() - self.ms1_bpc = dict() - self.ms1_peaks = dict() - self.ms1_general_stats = dict() - self.is_bruker = False - # parse input data - # draw the experimental design - self.draw_exp_design() - self.pep_table_exists = False - self.enable_dia = False + self.ms_info['peak_distribution'] = dict() self.ms_paths = [] for mzML_file in self.find_log_files("quantms/mzML", filecontents=False): self.ms_paths.append(os.path.join(mzML_file["root"], mzML_file['fn'])) self.ms_info_path = [] - for ms_info in self.find_log_files("quantms/ms_info", filecontents=False): - self.ms_info_path.append(os.path.join(ms_info["root"], ms_info['fn'])) - self.ms_info_path.sort() - if len(self.ms_info_path) > 0: - self.read_ms_info = True - self.ms_paths = [self.file_prefix(i).replace("_ms_info", ".mzML") for i in self.ms_info_path] - - for f in self.find_log_files("quantms/mztab", filecontents=False): - self.out_mzTab_path = os.path.join(f["root"], f['fn']) - self.parse_out_mzTab() - - for report in self.find_log_files("quantms/diann_report", filecontents=False): - self.diann_report_path = os.path.join(report["root"], report["fn"]) - self.enable_dia = True - - mt = self.parse_mzml() - self.idx_paths = [] - for idx_file in self.find_log_files("quantms/idXML", filecontents=False): - self.idx_paths.append(os.path.join(idx_file["root"], idx_file['fn'])) - - self.draw_ms_information() - if self.enable_dia: - self.parse_diann_report() - self.draw_summary_protein_ident_table() - self.draw_quantms_identi_num() - self.draw_num_pep_per_protein() - if len(self.ms_info_path) > 0 and not self.is_bruker: - self.draw_precursor_charge_distribution() - self.draw_peaks_per_ms2() - self.draw_peak_intensity_distribution() - else: - if not config.kwargs['ignored_idxml']: - self.parse_idxml(mt) - self.CalHeatMapScore() + + # Check if this is a display for mzid identification results + if config.kwargs.get('mzid_plugin', False): + + self.mzid_peptide_map = dict() + self.ms_without_psm = dict() + + self.mgf_paths = [] + for mgf_file in self.find_log_files("quantms/mgf", filecontents=False): + self.mgf_paths.append(os.path.join(mgf_file["root"], mgf_file['fn'])) + + self.mzid_paths = [] + for mzid_file in self.find_log_files("quantms/mzid", filecontents=False): + self.mzid_paths.append(os.path.join(mzid_file["root"], mzid_file['fn'])) + + mzid_psm = self.parse_out_mzid() + + if self.mgf_paths: + self.parse_out_mgf() + elif self.ms_paths: + _ = self.parse_mzml() + + self.mzid_CalHeatMapScore(mzid_psm) self.draw_heatmap() self.draw_summary_protein_ident_table() - self.draw_quantms_identi_num() + self.draw_mzid_identi_num() self.draw_num_pep_per_protein() - if not config.kwargs['ignored_idxml']: - self.draw_mzml_ms() - self.draw_search_engine() self.draw_precursor_charge_distribution() self.draw_peaks_per_ms2() self.draw_peak_intensity_distribution() self.draw_oversampling() - # TODO what if multiple are found?? - # if config.kwargs.get('disable_table', True): - # log.info("Skip protein/peptide table plot!") - # else: - for msstats_input in self.find_log_files("quantms/msstats", filecontents=False): - self.msstats_input_path = os.path.join(msstats_input["root"], msstats_input["fn"]) - self.msstats_input_valid = True - self.parse_msstats_input() - - if config.kwargs["quantification_method"] == "spectral_counting": - # Add a report section with psm table plot from mzTab for spectral counting - self.add_section( - name="Peptide Spectrum Matches", - anchor="peptide_spectrum_matches", - description='This plot shows the information of peptide spectrum matches', - helptext=''' - This table shows the information of peptide spectrum matches from mzTab PSM section. - ''', - plot=self.psm_table_html - ) - - # TODO draw protein quantification from mzTab in the future with Protein and peptide tables from mzTab - # currently only draw protein tabel for spectral counting - if not self.msstats_input_valid and config.kwargs["quantification_method"] == "spectral_counting": - log.warning("MSstats input file not found!") - self.add_section( - name="Protein Quantification Table", - anchor="protein_quant_result", - description='This plot shows the quantification information of proteins' - ' in the final result (mainly the mzTab file).', - helptext=''' - The quantification information (Spectral Counting) of proteins is obtained from the mzTab file. - The table shows the quantitative level and distribution of proteins in different study variables and run. - - * Peptides_Number: The number of peptides for each protein. - * Average Spectral Counting: Average spectral counting of each protein across all conditions with NA=0 or NA ignored. - * Spectral Counting in each condition (Eg. `CT=Mixture;CN=UPS1;QY=0.1fmol`): Average spectral counting of replicates. - - Click `Show replicates` to switch to bar plots of counting in each replicate. - ''', - plot=self.protein_quantification_table_html - ) + else: + self.enable_exp = False + self.enable_sdrf = False + self.msstats_input_valid = False + # TODO what if multiple are found?? + for f in self.find_log_files("quantms/exp_design", filecontents=False): + self.exp_design = os.path.join(f["root"], f['fn']) + self.enable_exp = True + + if not self.enable_exp: + for f in self.find_log_files("quantms/sdrf", filecontents=False): + self.sdrf = os.path.join(f["root"], f['fn']) + OpenMS().openms_convert(self.sdrf, config.kwargs['raw'], + False, True, False, config.kwargs['condition']) + # experimental_design.tsv is the default output name + self.exp_design = os.path.join(f["root"], f['experimental_design.tsv']) + self.enable_sdrf = True + + # TODO in theory the module can work without the design. We just need to remove the according sections! + if not self.enable_sdrf and not self.enable_exp: + raise AttributeError( + "Neither exp_design nor sdrf can be found! Please provide or correct your multiqc_config.yml.") + + self.PSM_table = dict() + self.mzml_peptide_map = dict() + self.pep_quant_table = dict() + self.mzml_table = OrderedDict() + self.search_engine = OrderedDict() + self.XCORR_HIST_RANGE = {'start': 0, 'end': 5, 'step': 0.1} + self.HYPER_HIST_RANGE = {'start': 0, 'end': 5, 'step': 0.1} + self.SPECEVALUE_HIST_RANGE = {'start': 0, 'end': 20, 'step': 0.4} + self.PEP_HIST_RANGE = {'start': 0, 'end': 1, 'step': 0.02} + self.Total_Protein_Quantified = 0 + self.out_csv_data = dict() + self.mL_spec_ident_final = dict() + self.delta_mass_percent = dict() + self.heatmap_con_score = dict() + self.heatmap_pep_intensity = {} + self.exp_design_table = dict() + self.ms1_tic = dict() + self.ms1_bpc = dict() + self.ms1_peaks = dict() + self.ms1_general_stats = dict() + # parse input data + # draw the experimental design + self.draw_exp_design() + + for ms_info in self.find_log_files("quantms/ms_info", filecontents=False): + self.ms_info_path.append(os.path.join(ms_info["root"], ms_info['fn'])) + self.ms_info_path.sort() + if len(self.ms_info_path) > 0: + self.read_ms_info = True + self.ms_paths = [self.file_prefix(i).replace("_ms_info", ".mzML") for i in self.ms_info_path] + + for f in self.find_log_files("quantms/mztab", filecontents=False): + self.out_mzTab_path = os.path.join(f["root"], f['fn']) + self.parse_out_mzTab() + + for report in self.find_log_files("quantms/diann_report", filecontents=False): + self.diann_report_path = os.path.join(report["root"], report["fn"]) + self.enable_dia = True + + mt = self.parse_mzml() + self.idx_paths = [] + for idx_file in self.find_log_files("quantms/idXML", filecontents=False): + self.idx_paths.append(os.path.join(idx_file["root"], idx_file['fn'])) + + self.draw_ms_information() + if self.enable_dia: + self.parse_diann_report() + self.draw_summary_protein_ident_table() + self.draw_quantms_identi_num() + self.draw_num_pep_per_protein() + if len(self.ms_info_path) > 0 and not self.is_bruker: + self.draw_precursor_charge_distribution() + self.draw_peaks_per_ms2() + self.draw_peak_intensity_distribution() + else: + if not config.kwargs['ignored_idxml']: + self.parse_idxml(mt) + self.CalHeatMapScore() + self.draw_heatmap() + self.draw_summary_protein_ident_table() + self.draw_quantms_identi_num() + self.draw_num_pep_per_protein() + if not config.kwargs['ignored_idxml']: + self.draw_mzml_ms() + self.draw_search_engine() + self.draw_precursor_charge_distribution() + self.draw_peaks_per_ms2() + self.draw_peak_intensity_distribution() + self.draw_oversampling() + + # TODO what if multiple are found?? + # if config.kwargs.get('disable_table', True): + # log.info("Skip protein/peptide table plot!") + # else: + for msstats_input in self.find_log_files("quantms/msstats", filecontents=False): + self.msstats_input_path = os.path.join(msstats_input["root"], msstats_input["fn"]) + self.msstats_input_valid = True + self.parse_msstats_input() + + if config.kwargs["quantification_method"] == "spectral_counting": + # Add a report section with psm table plot from mzTab for spectral counting + self.add_section( + name="Peptide Spectrum Matches", + anchor="peptide_spectrum_matches", + description='This plot shows the information of peptide spectrum matches', + helptext=''' + This table shows the information of peptide spectrum matches from mzTab PSM section. + ''', + plot=self.psm_table_html + ) + + # TODO draw protein quantification from mzTab in the future with Protein and peptide tables from mzTab + # currently only draw protein tabel for spectral counting + if not self.msstats_input_valid and config.kwargs["quantification_method"] == "spectral_counting": + log.warning("MSstats input file not found!") + self.add_section( + name="Protein Quantification Table", + anchor="protein_quant_result", + description='This plot shows the quantification information of proteins' + ' in the final result (mainly the mzTab file).', + helptext=''' + The quantification information (Spectral Counting) of proteins is obtained from the mzTab file. + The table shows the quantitative level and distribution of proteins in different study variables and run. + + * Peptides_Number: The number of peptides for each protein. + * Average Spectral Counting: Average spectral counting of each protein across all conditions with NA=0 or NA ignored. + * Spectral Counting in each condition (Eg. `CT=Mixture;CN=UPS1;QY=0.1fmol`): Average spectral counting of replicates. + + Click `Show replicates` to switch to bar plots of counting in each replicate. + ''', + plot=self.protein_quantification_table_html + ) self.css = { 'assets/css/quantms.css': @@ -257,6 +292,7 @@ def __init__(self): os.path.join(os.path.dirname(__file__), 'assets', 'js', 'sql-optimized.js') } + def draw_heatmap(self): HeatMapScore = [] if self.pep_table_exists: @@ -417,7 +453,10 @@ def draw_summary_protein_ident_table(self): summary_table[self.total_ms2_spectra]['%Identified MS2 Spectra'] = coverage summary_table[self.total_ms2_spectra]['#Peptides Identified'] = self.Total_Peptide_Count summary_table[self.total_ms2_spectra]['#Proteins Identified'] = self.Total_Protein_Identified - summary_table[self.total_ms2_spectra]['#Proteins Quantified'] = self.Total_Protein_Quantified + + if not config.kwargs.get('mzid_plugin', False): + summary_table[self.total_ms2_spectra]['#Proteins Quantified'] = self.Total_Protein_Quantified + headers['#Identified MS2 Spectra'] = { 'description': 'Total number of MS/MS spectra identified', } @@ -442,14 +481,23 @@ def draw_summary_protein_ident_table(self): } table_html = table.plot(summary_table, headers, pconfig) + if config.kwargs.get('mzid_plugin', False): + description_str = 'This plot shows the summary statistics of the submitted data' + helptext_str=''' + This plot shows the summary statistics of the submitted data + ''' + else: + description_str = 'This table shows the quantms pipeline summary statistics' + helptext_str=''' + This table shows the quantms pipeline summary statistics + ''' + # Add a report section with the table self.add_section( name="Summary Table", anchor="quantms_summary_table", - description='This table shows the quantms pipeline summary statistics', - helptext=''' - This table shows the quantms pipeline summary statistics - ''', + description=description_str, + helptext=helptext_str, plot=table_html ) @@ -583,6 +631,56 @@ def draw_quantms_identi_num(self): plot=table_html ) + def draw_mzid_identi_num(self): + pconfig = { + 'id': 'result statistics', # ID used for the table + 'title': 'pipeline result statistics', # Title of the table. Used in the column config modal + 'save_file': False, # Whether to save the table data to a file + 'raw_data_fn': 'multiqc_result statistics_table', # File basename to use for raw data file + 'sort_rows': True, # Whether to sort rows alphabetically + 'only_defined_headers': False, # Only show columns that are defined in the headers config + 'col1_header': 'Spectra File', + 'no_violin': True, + # 'format': '{:,.0f}' # The header used for the first column + } + + headers = OrderedDict() + headers['peptide_num'] = { + 'title': '#Peptide IDs', + 'description': 'The number of identified PSMs in the pipeline', + 'color': "#ffffff" + } + headers['unique_peptide_num'] = { + 'title': '#Unambiguous Peptide IDs', + 'description': 'The number of unique peptides in the pipeline. Those that match only one protein in the provided database.', + 'color': "#ffffff" + } + headers['modified_peptide_num'] = { + 'title': '#Modified Peptide IDs', + 'description': 'Number of modified identified peptides in the pipeline', + 'color': "#ffffff" + } + headers['protein_num'] = { + 'title': '#Protein (group) IDs', + 'description': 'The number of identified protein(group)s in the pipeline', + 'color': "#ffffff" + } + + table_html = table.plot(self.cal_num_table_data, headers, pconfig) + + # Add a report section with the line plot + self.add_section( + name="Submitted Result Statistics", + anchor="Submitted Result Statistics", + description='This plot shows the submitted results', + helptext=''' + This plot shows the submitted results. + Including the number of identified peptides and the number of identified modified peptides in the submitted results. + You can also remove the decoy with the `remove_decoy` parameter. + ''', + plot=table_html + ) + # draw number of peptides per proteins def draw_num_pep_per_protein(self): # Create table plot @@ -604,16 +702,25 @@ def draw_num_pep_per_protein(self): } bar_html = bargraph.plot([self.pep_plot.dict['data']['frequency'], self.pep_plot.dict['data']['percentage']], ["Frequency", "Percentage"], pconfig) + + if config.kwargs.get('mzid_plugin', False): + description_str = 'This plot shows the number of peptides per protein in the submitted data' + helptext_str=''' + Proteins supported by more peptide identifications can constitute more confident results. + ''' + else: + description_str = 'This plot shows the number of peptides per protein in quantms pipeline final result' + helptext_str=''' + This statistic is extracted from the out_msstats file. Proteins supported by more peptide + identifications can constitute more confident results. + ''' + # Add a report section with the line plot self.add_section( name="Number of Peptides identified Per Protein", anchor="number_of_peptides_per_proteins", - description='This plot shows the number of peptides per proteins ' - 'in quantms pipeline final result', - helptext=''' - This statistic is extracted from the out_msstats file. Proteins supported by more peptide - identifications can constitute more confident results. - ''', + description=description_str, + helptext=helptext_str, plot=bar_html ) @@ -698,7 +805,12 @@ def draw_peak_intensity_distribution(self): 'title': 'Peak Intensity Distribution', # 'xlab': 'Peak Intensity' } - cats = self.mzml_peak_distribution_plot.dict['cats'] + + if config.kwargs.get('mzid_plugin', False) and self.mgf_paths: + cats = self.mgf_peak_distribution_plot.dict['cats'] + else: + cats = self.mzml_peak_distribution_plot.dict['cats'] + bar_html = bargraph.plot(self.ms_info['peak_distribution'], cats, pconfig) # Add a report section with the line plot @@ -731,7 +843,12 @@ def draw_precursor_charge_distribution(self): 'cpswitch': False, 'title': 'Precursor Ion Charge Distribution', } - cats = self.mzml_charge_plot.dict['cats'] + + if config.kwargs.get('mzid_plugin', False) and self.mgf_paths: + cats = self.mgf_charge_plot.dict['cats'] + else: + cats = self.mzml_charge_plot.dict['cats'] + bar_html = bargraph.plot(self.ms_info['charge_distribution'], cats, pconfig) # Add a report section with the line plot @@ -757,7 +874,12 @@ def draw_peaks_per_ms2(self): 'cpswitch': False, 'title': 'Number of Peaks per MS/MS spectrum', } - cats = self.mzml_peaks_ms2_plot.dict['cats'] + + if config.kwargs.get('mzid_plugin', False) and self.mgf_paths: + cats = self.mgf_peaks_ms2_plot.dict['cats'] + else: + cats = self.mzml_peaks_ms2_plot.dict['cats'] + bar_html = bargraph.plot(self.ms_info['peaks_per_ms2'], cats, pconfig) self.add_section( @@ -783,6 +905,7 @@ def draw_oversampling(self): 'title': 'MS2 counts per 3D-peak', 'scale': "set3" } + cats = self.oversampling_plot.dict['cats'] bar_html = bargraph.plot(self.oversampling, cats, pconfig) @@ -1013,6 +1136,67 @@ def draw_search_engine(self): ''' ) + def mzid_CalHeatMapScore(self, psm): + log.info("Calculating Heatmap Scores...") + + # HeatMapMissedCleavages + global_peps = psm[["PeptideSequence", "Modifications"]].drop_duplicates() + global_peps_count = len(global_peps) + + enzyme_list = list() + for mzid_path in self.mzid_paths: + try: + enzyme_iter = mzid.MzIdentML(mzid_path).iterfind("Enzyme") + enzyme = next(enzyme_iter).get("EnzymeName", None) + if enzyme: + enzyme_name = list(enzyme.keys())[0] + else: + enzyme_name = "Trypsin" + enzyme_list.append(enzyme_name) + except StopIteration: + enzyme_list.append("Trypsin") + + enzyme_list = list(set(enzyme_list)) + enzyme = enzyme_list[0] if len(enzyme_list) == 1 else "Trypsin" + psm['missed_cleavages'] = psm.apply(lambda x: self.cal_MissCleavages(x['PeptideSequence'], enzyme), axis=1) + + # Calculate the ID RT Score + if "retention_time" not in psm.columns and self.mgf_paths: + # MGF + psm = pd.merge(psm, + self.mgf_rtinseconds[["spectrumID", "filename", "retention_time"]], + on=["filename", "spectrumID"], how="left") + + for name, group in psm.groupby('stand_spectra_ref'): + sc = group['missed_cleavages'].value_counts() + mis_0 = sc.get(0, 0) + self.MissedCleavages_heatmap_score[name] = mis_0 / sc[:].sum() + + x = group['retention_time'] / np.sum(group['retention_time']) + n = len(group['retention_time']) + y = np.sum(x) / n + worst = ((1 - y) ** 0.5) * 1 / n + (y ** 0.5) * (n - 1) / n + sc = np.sum(np.abs(x - y) ** 0.5) / n + if worst == 0: + self.ID_RT_score[name] = 1.0 + else: + self.ID_RT_score[name] = float((worst - sc) / worst) + + # For HeatMapOverSamplingScore + self.HeatmapOverSamplingScore[name] = self.oversampling[name]['1'] / np.sum(list(self.oversampling[name].values())) + + # For HeatMapPepMissingScore + idFraction = len(pd.merge(global_peps, + group[["PeptideSequence", "Modifications"]].drop_duplicates(), + on=["PeptideSequence", "Modifications"]).drop_duplicates()) / global_peps_count + self.HeatmapPepMissingScore[name] = np.minimum(1.0, idFraction) + + median = np.median(list(self.MissedCleavages_heatmap_score.values())) + self.MissedCleavagesVar_score = dict(zip(self.MissedCleavages_heatmap_score.keys(), + list(map(lambda v: 1 - np.abs(v - median), + self.MissedCleavages_heatmap_score.values())))) + log.info("Done calculating Heatmap Scores.") + def CalHeatMapScore(self): log.info("Calculating Heatmap Scores...") mztab_data = mztab.MzTab(self.out_mzTab_path) @@ -1634,8 +1818,8 @@ def parse_out_mzTab(self): # draw PSMs table for spectral counting if config.kwargs['quantification_method'] == "spectral_counting" and not config.kwargs.get('disable_table', True): - mztab_data_psm_full = psm[['sequence', 'accession', 'search_engine_score[1]', 'stand_spectra_ref']] - mztab_data_psm_full.rename(columns={"sequence": "Sequence", "accession": "Accession", + mztab_data_psm_full = psm[['opt_global_cv_MS:1000889_peptidoform_sequence', 'accession', 'search_engine_score[1]', 'stand_spectra_ref']].copy() + mztab_data_psm_full.rename(columns={"opt_global_cv_MS:1000889_peptidoform_sequence": "Sequence", "accession": "Accession", "search_engine_score[1]": "Search_Engine_Score", "stand_spectra_ref": "Spectra_Ref"}, inplace=True) mztab_data_psm_full[["Sequence", "Modification"]] = mztab_data_psm_full.apply( @@ -1838,6 +2022,261 @@ def getSpectrumCountAcrossRep(condition_count_dict: dict): self.protein_quantification_table_html = table_html + def parse_out_mgf(self): + def get_spectrumID(SpectrumTitle, index): + if "scan=" in SpectrumTitle: + spectrumID = SpectrumTitle + else: + spectrumID = "index=" + str(index) + return spectrumID + + self.mgf_peak_distribution_plot = Histogram('Peak Intensity', plot_category='range', breaks=[ + 0, 10, 100, 300, 500, 700, 900, 1000, 3000, 6000, 10000]) + self.mgf_charge_plot = Histogram('Precursor Charge', plot_category='frequency') + self.mgf_peaks_ms2_plot = Histogram('#Peaks per MS/MS spectrum', plot_category='range', breaks=[ + i for i in range(0, 1001, 100)]) + + self.mgf_peak_distribution_plot_1 = copy.deepcopy(self.mgf_peak_distribution_plot) + self.mgf_charge_plot_1 = copy.deepcopy(self.mgf_charge_plot) + self.mgf_peaks_ms2_plot_1 = copy.deepcopy(self.mgf_peaks_ms2_plot) + + heatmap_charge = dict() + mgf_rtinseconds = {"spectrumID": [], "title": [], "filename": [], "retention_time": []} + + for m in self.mgf_paths: + log.info("{}: Parsing MGF file {}...".format(datetime.now().strftime("%H:%M:%S"), m)) + mgf_data = mgf.MGF(m) + log.info("{}: Done parsing MGF file {}...".format(datetime.now().strftime("%H:%M:%S"), m)) + m = self.file_prefix(m) + log.info("{}: Aggregating MGF file {}...".format(datetime.now().strftime("%H:%M:%S"), m)) + + charge_2 = 0 + + for i, spectrum in enumerate(mgf_data): + charge_state = int(spectrum.get("params", {}).get("charge", [])[0]) + if charge_state == 2: + charge_2 += 1 + + peak_per_ms2 = len(spectrum["m/z array"]) + base_peak_intensity = max(spectrum["intensity array"]) if len(spectrum["intensity array"]) > 0 else None + + raw_title = spectrum.get("params", {}).get("title", []) + mgf_rtinseconds["title"].append(raw_title) + title = get_spectrumID(raw_title, i) + mgf_rtinseconds["spectrumID"].append(title) + mgf_rtinseconds["filename"].append(m) + + rtinseconds = float(spectrum.get("params", {}).get("rtinseconds", None)) + mgf_rtinseconds["retention_time"].append(rtinseconds) + + if m in self.ms_with_psm: + if title in self.identified_spectrum[m]: + self.mgf_charge_plot.addValue(charge_state) + self.mgf_peak_distribution_plot.addValue(base_peak_intensity) + self.mgf_peaks_ms2_plot.addValue(peak_per_ms2) + else: + self.mgf_charge_plot_1.addValue(charge_state) + self.mgf_peak_distribution_plot_1.addValue(base_peak_intensity) + self.mgf_peaks_ms2_plot_1.addValue(peak_per_ms2) + else: + if m not in self.ms_without_psm: + self.ms_without_psm.append(m) + ms2_number = i + 1 + + heatmap_charge[m] = charge_2 / ms2_number + self.total_ms2_spectra = self.total_ms2_spectra + ms2_number + log.info("{}: Done aggregating MGF file {}...".format(datetime.now().strftime("%H:%M:%S"), m)) + + self.mgf_rtinseconds = pd.DataFrame(mgf_rtinseconds) + + for i in self.ms_without_psm: + log.warning("No PSM found in '{}'!".format(i)) + + self.mgf_peaks_ms2_plot.to_dict() + self.mgf_peak_distribution_plot.to_dict() + self.mgf_peaks_ms2_plot_1.to_dict() + self.mgf_peak_distribution_plot_1.to_dict() + self.mgf_charge_plot.to_dict() + self.mgf_charge_plot_1.to_dict() + + self.mgf_charge_plot.dict["cats"].update(self.mgf_charge_plot_1.dict["cats"]) + charge_cats_keys = [int(i) for i in self.mgf_charge_plot.dict["cats"]] + charge_cats_keys.sort() + self.mgf_charge_plot.dict["cats"] = OrderedDict( + {str(i): self.mgf_charge_plot.dict["cats"][str(i)] for i in charge_cats_keys}) + + self.ms_info['charge_distribution'] = { + 'identified_spectra': self.mgf_charge_plot.dict['data'], + 'unidentified_spectra': self.mgf_charge_plot_1.dict['data'] + } + self.ms_info['peaks_per_ms2'] = { + 'identified_spectra': self.mgf_peaks_ms2_plot.dict['data'], + 'unidentified_spectra': self.mgf_peaks_ms2_plot_1.dict['data'] + } + self.ms_info['peak_distribution'] = { + 'identified_spectra': self.mgf_peak_distribution_plot.dict['data'], + 'unidentified_spectra': self.mgf_peak_distribution_plot_1.dict['data'] + } + + median = np.median(list(heatmap_charge.values())) + self.heatmap_charge_score = dict(zip( + heatmap_charge.keys(), + list(map(lambda v: 1 - np.abs(v - median), heatmap_charge.values())))) + + def parse_out_mzid(self): + def parse_location(location): + if '\\' in location: + location = location.replace('\\', '/') + return os.path.basename(location) + + def Process_Modification(Modification): + if not isinstance(Modification, list): + modifications = None + else: + modifi_list = list() + for i in Modification: + if i.get('name', None) is not None: + modifi_list.append(str(i.get('location')) + "-" + i.get('name', None)) + elif i.get("cross-link receiver", None) is not None: + modifi_list.append(str(i.get("location")) + "-CrossLinkReceiver") + modifications = ";".join(modifi_list) + return modifications + + def read_mzid(m): + log.info("{}: Parsing MzIdentML file {}...".format(datetime.now().strftime("%H:%M:%S"), m)) + mzid_data = mzid.MzIdentML(m) + if len(mzid_data) == 0: + raise ValueError("Please check your MzIdentML", m) + log.info("{}: Done parsing MzIdentML file {}.".format(datetime.now().strftime("%H:%M:%S"), m)) + m = self.file_prefix(m) + log.info("{}: Aggregating MzIdentML file {}...".format(datetime.now().strftime("%H:%M:%S"), m)) + + mzid_dict = list() + for mzid_tmp in mzid_data: + mzid_tmp_part = {k: v for k, v in mzid_tmp.items() if k not in ["SpectrumIdentificationItem"]} + for spectrum_item in mzid_tmp.get("SpectrumIdentificationItem", []): + spectrum_item_part = {k: v for k, v in spectrum_item.items() if k not in ["PeptideEvidenceRef", "PeptideSequence"]} + if spectrum_item_part.get("rank") == 1 and spectrum_item_part.get("peptide passes threshold", "true") == "true" and \ + spectrum_item.get("passThreshold", False): + for peptide_ref in spectrum_item.get("PeptideEvidenceRef", []): + if "name" in peptide_ref: + peptide_ref["PeptideEvidenceRef_name"] = peptide_ref.pop("name") + if "location" in peptide_ref: + peptide_ref["PeptideEvidenceRef_location"] = peptide_ref.pop("location") + if "FileFormat" in peptide_ref: + peptide_ref["PeptideEvidenceRef_FileFormat"] = peptide_ref.pop("FileFormat") + mzid_dict.append({**mzid_tmp_part, **spectrum_item_part, **peptide_ref}) + + mzid_t = pd.DataFrame(mzid_dict) + log.info("{}: Done aggregating MzIdentML file {}...".format(datetime.now().strftime("%H:%M:%S"), m)) + + return mzid_t + + mzid_table = pd.DataFrame() + for mzid_path in self.mzid_paths: + mzid_table = pd.concat([mzid_table, read_mzid(mzid_path)], axis=0, ignore_index=True) + + search_engines = ["SEQUEST:xcorr", "Mascot:score", "PEAKS:peptideScore", "xi:score"] + mzid_table.rename(columns=lambda x: "search_engine_score" if x in search_engines else x, inplace=True) + + if "retention time" in mzid_table.columns: + mzid_table.rename(columns={"retention time": "retention_time"}, inplace=True) + + if "location" in mzid_table.columns: + mzid_table["location"] = mzid_table["location"].apply(parse_location) + + if "PeptideEvidenceRef_location" in mzid_table.columns: + mzid_table["PeptideEvidenceRef_location"] = mzid_table["PeptideEvidenceRef_location"].apply(parse_location) + + mzid_table["stand_spectra_ref"] = mzid_table.apply(lambda x: self.file_prefix(x.location), axis=1) + mzid_table["Modifications"] = mzid_table["Modification"].apply(lambda x: Process_Modification(x)) + + if "cross-link spectrum identification item" in mzid_table.columns: + mzid_table["CrossLinked_Peptide"] = ~mzid_table["cross-link spectrum identification item"].isna() + else: + mzid_table["CrossLinked_Peptide"] = False + + mzid_table["pep_to_prot_unique"] = mzid_table.groupby(["spectrumID", "PeptideSequence"])["accession"].transform(lambda x: len(set(x)) == 1) + mzid_table["accession_group"] = mzid_table.groupby(["spectrumID", "PeptideSequence"])["accession"].transform(lambda x: ";".join(x.unique())) + + if "isDecoy" not in mzid_table.columns: + mzid_table["isDecoy"] = False + + mzid_table['filename'] = mzid_table.apply(lambda x: self.file_prefix(x.location), axis=1) + self.ms_with_psm = mzid_table['filename'].unique().tolist() + + # TODO remove_decoy + if config.kwargs['remove_decoy']: + mzid_table = mzid_table[~mzid_table["isDecoy"]] + + self.Total_Protein_Identified = mzid_table["accession_group"].nunique() + + self.pep_plot = Histogram('Number of peptides per proteins', plot_category='frequency') + + counts_per_acc = mzid_table[["PeptideSequence", "accession"]].drop_duplicates()["accession"].value_counts() + counts_per_acc.apply(self.pep_plot.addValue) + + categories = OrderedDict() + categories['Frequency'] = { + 'name': 'Frequency', + 'description': 'Number of identified peptides per protein.' + } + self.pep_plot.to_dict(percentage=True, cats=categories) + + if "retention_time" in mzid_table.columns: + psm = mzid_table[["spectrumID", "PeptideSequence", "chargeState", "Modifications", + "accession_group", "experimentalMassToCharge", "calculatedMassToCharge", + "isDecoy", "pep_to_prot_unique", "search_engine_score", "stand_spectra_ref", + "location", "filename", "retention_time"]].drop_duplicates().reset_index() + else: + psm = mzid_table[["spectrumID", "PeptideSequence", "chargeState", "Modifications", + "accession_group", "experimentalMassToCharge", "calculatedMassToCharge", + "isDecoy", "pep_to_prot_unique", "search_engine_score", "stand_spectra_ref", + "location", "filename"]].drop_duplicates().reset_index() + + for m, group in psm.groupby("filename"): + self.oversampling_plot = Histogram('MS/MS counts per 3D-peak', plot_category='frequency', breaks=[1, 2, 3]) + for _, j in group.groupby(["PeptideSequence", "chargeState", "Modifications"]): + self.oversampling_plot.addValue(j["spectrumID"].nunique()) + + self.oversampling_plot.to_dict() + self.oversampling[m] = self.oversampling_plot.dict["data"] + + proteins = set(group["accession_group"]) + peptides = group[["PeptideSequence", "Modifications"]].drop_duplicates() + + # unique_peptides = group[group["pep_to_prot_unique"] == True][["PeptideSequence", "Modifications"]].drop_duplicates() + unique_peptides = group[group["pep_to_prot_unique"]][["PeptideSequence", "Modifications"]].drop_duplicates() + + self.identified_spectrum[m] = group["spectrumID"].drop_duplicates().tolist() + self.mzid_peptide_map[m] = list(set(group["PeptideSequence"].tolist())) + + if None in proteins: + proteins.remove(None) + + self.cal_num_table_data[m] = {"protein_num": len(proteins)} + self.cal_num_table_data[m]["peptide_num"] = len(peptides) + self.cal_num_table_data[m]["unique_peptide_num"] = len(unique_peptides) + + modified_pep = peptides.dropna(subset=["Modifications"]) + self.cal_num_table_data[m]["modified_peptide_num"] = len(modified_pep) + + if self.mgf_paths: + self.ms_without_psm = set([self.file_prefix(i) for i in self.mgf_paths]) - set(self.ms_with_psm) + elif self.ms_paths: + self.ms_without_psm = set([self.file_prefix(i) for i in self.ms_paths]) - set(self.ms_with_psm) + for i in self.ms_without_psm: + self.cal_num_table_data[self.file_prefix(i)] = {'protein_num': 0, + 'peptide_num': 0, + 'unique_peptide_num': 0, + 'modified_peptide_num': 0 + } + + self.Total_ms2_Spectral_Identified = psm["spectrumID"].nunique() + self.Total_Peptide_Count = psm["PeptideSequence"].nunique() + + return psm + def parse_diann_report(self): log.info("Parsing {}...".format(self.diann_report_path)) pattern = re.compile(r"\(.*?\)") diff --git a/setup.py b/setup.py index a0819a1..d9b8b45 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,8 @@ def readme(): 'quantification_method = pmultiqc.cli:quantification_method', 'disable_table = pmultiqc.cli:disable_table', 'ignored_idxml = pmultiqc.cli:ignored_idxml', - 'affix_type = pmultiqc.cli:affix_type' + 'affix_type = pmultiqc.cli:affix_type', + 'mzid_plugin = pmultiqc.cli:mzid_plugin' ], 'multiqc.hooks.v1': [ 'execution_start = pmultiqc.main:pmultiqc_plugin_execution_start'