diff --git a/quantmsio/commands/psm_command.py b/quantmsio/commands/psm_command.py index 703488d..0f88a99 100644 --- a/quantmsio/commands/psm_command.py +++ b/quantmsio/commands/psm_command.py @@ -56,7 +56,7 @@ def convert_psm_file( psm_manager = Psm(mzTab_path=mztab_file) output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet") - psm_manager.write_feature_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file) + psm_manager.write_psm_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file) @click.command("compare-set-psms", short_help="plot venn for a set of Psms parquet") diff --git a/quantmsio/core/common.py b/quantmsio/core/common.py index e0010a8..2c67955 100644 --- a/quantmsio/core/common.py +++ b/quantmsio/core/common.py @@ -1,14 +1,16 @@ from quantmsio import __version__ - +from quantmsio.core.format import PSM_FIELDS, FEATURE_FIELDS +import pyarrow as pa PSM_MAP = { "sequence": "sequence", "modifications": "modifications", + "opt_global_cv_MS:1000889_peptidoform_sequence": "peptidoform", "opt_global_Posterior_Error_Probability_score": "posterior_error_probability", - "opt_global_q-value": "global_qvalue", + #"opt_global_q-value": "global_qvalue", "opt_global_cv_MS:1002217_decoy_peptide": "is_decoy", "calc_mass_to_charge": "calculated_mz", "accession": "mp_accessions", - "unique": "unique", + #"unique": "unique", "charge": "precursor_charge", "exp_mass_to_charge": "observed_mz", "retention_time": "rt", @@ -75,3 +77,12 @@ MAXQUANT_USECOLS = list(MAXQUANT_MAP.keys()) QUANTMSIO_VERSION = __version__ + +PSM_SCHEMA = pa.schema( + PSM_FIELDS, + metadata={"description": "psm file in quantms.io format"}, +) +FEATURE_SCHEMA = pa.schema( + FEATURE_FIELDS, + metadata={"description": "feature file in quantms.io format"}, +) \ No newline at end of file diff --git a/quantmsio/core/feature.py b/quantmsio/core/feature.py index 1676bc5..35a3c65 100644 --- a/quantmsio/core/feature.py +++ b/quantmsio/core/feature.py @@ -14,12 +14,6 @@ ) from quantmsio.utils.constants import ITRAQ_CHANNEL, TMT_CHANNELS from quantmsio.core.common import MSSTATS_MAP, MSSTATS_USECOLS, SDRF_USECOLS, SDRF_MAP, QUANTMSIO_VERSION -from quantmsio.core.format import FEATURE_FIELDS - -FEATURE_SCHEMA = pa.schema( - FEATURE_FIELDS, - metadata={"description": "feature file in quantms.io format"}, -) class Feature(MzTab): diff --git a/quantmsio/core/format.py b/quantmsio/core/format.py index 8eeb5d0..436c10d 100644 --- a/quantmsio/core/format.py +++ b/quantmsio/core/format.py @@ -13,11 +13,6 @@ ), pa.field( "modifications", - pa.list_(pa.string()), - metadata={"description": "List of modifications as string array, easy for search and filter"}, - ), - pa.field( - "modification_details", pa.list_( pa.struct( [ @@ -43,11 +38,6 @@ pa.float32(), metadata={"description": "Posterior error probability for the given peptide spectrum match"}, ), - pa.field( - "global_qvalue", - pa.float32(), - metadata={"description": "Global q-value of the peptide or psm at the level of the experiment"}, - ), pa.field( "is_decoy", pa.int32(), @@ -77,18 +67,6 @@ pa.list_(pa.struct([("name", pa.string()), ("value", pa.float32())])), metadata={"description": "List of structures, each structure contains two fields: name and value"}, ), - pa.field( - "unique", - pa.int32(), - metadata={ - "description": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0" - }, - ), - pa.field( - "pg_global_qvalue", - pa.float32(), - metadata={"description": "Global q-value of the protein group at the experiment level"}, - ), pa.field( "mp_accessions", pa.list_(pa.string()), @@ -117,21 +95,11 @@ ] PSM_UNIQUE_FIELDS = [ - pa.field( - "consensus_support", - pa.float32(), - metadata={ - "description": "Consensus support for the given peptide spectrum match, when multiple search engines are used" - }, - ), pa.field( "rt", pa.float32(), metadata={"description": "MS2 scan’s precursor retention time (in seconds)"}, ), - pa.field( - "rank", pa.int32(), metadata={"description": "Rank of the peptide spectrum match in the search engine output"} - ), pa.field( "ion_mobility", pa.float32(), @@ -278,3 +246,35 @@ PSM_FIELDS = PEPTIDE_FIELDS + PSM_UNIQUE_FIELDS FEATURE_FIELDS = PEPTIDE_FIELDS + FEATURE_UNIQUE_FIELDS + +# pa.field( +# "modifications", +# pa.list_(pa.string()), +# metadata={"description": "List of modifications as string array, easy for search and filter"}, +# ), + +# pa.field( +# "pg_global_qvalue", +# pa.float32(), +# metadata={"description": "Global q-value of the protein group at the experiment level"}, +# ), + +# pa.field( +# "consensus_support", +# pa.float32(), +# metadata={ +# "description": "Consensus support for the given peptide spectrum match, when multiple search engines are used" +# }, +# ), + +# pa.field( +# "unique", +# pa.int32(), +# metadata={ +# "description": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0" +# }, +# ), + +# pa.field( +# "rank", pa.int32(), metadata={"description": "Rank of the peptide spectrum match in the search engine output"} +# ), \ No newline at end of file diff --git a/quantmsio/core/maxquant.py b/quantmsio/core/maxquant.py index 8541688..5f14262 100644 --- a/quantmsio/core/maxquant.py +++ b/quantmsio/core/maxquant.py @@ -6,7 +6,7 @@ import pyarrow.parquet as pq from quantmsio.core.sdrf import SDRFHandler from pyopenms import AASequence -from quantmsio.operate.tools import get_ahocorasick, get_modification_details +from quantmsio.operate.tools import get_ahocorasick, get_modification_details, get_mod_map from pyopenms.Constants import PROTON_MASS_U from quantmsio.utils.pride_utils import get_peptidoform_proforma_version_in_mztab from quantmsio.core.common import MAXQUANT_MAP, MAXQUANT_USECOLS @@ -49,24 +49,6 @@ def find_modification(peptide): return original_mods -def get_mod_map(sdrf_path): - sdrf = pd.read_csv(sdrf_path, sep="\t", nrows=1) - mod_cols = [col for col in sdrf.columns if col.startswith("comment[modification parameters]")] - mod_map = {} - for col in mod_cols: - mod_msg = sdrf[col].values[0].split(";") - mod_dict = {k.split("=")[0]: k.split("=")[1] for k in mod_msg} - if "TA" in mod_dict: - mod = f"{mod_dict['NT']} ({mod_dict['TA']})" - elif "PP" in mod_dict: - mod = f"{mod_dict['NT']} ({mod_dict['PP']})" - else: - mod = mod_dict["NT"] - mod_map[mod] = mod_dict["AC"] - - return mod_map - - def generate_mods(row, mod_map): mod_seq = row["Modified sequence"] mod_p = find_modification(mod_seq) diff --git a/quantmsio/core/mztab.py b/quantmsio/core/mztab.py index 0c5df08..fcd39a4 100644 --- a/quantmsio/core/mztab.py +++ b/quantmsio/core/mztab.py @@ -1,8 +1,10 @@ import codecs import os import pandas as pd +import re +from pyopenms import ModificationsDB from quantmsio.utils.pride_utils import get_quantmsio_modifications - +from quantmsio.operate.tools import get_modification_details def generate_modification_list(modification_str: str, modifications): @@ -221,3 +223,38 @@ def get_modifications(self): line = f.readline() f.close() return mod_dict + + def get_mods_map(self): + if os.stat(self.mztab_path).st_size == 0: + raise ValueError("File is empty") + f = codecs.open(self.mztab_path, "r", "utf-8") + line = f.readline() + mods_map = {} + modifications_db = ModificationsDB() + while line.startswith("MTD"): + if "software" in line and "_modifications" in line: + parts = line.split("\t") + mods = parts[2].split(":")[1].strip().split(",") + for mod in mods: + match = re.search(r'\((.*?)\)', mod) + mod = re.search(r'^[a-zA-Z]+', mod) + if match: + site = match.group(1) + else: + site = "X" + mod = mod.group(0) + Mod = modifications_db.getModification(mod) + unimod = Mod.getUniModAccession() + mods_map[mod] = [unimod.upper(), site] + mods_map[unimod.upper()] = [mod, site] + line = f.readline() + f.close() + return mods_map + + def generate_modifications_details(self, seq, mods_map, automaton): + seq = seq.replace('.','') + modification_details = get_modification_details(seq, mods_map, automaton) + if(len(modification_details) == 0): + return None + else: + return modification_details \ No newline at end of file diff --git a/quantmsio/core/psm.py b/quantmsio/core/psm.py index fedea02..66c872f 100644 --- a/quantmsio/core/psm.py +++ b/quantmsio/core/psm.py @@ -3,24 +3,19 @@ from quantmsio.utils.file_utils import extract_protein_list from quantmsio.utils.pride_utils import generate_scan_number from quantmsio.utils.pride_utils import get_peptidoform_proforma_version_in_mztab -from quantmsio.core.common import PSM_USECOLS, PSM_MAP, QUANTMSIO_VERSION +from quantmsio.operate.tools import get_ahocorasick, get_modification_details, get_mod_map +from quantmsio.core.common import PSM_USECOLS, PSM_MAP, PSM_SCHEMA from quantmsio.core.mztab import MzTab, generate_modification_list -from quantmsio.core.format import PSM_FIELDS import pandas as pd -PSM_SCHEMA = pa.schema( - PSM_FIELDS, - metadata={"description": "psm file in quantms.io format"}, -) - - class Psm(MzTab): def __init__(self, mzTab_path): super(Psm, self).__init__(mzTab_path) self._ms_runs = self.extract_ms_runs() self._protein_global_qvalue_map = self.get_protein_map() - self._modifications = self.get_modifications() self._score_names = self.get_score_names() + self._mods_map = self.get_mods_map() + self._automaton = get_ahocorasick(self._mods_map) def iter_psm_table(self, chunksize=1000000, protein_str=None): for df in self.skip_and_load_csv("PSH", chunksize=chunksize): @@ -28,10 +23,7 @@ def iter_psm_table(self, chunksize=1000000, protein_str=None): df = df[df["accession"].str.contains(f"{protein_str}", na=False)] no_cols = set(PSM_USECOLS) - set(df.columns) for col in no_cols: - if col == "unique": - df.loc[:, col] = df["accession"].apply(lambda x: 0 if ";" in x else 1) - else: - df.loc[:, col] = None + df.loc[:, col] = None df.rename(columns=PSM_MAP, inplace=True) yield df @@ -52,11 +44,12 @@ def generate_report(self, chunksize=1000000, protein_str=None): for df in self.iter_psm_table(chunksize=chunksize, protein_str=protein_str): self.transform_psm(df) self.add_addition_msg(df) - self.convert_to_parquet_format(df, self._modifications) + self.convert_to_parquet_format(df) df = self.transform_parquet(df) yield df def transform_psm(self, df): + modifications = df["peptidoform"].apply(lambda seq: self.generate_modifications_details(seq, self._mods_map, self._automaton)) df.loc[:, "scan"] = df["spectra_ref"].apply(generate_scan_number) df.loc[:, "reference_file_name"] = df["spectra_ref"].apply(lambda x: self._ms_runs[x[: x.index(":")]]) @@ -65,10 +58,11 @@ def transform_psm(self, df): ) df.loc[:, "peptidoform"] = df[["modifications", "sequence"]].apply( lambda row: get_peptidoform_proforma_version_in_mztab( - row["sequence"], row["modifications"], self._modifications + row["sequence"], row["modifications"], self._mods_map ), axis=1, ) + df.loc[:, "modifications"] = modifications df.drop(["spectra_ref", "search_engine", "search_engine_score[1]"], inplace=True, axis=1) @staticmethod @@ -83,19 +77,15 @@ def _genarate_additional_scores(self, cols): return struct_list def add_addition_msg(self, df): - df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map) + df.loc[:, "cv_params"] = None df.loc[:, "best_id_score"] = None - df.loc[:, "consensus_support"] = None - df.loc[:, "modification_details"] = None df.loc[:, "predicted_rt"] = None df.loc[:, "ion_mobility"] = None df.loc[:, "number_peaks"] = None df.loc[:, "mz_array"] = None df.loc[:, "intensity_array"] = None - df.loc[:, "rank"] = None - df.loc[:, "cv_params"] = None - def write_feature_to_file(self, output_path, chunksize=1000000, protein_file=None): + def write_psm_to_file(self, output_path, chunksize=1000000, protein_file=None): protein_list = extract_protein_list(protein_file) if protein_file else None protein_str = "|".join(protein_list) if protein_list else None pqwriter = None @@ -107,22 +97,20 @@ def write_feature_to_file(self, output_path, chunksize=1000000, protein_file=Non pqwriter.close() @staticmethod - def convert_to_parquet_format(res, modifications): + def convert_to_parquet_format(res): res["mp_accessions"] = res["mp_accessions"].str.split(";") - res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float) - res["unique"] = res["unique"].astype("Int32") - res["modifications"] = res["modifications"].apply(lambda x: generate_modification_list(x, modifications)) res["precursor_charge"] = res["precursor_charge"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32") res["calculated_mz"] = res["calculated_mz"].astype(float) res["observed_mz"] = res["observed_mz"].astype(float) res["posterior_error_probability"] = res["posterior_error_probability"].astype(float) - res["global_qvalue"] = res["global_qvalue"].astype(float) res["is_decoy"] = res["is_decoy"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32") - res["scan"] = res["scan"].astype(str) - if "rt" in res.columns: res["rt"] = res["rt"].astype(float) else: res.loc[:, "rt"] = None - # return pa.Table.from_pandas(res, schema=PSM_SCHEMA) + +#df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map) +#res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float) +#res["unique"] = res["unique"].astype("Int32") +#res["global_qvalue"] = res["global_qvalue"].astype(float) \ No newline at end of file diff --git a/quantmsio/operate/tools.py b/quantmsio/operate/tools.py index f4d645d..2f1a226 100644 --- a/quantmsio/operate/tools.py +++ b/quantmsio/operate/tools.py @@ -6,8 +6,7 @@ from Bio import SeqIO from quantmsio.core.project import ProjectHandler import ahocorasick -from quantmsio.core.feature import FEATURE_SCHEMA -from quantmsio.core.psm import PSM_SCHEMA +from quantmsio.core.common import FEATURE_SCHEMA, PSM_SCHEMA from quantmsio.operate.query import Query from quantmsio.utils.pride_utils import get_unanimous_name from quantmsio.utils.file_utils import read_large_parquet @@ -173,21 +172,25 @@ def load_de_or_ae(path): return pd.read_csv(f, sep="\t"), content -def get_modification_details(seq: str, mods_dict: dict, automaton: any, modifications: list = None): +def get_modification_details(seq: str, mods_dict: dict, automaton: any): if "(" not in seq: return [] total = 0 - modification_details = [{"name": mods_dict[key], "fields": []} for key in modifications] + modifications = [] + modification_details = [] for item in automaton.iter(seq): total += len(item[1]) modification = item[1][1:-1] + position = item[0] - total + 1 if modification in modifications: index = modifications.index(modification) - position = item[0] - total + 1 modification_details[index]["fields"].append({"position": position, "localization_probability": 1.0}) + else: + modifications.append(modification) + modification_details.append({"name": mods_dict[modification][0], "fields": [{"position": position, "localization_probability": 1.0}]}) + return modification_details - def get_ahocorasick(mods_dict: dict): automaton = ahocorasick.Automaton() for key in mods_dict.keys(): @@ -195,3 +198,21 @@ def get_ahocorasick(mods_dict: dict): automaton.add_word(key, key) automaton.make_automaton() return automaton + + +def get_mod_map(sdrf_path): + sdrf = pd.read_csv(sdrf_path, sep="\t", nrows=1) + mod_cols = [col for col in sdrf.columns if col.startswith("comment[modification parameters]")] + mod_map = {} + for col in mod_cols: + mod_msg = sdrf[col].values[0].split(";") + mod_dict = {k.split("=")[0]: k.split("=")[1] for k in mod_msg} + if "TA" in mod_dict: + mod = f"{mod_dict['NT']} ({mod_dict['TA']})" + elif "PP" in mod_dict: + mod = f"{mod_dict['NT']} ({mod_dict['PP']})" + else: + mod = mod_dict["NT"] + mod_map[mod] = mod_dict["AC"] + + return mod_map \ No newline at end of file