diff --git a/quantmsio/commands/psm_command.py b/quantmsio/commands/psm_command.py index 703488d..0f88a99 100644 --- a/quantmsio/commands/psm_command.py +++ b/quantmsio/commands/psm_command.py @@ -56,7 +56,7 @@ def convert_psm_file( psm_manager = Psm(mzTab_path=mztab_file) output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet") - psm_manager.write_feature_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file) + psm_manager.write_psm_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file) @click.command("compare-set-psms", short_help="plot venn for a set of Psms parquet") diff --git a/quantmsio/core/common.py b/quantmsio/core/common.py index 501caa2..2c67955 100644 --- a/quantmsio/core/common.py +++ b/quantmsio/core/common.py @@ -4,12 +4,13 @@ PSM_MAP = { "sequence": "sequence", "modifications": "modifications", + "opt_global_cv_MS:1000889_peptidoform_sequence": "peptidoform", "opt_global_Posterior_Error_Probability_score": "posterior_error_probability", - "opt_global_q-value": "global_qvalue", + #"opt_global_q-value": "global_qvalue", "opt_global_cv_MS:1002217_decoy_peptide": "is_decoy", "calc_mass_to_charge": "calculated_mz", "accession": "mp_accessions", - "unique": "unique", + #"unique": "unique", "charge": "precursor_charge", "exp_mass_to_charge": "observed_mz", "retention_time": "rt", diff --git a/quantmsio/core/mztab.py b/quantmsio/core/mztab.py index 2f6eebf..fcd39a4 100644 --- a/quantmsio/core/mztab.py +++ b/quantmsio/core/mztab.py @@ -246,6 +246,7 @@ def get_mods_map(self): Mod = modifications_db.getModification(mod) unimod = Mod.getUniModAccession() mods_map[mod] = [unimod.upper(), site] + mods_map[unimod.upper()] = [mod, site] line = f.readline() f.close() return mods_map diff --git a/quantmsio/core/psm.py b/quantmsio/core/psm.py index 40e12f0..66c872f 100644 --- a/quantmsio/core/psm.py +++ b/quantmsio/core/psm.py @@ -13,7 +13,6 @@ def __init__(self, mzTab_path): super(Psm, self).__init__(mzTab_path) self._ms_runs = self.extract_ms_runs() self._protein_global_qvalue_map = self.get_protein_map() - #self._modifications = self.get_modifications() self._score_names = self.get_score_names() self._mods_map = self.get_mods_map() self._automaton = get_ahocorasick(self._mods_map) @@ -24,10 +23,7 @@ def iter_psm_table(self, chunksize=1000000, protein_str=None): df = df[df["accession"].str.contains(f"{protein_str}", na=False)] no_cols = set(PSM_USECOLS) - set(df.columns) for col in no_cols: - if col == "unique": - df.loc[:, col] = df["accession"].apply(lambda x: 0 if ";" in x else 1) - else: - df.loc[:, col] = None + df.loc[:, col] = None df.rename(columns=PSM_MAP, inplace=True) yield df @@ -48,12 +44,12 @@ def generate_report(self, chunksize=1000000, protein_str=None): for df in self.iter_psm_table(chunksize=chunksize, protein_str=protein_str): self.transform_psm(df) self.add_addition_msg(df) - self.convert_to_parquet_format(df, self._modifications) + self.convert_to_parquet_format(df) df = self.transform_parquet(df) yield df def transform_psm(self, df): - df.loc[:, "modifications"] = df["peptidoform"].apply(lambda row: self.generate_modifications_details(row["peptidoform"], self._mods_map, self._automaton),axis=1) + modifications = df["peptidoform"].apply(lambda seq: self.generate_modifications_details(seq, self._mods_map, self._automaton)) df.loc[:, "scan"] = df["spectra_ref"].apply(generate_scan_number) df.loc[:, "reference_file_name"] = df["spectra_ref"].apply(lambda x: self._ms_runs[x[: x.index(":")]]) @@ -62,10 +58,11 @@ def transform_psm(self, df): ) df.loc[:, "peptidoform"] = df[["modifications", "sequence"]].apply( lambda row: get_peptidoform_proforma_version_in_mztab( - row["sequence"], row["modifications"], self._modifications + row["sequence"], row["modifications"], self._mods_map ), axis=1, ) + df.loc[:, "modifications"] = modifications df.drop(["spectra_ref", "search_engine", "search_engine_score[1]"], inplace=True, axis=1) @staticmethod @@ -80,19 +77,15 @@ def _genarate_additional_scores(self, cols): return struct_list def add_addition_msg(self, df): - df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map) + df.loc[:, "cv_params"] = None df.loc[:, "best_id_score"] = None - df.loc[:, "consensus_support"] = None - df.loc[:, "modification_details"] = None df.loc[:, "predicted_rt"] = None df.loc[:, "ion_mobility"] = None df.loc[:, "number_peaks"] = None df.loc[:, "mz_array"] = None df.loc[:, "intensity_array"] = None - df.loc[:, "rank"] = None - df.loc[:, "cv_params"] = None - def write_feature_to_file(self, output_path, chunksize=1000000, protein_file=None): + def write_psm_to_file(self, output_path, chunksize=1000000, protein_file=None): protein_list = extract_protein_list(protein_file) if protein_file else None protein_str = "|".join(protein_list) if protein_list else None pqwriter = None @@ -104,22 +97,20 @@ def write_feature_to_file(self, output_path, chunksize=1000000, protein_file=Non pqwriter.close() @staticmethod - def convert_to_parquet_format(res, modifications): + def convert_to_parquet_format(res): res["mp_accessions"] = res["mp_accessions"].str.split(";") - res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float) - res["unique"] = res["unique"].astype("Int32") - #res["modifications"] = res["modifications"].apply(lambda x: generate_modification_list(x, modifications)) res["precursor_charge"] = res["precursor_charge"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32") res["calculated_mz"] = res["calculated_mz"].astype(float) res["observed_mz"] = res["observed_mz"].astype(float) res["posterior_error_probability"] = res["posterior_error_probability"].astype(float) - res["global_qvalue"] = res["global_qvalue"].astype(float) res["is_decoy"] = res["is_decoy"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32") - res["scan"] = res["scan"].astype(str) - if "rt" in res.columns: res["rt"] = res["rt"].astype(float) else: res.loc[:, "rt"] = None - # return pa.Table.from_pandas(res, schema=PSM_SCHEMA) + +#df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map) +#res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float) +#res["unique"] = res["unique"].astype("Int32") +#res["global_qvalue"] = res["global_qvalue"].astype(float) \ No newline at end of file