update: psm

bigbio · Oct 23, 2024 · 3234963
1 parent 45bbb2f
commit 3234963
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 25 deletions.
diff --git a/quantmsio/commands/psm_command.py b/quantmsio/commands/psm_command.py
@@ -56,7 +56,7 @@ def convert_psm_file(
 
     psm_manager = Psm(mzTab_path=mztab_file)
     output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet")
-    psm_manager.write_feature_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file)
+    psm_manager.write_psm_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file)
 
 
 @click.command("compare-set-psms", short_help="plot venn for a set of Psms parquet")

diff --git a/quantmsio/core/common.py b/quantmsio/core/common.py
@@ -4,12 +4,13 @@
 PSM_MAP = {
     "sequence": "sequence",
     "modifications": "modifications",
+    "opt_global_cv_MS:1000889_peptidoform_sequence": "peptidoform",
     "opt_global_Posterior_Error_Probability_score": "posterior_error_probability",
-    "opt_global_q-value": "global_qvalue",
+    #"opt_global_q-value": "global_qvalue",
     "opt_global_cv_MS:1002217_decoy_peptide": "is_decoy",
     "calc_mass_to_charge": "calculated_mz",
     "accession": "mp_accessions",
-    "unique": "unique",
+    #"unique": "unique",
     "charge": "precursor_charge",
     "exp_mass_to_charge": "observed_mz",
     "retention_time": "rt",

diff --git a/quantmsio/core/mztab.py b/quantmsio/core/mztab.py
@@ -246,6 +246,7 @@ def get_mods_map(self):
                     Mod = modifications_db.getModification(mod)
                     unimod = Mod.getUniModAccession()
                     mods_map[mod] = [unimod.upper(), site]
+                    mods_map[unimod.upper()] = [mod, site]
             line = f.readline()
         f.close()
         return mods_map

diff --git a/quantmsio/core/psm.py b/quantmsio/core/psm.py
@@ -13,7 +13,6 @@ def __init__(self, mzTab_path):
         super(Psm, self).__init__(mzTab_path)
         self._ms_runs = self.extract_ms_runs()
         self._protein_global_qvalue_map = self.get_protein_map()
-        #self._modifications = self.get_modifications()
         self._score_names = self.get_score_names()
         self._mods_map = self.get_mods_map()
         self._automaton = get_ahocorasick(self._mods_map)
@@ -24,10 +23,7 @@ def iter_psm_table(self, chunksize=1000000, protein_str=None):
                 df = df[df["accession"].str.contains(f"{protein_str}", na=False)]
             no_cols = set(PSM_USECOLS) - set(df.columns)
             for col in no_cols:
-                if col == "unique":
-                    df.loc[:, col] = df["accession"].apply(lambda x: 0 if ";" in x else 1)
-                else:
-                    df.loc[:, col] = None
+                df.loc[:, col] = None
             df.rename(columns=PSM_MAP, inplace=True)
             yield df
 
@@ -48,12 +44,12 @@ def generate_report(self, chunksize=1000000, protein_str=None):
         for df in self.iter_psm_table(chunksize=chunksize, protein_str=protein_str):
             self.transform_psm(df)
             self.add_addition_msg(df)
-            self.convert_to_parquet_format(df, self._modifications)
+            self.convert_to_parquet_format(df)
             df = self.transform_parquet(df)
             yield df
 
     def transform_psm(self, df):
-        df.loc[:, "modifications"] = df["peptidoform"].apply(lambda row: self.generate_modifications_details(row["peptidoform"], self._mods_map, self._automaton),axis=1)
+        modifications = df["peptidoform"].apply(lambda seq: self.generate_modifications_details(seq, self._mods_map, self._automaton))
         df.loc[:, "scan"] = df["spectra_ref"].apply(generate_scan_number)
 
         df.loc[:, "reference_file_name"] = df["spectra_ref"].apply(lambda x: self._ms_runs[x[: x.index(":")]])
@@ -62,10 +58,11 @@ def transform_psm(self, df):
         )
         df.loc[:, "peptidoform"] = df[["modifications", "sequence"]].apply(
             lambda row: get_peptidoform_proforma_version_in_mztab(
-                row["sequence"], row["modifications"], self._modifications
+                row["sequence"], row["modifications"], self._mods_map
             ),
             axis=1,
         )
+        df.loc[:, "modifications"] = modifications
         df.drop(["spectra_ref", "search_engine", "search_engine_score[1]"], inplace=True, axis=1)
 
     @staticmethod
@@ -80,19 +77,15 @@ def _genarate_additional_scores(self, cols):
         return struct_list
 
     def add_addition_msg(self, df):
-        df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map)
+        df.loc[:, "cv_params"] = None
         df.loc[:, "best_id_score"] = None
-        df.loc[:, "consensus_support"] = None
-        df.loc[:, "modification_details"] = None
         df.loc[:, "predicted_rt"] = None
         df.loc[:, "ion_mobility"] = None
         df.loc[:, "number_peaks"] = None
         df.loc[:, "mz_array"] = None
         df.loc[:, "intensity_array"] = None
-        df.loc[:, "rank"] = None
-        df.loc[:, "cv_params"] = None
 
-    def write_feature_to_file(self, output_path, chunksize=1000000, protein_file=None):
+    def write_psm_to_file(self, output_path, chunksize=1000000, protein_file=None):
         protein_list = extract_protein_list(protein_file) if protein_file else None
         protein_str = "|".join(protein_list) if protein_list else None
         pqwriter = None
@@ -104,22 +97,20 @@ def write_feature_to_file(self, output_path, chunksize=1000000, protein_file=Non
             pqwriter.close()
 
     @staticmethod
-    def convert_to_parquet_format(res, modifications):
+    def convert_to_parquet_format(res):
         res["mp_accessions"] = res["mp_accessions"].str.split(";")
-        res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float)
-        res["unique"] = res["unique"].astype("Int32")
-        #res["modifications"] = res["modifications"].apply(lambda x: generate_modification_list(x, modifications))
         res["precursor_charge"] = res["precursor_charge"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32")
         res["calculated_mz"] = res["calculated_mz"].astype(float)
         res["observed_mz"] = res["observed_mz"].astype(float)
         res["posterior_error_probability"] = res["posterior_error_probability"].astype(float)
-        res["global_qvalue"] = res["global_qvalue"].astype(float)
         res["is_decoy"] = res["is_decoy"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32")
-
         res["scan"] = res["scan"].astype(str)
-
         if "rt" in res.columns:
             res["rt"] = res["rt"].astype(float)
         else:
             res.loc[:, "rt"] = None
-        # return pa.Table.from_pandas(res, schema=PSM_SCHEMA)
+
+#df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map)
+#res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float)
+#res["unique"] = res["unique"].astype("Int32")
+#res["global_qvalue"] = res["global_qvalue"].astype(float)