Skip to content

Commit

Permalink
update: psm
Browse files (browse the repository at this point in the history)
  • Loading branch information
zprobot committed Oct 23, 2024
1 parent 45bbb2f commit 3234963
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 25 deletions.
2 changes: 1 addition & 1 deletion quantmsio/commands/psm_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def convert_psm_file(

psm_manager = Psm(mzTab_path=mztab_file)
output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet")
psm_manager.write_feature_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file)
psm_manager.write_psm_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file)


@click.command("compare-set-psms", short_help="plot venn for a set of Psms parquet")
Expand Down
5 changes: 3 additions & 2 deletions quantmsio/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
PSM_MAP = {
"sequence": "sequence",
"modifications": "modifications",
"opt_global_cv_MS:1000889_peptidoform_sequence": "peptidoform",
"opt_global_Posterior_Error_Probability_score": "posterior_error_probability",
"opt_global_q-value": "global_qvalue",
#"opt_global_q-value": "global_qvalue",
"opt_global_cv_MS:1002217_decoy_peptide": "is_decoy",
"calc_mass_to_charge": "calculated_mz",
"accession": "mp_accessions",
"unique": "unique",
#"unique": "unique",
"charge": "precursor_charge",
"exp_mass_to_charge": "observed_mz",
"retention_time": "rt",
Expand Down
1 change: 1 addition & 0 deletions quantmsio/core/mztab.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ def get_mods_map(self):
Mod = modifications_db.getModification(mod)
unimod = Mod.getUniModAccession()
mods_map[mod] = [unimod.upper(), site]
mods_map[unimod.upper()] = [mod, site]
line = f.readline()
f.close()
return mods_map
Expand Down
35 changes: 13 additions & 22 deletions quantmsio/core/psm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ def __init__(self, mzTab_path):
super(Psm, self).__init__(mzTab_path)
self._ms_runs = self.extract_ms_runs()
self._protein_global_qvalue_map = self.get_protein_map()
#self._modifications = self.get_modifications()
self._score_names = self.get_score_names()
self._mods_map = self.get_mods_map()
self._automaton = get_ahocorasick(self._mods_map)
Expand All @@ -24,10 +23,7 @@ def iter_psm_table(self, chunksize=1000000, protein_str=None):
df = df[df["accession"].str.contains(f"{protein_str}", na=False)]
no_cols = set(PSM_USECOLS) - set(df.columns)
for col in no_cols:
if col == "unique":
df.loc[:, col] = df["accession"].apply(lambda x: 0 if ";" in x else 1)
else:
df.loc[:, col] = None
df.loc[:, col] = None
df.rename(columns=PSM_MAP, inplace=True)
yield df

Expand All @@ -48,12 +44,12 @@ def generate_report(self, chunksize=1000000, protein_str=None):
for df in self.iter_psm_table(chunksize=chunksize, protein_str=protein_str):
self.transform_psm(df)
self.add_addition_msg(df)
self.convert_to_parquet_format(df, self._modifications)
self.convert_to_parquet_format(df)
df = self.transform_parquet(df)
yield df

def transform_psm(self, df):
df.loc[:, "modifications"] = df["peptidoform"].apply(lambda row: self.generate_modifications_details(row["peptidoform"], self._mods_map, self._automaton),axis=1)
modifications = df["peptidoform"].apply(lambda seq: self.generate_modifications_details(seq, self._mods_map, self._automaton))
df.loc[:, "scan"] = df["spectra_ref"].apply(generate_scan_number)

df.loc[:, "reference_file_name"] = df["spectra_ref"].apply(lambda x: self._ms_runs[x[: x.index(":")]])
Expand All @@ -62,10 +58,11 @@ def transform_psm(self, df):
)
df.loc[:, "peptidoform"] = df[["modifications", "sequence"]].apply(
lambda row: get_peptidoform_proforma_version_in_mztab(
row["sequence"], row["modifications"], self._modifications
row["sequence"], row["modifications"], self._mods_map
),
axis=1,
)
df.loc[:, "modifications"] = modifications
df.drop(["spectra_ref", "search_engine", "search_engine_score[1]"], inplace=True, axis=1)

@staticmethod
Expand All @@ -80,19 +77,15 @@ def _genarate_additional_scores(self, cols):
return struct_list

def add_addition_msg(self, df):
df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map)
df.loc[:, "cv_params"] = None
df.loc[:, "best_id_score"] = None
df.loc[:, "consensus_support"] = None
df.loc[:, "modification_details"] = None
df.loc[:, "predicted_rt"] = None
df.loc[:, "ion_mobility"] = None
df.loc[:, "number_peaks"] = None
df.loc[:, "mz_array"] = None
df.loc[:, "intensity_array"] = None
df.loc[:, "rank"] = None
df.loc[:, "cv_params"] = None

def write_feature_to_file(self, output_path, chunksize=1000000, protein_file=None):
def write_psm_to_file(self, output_path, chunksize=1000000, protein_file=None):
protein_list = extract_protein_list(protein_file) if protein_file else None
protein_str = "|".join(protein_list) if protein_list else None
pqwriter = None
Expand All @@ -104,22 +97,20 @@ def write_feature_to_file(self, output_path, chunksize=1000000, protein_file=Non
pqwriter.close()

@staticmethod
def convert_to_parquet_format(res, modifications):
def convert_to_parquet_format(res):
res["mp_accessions"] = res["mp_accessions"].str.split(";")
res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float)
res["unique"] = res["unique"].astype("Int32")
#res["modifications"] = res["modifications"].apply(lambda x: generate_modification_list(x, modifications))
res["precursor_charge"] = res["precursor_charge"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32")
res["calculated_mz"] = res["calculated_mz"].astype(float)
res["observed_mz"] = res["observed_mz"].astype(float)
res["posterior_error_probability"] = res["posterior_error_probability"].astype(float)
res["global_qvalue"] = res["global_qvalue"].astype(float)
res["is_decoy"] = res["is_decoy"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32")

res["scan"] = res["scan"].astype(str)

if "rt" in res.columns:
res["rt"] = res["rt"].astype(float)
else:
res.loc[:, "rt"] = None
# return pa.Table.from_pandas(res, schema=PSM_SCHEMA)

#df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map)
#res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float)
#res["unique"] = res["unique"].astype("Int32")
#res["global_qvalue"] = res["global_qvalue"].astype(float)

0 comments on commit 3234963

Please sign in to comment.