Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev library formatting - decoy generation implementation #71

Closed
wants to merge 13 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 195 additions & 0 deletions src/zodiaq/loaders/library/decoyGenerationFunctions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
import re
import random
from pyteomics import mass
from zodiaq.utils import format_protein_string_to_list, format_protein_list_to_string
from zodiaq.loaders.library.modificationMassDict import modificationMassDict

nonCleavageAminoAcids = [
"A",
"N",
"D",
"C",
"E",
"Q",
"G",
"H",
"I",
"L",
"M",
"F",
"S",
"T",
"W",
"Y",
"V",
]
cleavageAminoAcids = set(["K", "P", "R"])
modificationRegexStr = r"([A-Z](?:[\[\(][^\)\]]+[\]\)])?)"


def shuffle_peptide_sequence_with_preserved_cleavage_points(peptide):
peptide = re.sub("^(?:[\[\(][^\)\]]+[\]\)])", "", peptide)
originalPeptide = re.findall(modificationRegexStr, peptide)
cleavageAALocations = [
(i - len(originalPeptide) + 1, char)
for (i, char) in enumerate(originalPeptide)
if char in cleavageAminoAcids
]
otherAAs = [char for char in originalPeptide if char not in cleavageAminoAcids]
for i in range(100):
shuffledPeptide = shuffle_non_cleavage_amino_acids(otherAAs, i)
shuffledPeptide = insert_cleavage_amino_acids_into_shuffled_peptide(
shuffledPeptide, cleavageAALocations, originalPeptide
)
if (
calculate_similarities_between_strings(originalPeptide, shuffledPeptide)
< 0.7
):
return "".join(shuffledPeptide)
if calculate_similarities_between_strings(originalPeptide, shuffledPeptide) >= 0.7:
return peptide
return "".join(shuffledPeptide)


def shuffle_non_cleavage_amino_acids(otherAAs, i):
shuffledPeptide = otherAAs[:]
random.shuffle(shuffledPeptide)
if i % 10 == 0:
randomly_swap_single_amino_acid_in_shuffled_peptide_to_increase_variability(
shuffledPeptide
)
return shuffledPeptide


def randomly_swap_single_amino_acid_in_shuffled_peptide_to_increase_variability(
shuffledPeptide,
):
randomIdx = random.randint(0, len(shuffledPeptide) - 1)
shuffledPeptide[randomIdx] = random.choice(nonCleavageAminoAcids)
return shuffledPeptide


def insert_cleavage_amino_acids_into_shuffled_peptide(
shuffledPeptide, cleavageAALocations, peptide
):
for j in range(len(cleavageAALocations) - 1, -1, -1):
cleavageAAIndex = cleavageAALocations[j][0]
if not cleavageAAIndex:
cleavageAAIndex = len(peptide)
cleavageAA = cleavageAALocations[j][1]
shuffledPeptide.insert(cleavageAAIndex, cleavageAA)
return shuffledPeptide


def insert_value(string, index, char):
return string[:index] + [char] + string[index + 1 :]


def calculate_similarities_between_strings(s1, s2):
minLength = min(len(s1), len(s2))
similarities = 0
for i in range(minLength):
if s1[i] == s2[i]:
similarities += 1
return similarities / minLength


def calculate_ion_mz(sequence, type, position, charge):
terminalSequence = determine_terminal_end_of_sequence(sequence, type, position)
mass = calculate_molecular_mass_of_sequence(terminalSequence, type, charge)
return mass


def determine_terminal_end_of_sequence(sequence, type, position):
modifiedSequence = re.findall(modificationRegexStr, sequence)
if type == "y":
return "".join(modifiedSequence[-position:])
else:
return "".join(modifiedSequence[:position])


def calculate_molecular_mass_of_sequence(sequence, type, charge):
modifications = re.findall(r"([\[\(][^\)\]]+[\]\)])", sequence)
unmodifiedSequence = re.sub(r"([\[\(][^\)\]]+[\]\)])", "", sequence)
unmodifiedMass = mass.calculate_mass(
sequence=unmodifiedSequence, ion_type=type, charge=charge
)
modificationMass = sum([modificationMassDict[m] for m in modifications])
return unmodifiedMass + modificationMass


def add_decoys_to_zodiaq_library(zodiaqLibraryDict):
decoyDict = {}
for key, value in zodiaqLibraryDict.items():
decoyKey, decoyValue = create_decoy_from_target_in_zodiaq_library(
key, value.copy()
)
if decoyKey:
decoyDict[decoyKey] = decoyValue
outputDict = recalculate_zodiaq_library_key_idx_to_account_for_inserted_decoys(
decoyDict | zodiaqLibraryDict
)
return outputDict


def create_decoy_from_target_in_zodiaq_library(targetKey, targetValue):
targetPeptide = targetKey[1]
targetPeaks = targetValue["peaks"]
fragmentTypes = targetValue["fragmentTypes"]
decoyPeptide = shuffle_peptide_sequence_with_preserved_cleavage_points(
targetPeptide
)
if decoyPeptide == targetPeptide:
return None, None
decoyMzs = [
calculate_ion_mz(decoyPeptide, *fragmentType) for fragmentType in fragmentTypes
]
decoyPeaks = [(decoyMzs[i], targetPeaks[i][1], -1) for i in range(len(targetPeaks))]
targetProteinString = targetValue["proteinName"]
decoyProteins = format_protein_string_to_list(targetProteinString)
decoyProteins = [f"DECOY_{protein}" for protein in decoyProteins]
decoyProteinString = format_protein_list_to_string(decoyProteins)
decoyValue = targetValue
decoyValue["identification"] = f"{decoyPeptide}_{targetKey[0]}_DECOY"
decoyValue["proteinName"] = decoyProteinString
decoyValue["peaks"] = decoyPeaks
decoyValue["isDecoy"] = 1
decoyValue["zodiaqKeyIdx"] = -1
return (targetKey[0], decoyPeptide), decoyValue


def recalculate_zodiaq_library_key_idx_to_account_for_inserted_decoys(outputDict):
sortedKeys = sorted(outputDict)
for i in range(len(sortedKeys)):
key = sortedKeys[i]
value = outputDict[key]
value["peaks"] = [(x[0], x[1], i) for x in value["peaks"]]
value["zodiaqKeyIdx"] = i
outputDict[key] = value
return outputDict


def determine_if_decoys_should_be_generated(zodiaqLibraryDict):
if there_are_any_decoys_in_library(zodiaqLibraryDict):
return False
if any_library_entries_dont_have_fragment_type_data_requirement_for_decoy_generation(
zodiaqLibraryDict
):
return False
return True


def there_are_any_decoys_in_library(zodiaqLibraryDict):
for key, value in zodiaqLibraryDict.items():
if value["isDecoy"]:
return True
return False


def any_library_entries_dont_have_fragment_type_data_requirement_for_decoy_generation(
zodiaqLibraryDict,
):
for key, value in zodiaqLibraryDict.items():
if "fragmentTypes" not in value:
return True
return False
6 changes: 4 additions & 2 deletions src/zodiaq/loaders/library/libraryLoaderContext.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def __init__(self, libraryFilePath: os.PathLike):
"The .msp library format is not currently supported. If the library file was generated via prosit, please reset the output into a tab-delimited (.tsv) format."
)

def load_zodiaq_library_dict(self):
def load_zodiaq_library_dict(self, isTest=False):
"""See 'load_zodiaq_library_dict_from_file' in 'LibraryLoaderStrategy'"""
return self._strategy.load_zodiaq_library_dict_from_file(self._libraryFilePath)
return self._strategy.load_zodiaq_library_dict_from_file(
self._libraryFilePath, isTest
)
16 changes: 14 additions & 2 deletions src/zodiaq/loaders/library/libraryLoaderStrategy.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from abc import ABC, abstractmethod
import os
from zodiaq.loaders.library.decoyGenerationFunctions import (
determine_if_decoys_should_be_generated,
add_decoys_to_zodiaq_library,
)
import random


class LibraryLoaderStrategy(ABC):
Expand Down Expand Up @@ -60,7 +65,9 @@ def _format_raw_library_object_into_zodiaq_library_dict(self) -> dict:
"""
pass

def load_zodiaq_library_dict_from_file(self, libraryFilePath: os.PathLike) -> dict:
def load_zodiaq_library_dict_from_file(
self, libraryFilePath: os.PathLike, isTest
) -> dict:
"""
Outputs a standardized dictionary object from a library file.
The LibraryLoaderContext class calls this function, which uses
Expand All @@ -77,8 +84,12 @@ def load_zodiaq_library_dict_from_file(self, libraryFilePath: os.PathLike) -> di
zodiaqLibDict : dict
see _format_raw_library_object_into_zodiaq_library_dict return value.
"""
random.seed(0)
self._load_raw_library_object_from_file(libraryFilePath)
return self._format_raw_library_object_into_zodiaq_library_dict()
zodiaqLibDict = self._format_raw_library_object_into_zodiaq_library_dict()
if determine_if_decoys_should_be_generated(zodiaqLibDict) and not isTest:
zodiaqLibDict = add_decoys_to_zodiaq_library(zodiaqLibDict)
return zodiaqLibDict


def create_peaks_from_mz_intensity_lists_and_zodiaq_key_id(
Expand Down Expand Up @@ -139,4 +150,5 @@ def remove_low_intensity_peaks_below_max_peak_num(peaks: list, maxPeakNum: int)
"peaks": "peaks",
"zodiaqKeyIdx": "zodiaqKeyIdx",
"isDecoy": "isDecoy",
"fragmentTypes": "fragmentTypes",
}
42 changes: 39 additions & 3 deletions src/zodiaq/loaders/library/libraryLoaderStrategyTable.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@ def _load_raw_library_object_from_file(self, libraryFilePath: os.PathLike) -> No
self.oldToNewColumnDict.keys(), self.rawUploadedLibraryObject.columns
)

def _format_raw_library_object_into_zodiaq_library_dict(self) -> dict:
def _format_raw_library_object_into_zodiaq_library_dict(
self, maxPeakNum=10
) -> dict:
"""abstract class implementation - see 'libraryLoaderStrategy.py' for details"""
maxPeakNum = 10
reformattedLibDf = _reformat_raw_library_object_columns(
self.rawUploadedLibraryObject, self.oldToNewColumnDict
)
Expand Down Expand Up @@ -93,16 +94,40 @@ def _organize_data_by_zodiaq_library_dict_keys(df: pd.DataFrame) -> dict:
keys = sorted(set(df["zodiaqLibKey"]))
mz = df.groupby("zodiaqLibKey")["peakMz"].apply(list).to_dict()
intensities = df.groupby("zodiaqLibKey")["peakIntensity"].apply(list).to_dict()
fragmentTypes = (
df.groupby("zodiaqLibKey")
.apply(
lambda group: list(
zip(
group["fragmentType"],
group["fragmentNumber"],
group["fragmentCharge"],
)
)
)
.to_dict()
)
df.drop_duplicates(subset="zodiaqLibKey", inplace=True)
df.set_index("zodiaqLibKey", drop=True, inplace=True)
df.drop(
["precursorMz", "peakMz", "peptideName", "peakIntensity"], axis=1, inplace=True
[
"precursorMz",
"peakMz",
"peptideName",
"peakIntensity",
"fragmentType",
"fragmentNumber",
"fragmentCharge",
],
axis=1,
inplace=True,
)
metadata = df.to_dict(orient="index")
return {
"zodiaqKeys": keys,
"mz": mz,
"intensities": intensities,
"fragmentTypes": fragmentTypes,
"metadata": metadata,
}

Expand All @@ -116,7 +141,17 @@ def _create_zodiaq_library_entry(
organizedDataDict["intensities"][zodiaqKey],
zodiaqKeyIdx,
)
fragments = list(
zip(
organizedDataDict["mz"][zodiaqKey],
organizedDataDict["intensities"][zodiaqKey],
organizedDataDict["fragmentTypes"][zodiaqKey],
)
)
reducedPeaks = remove_low_intensity_peaks_below_max_peak_num(peaks, maxPeakNum)
reducedFragments = remove_low_intensity_peaks_below_max_peak_num(
fragments, maxPeakNum
)
isDecoy = int(
"decoy" in organizedDataDict["metadata"][zodiaqKey]["proteinName"].lower()
)
Expand All @@ -133,6 +168,7 @@ def _create_zodiaq_library_entry(
finalVariableNames["peaks"]: sorted(reducedPeaks),
finalVariableNames["zodiaqKeyIdx"]: zodiaqKeyIdx,
finalVariableNames["isDecoy"]: isDecoy,
finalVariableNames["fragmentTypes"]: [x[-1] for x in sorted(reducedFragments)],
}


Expand Down
12 changes: 12 additions & 0 deletions src/zodiaq/loaders/library/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
"precursorCharge",
"identification",
"proteinName",
"fragmentType",
"fragmentNumber",
"fragmentCharge",
]

oldColumnsSpectrast = [
Expand All @@ -16,6 +19,9 @@
"PrecursorCharge",
"transition_group_id",
"ProteinName",
"FragmentType",
"FragmentSeriesNumber",
"FragmentCharge",
]

oldColumnsFragpipe = [
Expand All @@ -26,6 +32,9 @@
"PrecursorCharge",
"PeptideSequence",
"ProteinId",
"FragmentType",
"FragmentSeriesNumber",
"FragmentCharge",
]

oldColumnsProsit = [
Expand All @@ -36,4 +45,7 @@
"PrecursorCharge",
"StrippedPeptide",
"FragmentLossType",
"FragmentType",
"FragmentNumber",
"FragmentCharge",
]
Loading