xomicsdatascience · CCranney · Nov 16, 2023 · Nov 16, 2023 · Nov 17, 2023 · Nov 17, 2023
diff --git a/src/zodiaq/loaders/library/decoyGenerationFunctions.py b/src/zodiaq/loaders/library/decoyGenerationFunctions.py
@@ -0,0 +1,195 @@
+import re
+import random
+from pyteomics import mass
+from zodiaq.utils import format_protein_string_to_list, format_protein_list_to_string
+from zodiaq.loaders.library.modificationMassDict import modificationMassDict
+
+nonCleavageAminoAcids = [
+    "A",
+    "N",
+    "D",
+    "C",
+    "E",
+    "Q",
+    "G",
+    "H",
+    "I",
+    "L",
+    "M",
+    "F",
+    "S",
+    "T",
+    "W",
+    "Y",
+    "V",
+]
+cleavageAminoAcids = set(["K", "P", "R"])
+modificationRegexStr = r"([A-Z](?:[\[\(][^\)\]]+[\]\)])?)"
+
+
+def shuffle_peptide_sequence_with_preserved_cleavage_points(peptide):
+    peptide = re.sub("^(?:[\[\(][^\)\]]+[\]\)])", "", peptide)
+    originalPeptide = re.findall(modificationRegexStr, peptide)
+    cleavageAALocations = [
+        (i - len(originalPeptide) + 1, char)
+        for (i, char) in enumerate(originalPeptide)
+        if char in cleavageAminoAcids
+    ]
+    otherAAs = [char for char in originalPeptide if char not in cleavageAminoAcids]
+    for i in range(100):
+        shuffledPeptide = shuffle_non_cleavage_amino_acids(otherAAs, i)
+        shuffledPeptide = insert_cleavage_amino_acids_into_shuffled_peptide(
+            shuffledPeptide, cleavageAALocations, originalPeptide
+        )
+        if (
+            calculate_similarities_between_strings(originalPeptide, shuffledPeptide)
+            < 0.7
+        ):
+            return "".join(shuffledPeptide)
+    if calculate_similarities_between_strings(originalPeptide, shuffledPeptide) >= 0.7:
+        return peptide
+    return "".join(shuffledPeptide)
+
+
+def shuffle_non_cleavage_amino_acids(otherAAs, i):
+    shuffledPeptide = otherAAs[:]
+    random.shuffle(shuffledPeptide)
+    if i % 10 == 0:
+        randomly_swap_single_amino_acid_in_shuffled_peptide_to_increase_variability(
+            shuffledPeptide
+        )
+    return shuffledPeptide
+
+
+def randomly_swap_single_amino_acid_in_shuffled_peptide_to_increase_variability(
+    shuffledPeptide,
+):
+    randomIdx = random.randint(0, len(shuffledPeptide) - 1)
+    shuffledPeptide[randomIdx] = random.choice(nonCleavageAminoAcids)
+    return shuffledPeptide
+
+
+def insert_cleavage_amino_acids_into_shuffled_peptide(
+    shuffledPeptide, cleavageAALocations, peptide
+):
+    for j in range(len(cleavageAALocations) - 1, -1, -1):
+        cleavageAAIndex = cleavageAALocations[j][0]
+        if not cleavageAAIndex:
+            cleavageAAIndex = len(peptide)
+        cleavageAA = cleavageAALocations[j][1]
+        shuffledPeptide.insert(cleavageAAIndex, cleavageAA)
+    return shuffledPeptide
+
+
+def insert_value(string, index, char):
+    return string[:index] + [char] + string[index + 1 :]
+
+
+def calculate_similarities_between_strings(s1, s2):
+    minLength = min(len(s1), len(s2))
+    similarities = 0
+    for i in range(minLength):
+        if s1[i] == s2[i]:
+            similarities += 1
+    return similarities / minLength
+
+
+def calculate_ion_mz(sequence, type, position, charge):
+    terminalSequence = determine_terminal_end_of_sequence(sequence, type, position)
+    mass = calculate_molecular_mass_of_sequence(terminalSequence, type, charge)
+    return mass
+
+
+def determine_terminal_end_of_sequence(sequence, type, position):
+    modifiedSequence = re.findall(modificationRegexStr, sequence)
+    if type == "y":
+        return "".join(modifiedSequence[-position:])
+    else:
+        return "".join(modifiedSequence[:position])
+
+
+def calculate_molecular_mass_of_sequence(sequence, type, charge):
+    modifications = re.findall(r"([\[\(][^\)\]]+[\]\)])", sequence)
+    unmodifiedSequence = re.sub(r"([\[\(][^\)\]]+[\]\)])", "", sequence)
+    unmodifiedMass = mass.calculate_mass(
+        sequence=unmodifiedSequence, ion_type=type, charge=charge
+    )
+    modificationMass = sum([modificationMassDict[m] for m in modifications])
+    return unmodifiedMass + modificationMass
+
+
+def add_decoys_to_zodiaq_library(zodiaqLibraryDict):
+    decoyDict = {}
+    for key, value in zodiaqLibraryDict.items():
+        decoyKey, decoyValue = create_decoy_from_target_in_zodiaq_library(
+            key, value.copy()
+        )
+        if decoyKey:
+            decoyDict[decoyKey] = decoyValue
+    outputDict = recalculate_zodiaq_library_key_idx_to_account_for_inserted_decoys(
+        decoyDict | zodiaqLibraryDict
+    )
+    return outputDict
+
+
+def create_decoy_from_target_in_zodiaq_library(targetKey, targetValue):
+    targetPeptide = targetKey[1]
+    targetPeaks = targetValue["peaks"]
+    fragmentTypes = targetValue["fragmentTypes"]
+    decoyPeptide = shuffle_peptide_sequence_with_preserved_cleavage_points(
+        targetPeptide
+    )
+    if decoyPeptide == targetPeptide:
+        return None, None
+    decoyMzs = [
+        calculate_ion_mz(decoyPeptide, *fragmentType) for fragmentType in fragmentTypes
+    ]
+    decoyPeaks = [(decoyMzs[i], targetPeaks[i][1], -1) for i in range(len(targetPeaks))]
+    targetProteinString = targetValue["proteinName"]
+    decoyProteins = format_protein_string_to_list(targetProteinString)
+    decoyProteins = [f"DECOY_{protein}" for protein in decoyProteins]
+    decoyProteinString = format_protein_list_to_string(decoyProteins)
+    decoyValue = targetValue
+    decoyValue["identification"] = f"{decoyPeptide}_{targetKey[0]}_DECOY"
+    decoyValue["proteinName"] = decoyProteinString
+    decoyValue["peaks"] = decoyPeaks
+    decoyValue["isDecoy"] = 1
+    decoyValue["zodiaqKeyIdx"] = -1
+    return (targetKey[0], decoyPeptide), decoyValue
+
+
+def recalculate_zodiaq_library_key_idx_to_account_for_inserted_decoys(outputDict):
+    sortedKeys = sorted(outputDict)
+    for i in range(len(sortedKeys)):
+        key = sortedKeys[i]
+        value = outputDict[key]
+        value["peaks"] = [(x[0], x[1], i) for x in value["peaks"]]
+        value["zodiaqKeyIdx"] = i
+        outputDict[key] = value
+    return outputDict
+
+
+def determine_if_decoys_should_be_generated(zodiaqLibraryDict):
+    if there_are_any_decoys_in_library(zodiaqLibraryDict):
+        return False
+    if any_library_entries_dont_have_fragment_type_data_requirement_for_decoy_generation(
+        zodiaqLibraryDict
+    ):
+        return False
+    return True
+
+
+def there_are_any_decoys_in_library(zodiaqLibraryDict):
+    for key, value in zodiaqLibraryDict.items():
+        if value["isDecoy"]:
+            return True
+    return False
+
+
+def any_library_entries_dont_have_fragment_type_data_requirement_for_decoy_generation(
+    zodiaqLibraryDict,
+):
+    for key, value in zodiaqLibraryDict.items():
+        if "fragmentTypes" not in value:
+            return True
+    return False
diff --git a/src/zodiaq/loaders/library/libraryLoaderContext.py b/src/zodiaq/loaders/library/libraryLoaderContext.py
@@ -31,6 +31,8 @@ def __init__(self, libraryFilePath: os.PathLike):
                 "The .msp library format is not currently supported. If the library file was generated via prosit, please reset the output into a tab-delimited (.tsv) format."
             )
 
-    def load_zodiaq_library_dict(self):
+    def load_zodiaq_library_dict(self, isTest=False):
         """See 'load_zodiaq_library_dict_from_file' in 'LibraryLoaderStrategy'"""
-        return self._strategy.load_zodiaq_library_dict_from_file(self._libraryFilePath)
+        return self._strategy.load_zodiaq_library_dict_from_file(
+            self._libraryFilePath, isTest
+        )
diff --git a/src/zodiaq/loaders/library/libraryLoaderStrategy.py b/src/zodiaq/loaders/library/libraryLoaderStrategy.py
@@ -1,5 +1,10 @@
 from abc import ABC, abstractmethod
 import os
+from zodiaq.loaders.library.decoyGenerationFunctions import (
+    determine_if_decoys_should_be_generated,
+    add_decoys_to_zodiaq_library,
+)
+import random
 
 
 class LibraryLoaderStrategy(ABC):
@@ -60,7 +65,9 @@ def _format_raw_library_object_into_zodiaq_library_dict(self) -> dict:
         """
         pass
 
-    def load_zodiaq_library_dict_from_file(self, libraryFilePath: os.PathLike) -> dict:
+    def load_zodiaq_library_dict_from_file(
+        self, libraryFilePath: os.PathLike, isTest
+    ) -> dict:
         """
         Outputs a standardized dictionary object from a library file.
             The LibraryLoaderContext class calls this function, which uses
@@ -77,8 +84,12 @@ def load_zodiaq_library_dict_from_file(self, libraryFilePath: os.PathLike) -> di
         zodiaqLibDict : dict
             see _format_raw_library_object_into_zodiaq_library_dict return value.
         """
+        random.seed(0)
         self._load_raw_library_object_from_file(libraryFilePath)
-        return self._format_raw_library_object_into_zodiaq_library_dict()
+        zodiaqLibDict = self._format_raw_library_object_into_zodiaq_library_dict()
+        if determine_if_decoys_should_be_generated(zodiaqLibDict) and not isTest:
+            zodiaqLibDict = add_decoys_to_zodiaq_library(zodiaqLibDict)
+        return zodiaqLibDict
 
 
 def create_peaks_from_mz_intensity_lists_and_zodiaq_key_id(
@@ -139,4 +150,5 @@ def remove_low_intensity_peaks_below_max_peak_num(peaks: list, maxPeakNum: int)
     "peaks": "peaks",
     "zodiaqKeyIdx": "zodiaqKeyIdx",
     "isDecoy": "isDecoy",
+    "fragmentTypes": "fragmentTypes",
 }
diff --git a/src/zodiaq/loaders/library/libraryLoaderStrategyTable.py b/src/zodiaq/loaders/library/libraryLoaderStrategyTable.py
@@ -50,9 +50,10 @@ def _load_raw_library_object_from_file(self, libraryFilePath: os.PathLike) -> No
             self.oldToNewColumnDict.keys(), self.rawUploadedLibraryObject.columns
         )
 
-    def _format_raw_library_object_into_zodiaq_library_dict(self) -> dict:
+    def _format_raw_library_object_into_zodiaq_library_dict(
+        self, maxPeakNum=10
+    ) -> dict:
         """abstract class implementation - see 'libraryLoaderStrategy.py' for details"""
-        maxPeakNum = 10
         reformattedLibDf = _reformat_raw_library_object_columns(
             self.rawUploadedLibraryObject, self.oldToNewColumnDict
         )
@@ -93,16 +94,40 @@ def _organize_data_by_zodiaq_library_dict_keys(df: pd.DataFrame) -> dict:
     keys = sorted(set(df["zodiaqLibKey"]))
     mz = df.groupby("zodiaqLibKey")["peakMz"].apply(list).to_dict()
     intensities = df.groupby("zodiaqLibKey")["peakIntensity"].apply(list).to_dict()
+    fragmentTypes = (
+        df.groupby("zodiaqLibKey")
+        .apply(
+            lambda group: list(
+                zip(
+                    group["fragmentType"],
+                    group["fragmentNumber"],
+                    group["fragmentCharge"],
+                )
+            )
+        )
+        .to_dict()
+    )
     df.drop_duplicates(subset="zodiaqLibKey", inplace=True)
     df.set_index("zodiaqLibKey", drop=True, inplace=True)
     df.drop(
-        ["precursorMz", "peakMz", "peptideName", "peakIntensity"], axis=1, inplace=True
+        [
+            "precursorMz",
+            "peakMz",
+            "peptideName",
+            "peakIntensity",
+            "fragmentType",
+            "fragmentNumber",
+            "fragmentCharge",
+        ],
+        axis=1,
+        inplace=True,
     )
     metadata = df.to_dict(orient="index")
     return {
         "zodiaqKeys": keys,
         "mz": mz,
         "intensities": intensities,
+        "fragmentTypes": fragmentTypes,
         "metadata": metadata,
     }
 
@@ -116,7 +141,17 @@ def _create_zodiaq_library_entry(
         organizedDataDict["intensities"][zodiaqKey],
         zodiaqKeyIdx,
     )
+    fragments = list(
+        zip(
+            organizedDataDict["mz"][zodiaqKey],
+            organizedDataDict["intensities"][zodiaqKey],
+            organizedDataDict["fragmentTypes"][zodiaqKey],
+        )
+    )
     reducedPeaks = remove_low_intensity_peaks_below_max_peak_num(peaks, maxPeakNum)
+    reducedFragments = remove_low_intensity_peaks_below_max_peak_num(
+        fragments, maxPeakNum
+    )
     isDecoy = int(
         "decoy" in organizedDataDict["metadata"][zodiaqKey]["proteinName"].lower()
     )
@@ -133,6 +168,7 @@ def _create_zodiaq_library_entry(
         finalVariableNames["peaks"]: sorted(reducedPeaks),
         finalVariableNames["zodiaqKeyIdx"]: zodiaqKeyIdx,
         finalVariableNames["isDecoy"]: isDecoy,
+        finalVariableNames["fragmentTypes"]: [x[-1] for x in sorted(reducedFragments)],
     }
 
 

diff --git a/src/zodiaq/loaders/library/mappings.py b/src/zodiaq/loaders/library/mappings.py
@@ -6,6 +6,9 @@
     "precursorCharge",
     "identification",
     "proteinName",
+    "fragmentType",
+    "fragmentNumber",
+    "fragmentCharge",
 ]
 
 oldColumnsSpectrast = [
@@ -16,6 +19,9 @@
     "PrecursorCharge",
     "transition_group_id",
     "ProteinName",
+    "FragmentType",
+    "FragmentSeriesNumber",
+    "FragmentCharge",
 ]
 
 oldColumnsFragpipe = [
@@ -26,6 +32,9 @@
     "PrecursorCharge",
     "PeptideSequence",
     "ProteinId",
+    "FragmentType",
+    "FragmentSeriesNumber",
+    "FragmentCharge",
 ]
 
 oldColumnsProsit = [
@@ -36,4 +45,7 @@
     "PrecursorCharge",
     "StrippedPeptide",
     "FragmentLossType",
+    "FragmentType",
+    "FragmentNumber",
+    "FragmentCharge",
 ]