Merge pull request #78 from xomicsdatascience/dev

Various minor bug fixes pushed to main.
xomicsdatascience · Mar 19, 2024 · 4313beb
2 parents 054075f + ed4ebc4
commit 4313beb
Show file tree

Hide file tree

Showing 11 changed files with 31 additions and 19 deletions.
diff --git a/src/zodiaq/identification/identifier.py b/src/zodiaq/identification/identifier.py
@@ -9,10 +9,7 @@
     calculate_ppm_offset_tolerance,
     create_ppm_histogram,
 )
-from zodiaq.scoring import (
-    score_library_to_query_matches,
-    determine_index_of_fdr_cutoff,
-)
+from zodiaq.scoring import score_library_to_query_matches
 from zodiaq.identification.outputFormattingFunctions import (
     extract_metadata_from_match_and_score_dataframes,
     format_output_line,
@@ -168,6 +165,11 @@ def _apply_correction_to_match_dataframe(self, matchDf):
         offset, tolerance = calculate_ppm_offset_tolerance(
             matchDf["ppmDifference"], self._commandLineArgs["correctionDegree"]
         )
+        toleranceMinimumCutoff = 5
+        if not self._commandLineArgs["correctionDegree"] and tolerance < toleranceMinimumCutoff:
+            _, tolerance = calculate_ppm_offset_tolerance(
+                matchDf["ppmDifference"], 0.5
+            )
         queryFile = self._queryContext.filePath.split("/")[-1]
         outFile = os.path.splitext(queryFile)[0] + "_correctionHistogram.png"
         if self._commandLineArgs["histogram"]:
@@ -180,12 +182,6 @@ def _apply_correction_to_match_dataframe(self, matchDf):
         matchDf = filter_matches_by_ppm_offset_and_tolerance(matchDf, offset, tolerance)
         return eliminate_low_count_matches(matchDf)
 
-    def _apply_correction_to_score_dataframe(self, matchDf, scoreDf):
-        scoreDf = score_library_to_query_matches(matchDf)
-        isDecoyArray = identify_all_decoys(self._decoySet, scoreDf)
-        scoreDfCutoffIdx = determine_index_of_fdr_cutoff(isDecoyArray)
-        return scoreDf.iloc[:scoreDfCutoffIdx, :]
-
     def _format_identifications_as_dataframe(self, matchDf, scoreDf):
         """
         The final match/score identifications are consolidated into a dataframe.

diff --git a/src/zodiaq/identification/matchingFunctions.py b/src/zodiaq/identification/matchingFunctions.py
@@ -154,7 +154,10 @@ def match_library_to_query_pooled_spectra(libraryPeaks, queryPeaks, ppmTolerance
             libraryArray, queryArray, ppmTolerance, baselineLibraryIdx, baselineQueryIdx
         )
         dataArrays.append(dataArray)
-    data = np.concatenate(dataArrays, axis=0)
+    if len(dataArrays) == 0:
+        data = []
+    else:
+        data = np.concatenate(dataArrays, axis=0)
     matchDf = pd.DataFrame(
         data,
         columns=[

diff --git a/src/zodiaq/identification/outputFormattingFunctions.py b/src/zodiaq/identification/outputFormattingFunctions.py
@@ -20,6 +20,7 @@ def format_output_line(libMetadata, queMetadata, matchMetadata):
         queMetadata["CV"],
         queMetadata["windowWidth"],
         matchMetadata["exclude_num"],
+        queMetadata["retentionTime"],
     ]
 
 
@@ -69,6 +70,7 @@ def format_output_as_pandas_dataframe(inputFileName, outputData):
         "CompensationVoltage",
         "totalWindowWidth",
         "exclude_num",
+        "retentionTime",
     ]
     outputDf = pd.DataFrame(outputData, columns=columns)
     outputDf.insert(0, "fileName", [inputFileName] * len(outputDf.index))

diff --git a/src/zodiaq/loaders/library/decoyGenerationFunctions.py b/src/zodiaq/loaders/library/decoyGenerationFunctions.py
@@ -36,6 +36,8 @@ def shuffle_peptide_sequence_with_preserved_cleavage_points(peptide):
         if char in cleavageAminoAcids
     ]
     otherAAs = [char for char in originalPeptide if char not in cleavageAminoAcids]
+    if len(otherAAs) < 2:
+        return peptide
     for i in range(100):
         shuffledPeptide = shuffle_non_cleavage_amino_acids(otherAAs, i)
         shuffledPeptide = insert_cleavage_amino_acids_into_shuffled_peptide(

diff --git a/src/zodiaq/loaders/query/queryLoaderStrategyMzxml.py b/src/zodiaq/loaders/query/queryLoaderStrategyMzxml.py
@@ -42,6 +42,7 @@ def extract_metadata_from_query_scans(self) -> dict:
                 metadataDict["precursorMz"] = spec["precursorMz"][0]["precursorMz"]
                 metadataDict["windowWidth"] = spec["precursorMz"][0]["windowWideness"]
                 metadataDict["peaksCount"] = spec["peaksCount"]
+                metadataDict["retentionTime"] = spec["retentionTime"]
                 if "nameValue" in spec:
                     for key, value in spec["nameValue"].items():
                         spec[key] = value

diff --git a/src/zodiaq/scoring/fdrCalculationFunctions.py b/src/zodiaq/scoring/fdrCalculationFunctions.py
@@ -11,9 +11,11 @@ def create_spectral_fdr_output_from_full_output_sorted_by_desired_score(
     fullDf, fdrCutoff=0.01
 ):
     fdrs = calculate_fdr_rates_of_decoy_array(fullDf["isDecoy"])
-    scoreDfCutoffIdx = np.argmax(fdrs > fdrCutoff)
     spectralDf = fullDf.copy()
     spectralDf["spectralFDR"] = fdrs
+    if len(fdrs) == 0:
+        return spectralDf
+    scoreDfCutoffIdx = np.argmax(fdrs > fdrCutoff)
     return spectralDf.iloc[:scoreDfCutoffIdx, :]
 
 
@@ -23,6 +25,8 @@ def create_peptide_fdr_output_from_full_output_sorted_by_desired_score(
     peptideDf = drop_duplicate_values_from_df_in_given_column(fullDf, "peptide")
     fdrs = calculate_fdr_rates_of_decoy_array(peptideDf["isDecoy"])
     peptideDf["peptideFDR"] = fdrs
+    if len(fdrs) == 0:
+        return peptideDf
     scoreDfCutoffIdx = np.argmax(fdrs > fdrCutoff)
     if not scoreDfCutoffIdx:
         return peptideDf

diff --git a/src/zodiaq/scoring/quantificationFunctions.py b/src/zodiaq/scoring/quantificationFunctions.py
@@ -2,14 +2,13 @@
 import numpy as np
 from zodiaq.utils import format_protein_string_to_list
 from .idpickerFunctions import initialize__format_peptide_protein_connections
-from scipy import mean
 from collections import defaultdict
 from itertools import chain
 import scipy.linalg as linalg
 
 
 def calculate_ion_count_from_peptides_of_protein(ionCountList):
-    return mean(ionCountList)
+    return np.mean(ionCountList)
 
 
 def calculate_ion_count_for_each_protein_in_protein_fdr_df(proteinDf):

diff --git a/src/zodiaq/scoring/scoringFunctions.py b/src/zodiaq/scoring/scoringFunctions.py
@@ -32,6 +32,8 @@ def determine_index_of_fdr_cutoff(isDecoyArray, fdrCutoff=1e-2):
     if 1 not in isDecoyArray:
         return len(isDecoyArray)
     fdrs = calculate_fdr_rates_of_decoy_array(isDecoyArray)
+    if len(fdrs) == 0:
+        return 0
     lastDecoyIdx = np.argmax(fdrs > fdrCutoff)
     return lastDecoyIdx
 

diff --git a/tests/system/system_tests/scoring/testFileContentCreators/MaccScoresBreakdown.py b/tests/system/system_tests/scoring/testFileContentCreators/MaccScoresBreakdown.py
@@ -22,11 +22,11 @@ def _create_input_template_for_scoring_module(self):
         )
         duplicatePeptideInputDf["zLIB"] = [2] * len(duplicatePeptideInputDf.index)
         allDecoyDf = self._create_input_for_all_decoys()
-        firstDecoyLayerInputDf = allDecoyDf.iloc[:2]
+        firstDecoyLayerInputDf = allDecoyDf.iloc[:2].copy()
         firstDecoyLayerInputDf["rank"] = [3] * len(firstDecoyLayerInputDf.index)
-        secondDecoyLayerInputDf = allDecoyDf.iloc[2:4]
+        secondDecoyLayerInputDf = allDecoyDf.iloc[2:4].copy()
         secondDecoyLayerInputDf["rank"] = [4] * len(secondDecoyLayerInputDf.index)
-        thirdDecoyLayerInputDf = allDecoyDf.iloc[4:]
+        thirdDecoyLayerInputDf = allDecoyDf.iloc[4:].copy()
         thirdDecoyLayerInputDf["rank"] = [5] * len(thirdDecoyLayerInputDf.index)
         extraSpectralInputDf = self._create_input_for_generic_peptides(
             rank=6, startingRow=200

diff --git a/tests/system/system_tests/scoring/testFileContentCreators/ProteinCosineEvalScoresBreakdown.py b/tests/system/system_tests/scoring/testFileContentCreators/ProteinCosineEvalScoresBreakdown.py
@@ -21,9 +21,9 @@ def _create_input_template_for_scoring_module(self):
         )
         peptideInputDf = self._create_input_for_generic_peptides(rank=2, startingRow=2)
         allDecoyDf = self._create_input_for_all_decoys(numRows=3)
-        firstDecoyLayerInputDf = allDecoyDf.iloc[:2]
+        firstDecoyLayerInputDf = allDecoyDf.iloc[:2].copy()
         firstDecoyLayerInputDf["rank"] = [3] * len(firstDecoyLayerInputDf.index)
-        secondDecoyLayerInputDf = allDecoyDf.iloc[2:]
+        secondDecoyLayerInputDf = allDecoyDf.iloc[2:].copy()
         secondDecoyLayerInputDf["rank"] = [4] * len(secondDecoyLayerInputDf.index)
         self.inputDf = pd.concat(
             [

diff --git a/tests/unit/identification/test_outputFormattingFunctions.py b/tests/unit/identification/test_outputFormattingFunctions.py
@@ -28,6 +28,7 @@ def identifierOutputData():
         6,
         7,
         11,
+        12,
     ]
 
 
@@ -47,6 +48,7 @@ def test__output_formatting_functions__format_output_line(identifierOutputData):
         "peaksCount": 5,
         "CV": 6,
         "windowWidth": 7,
+        "retentionTime":12,
     }
     matchDict = {
         "cosineSimilarityScore": 8,
@@ -172,6 +174,7 @@ def test__output_formatting_functions__format_output_as_pandas_dataframe(
         "CompensationVoltage",
         "totalWindowWidth",
         "exclude_num",
+        "retentionTime",
     ]
     inputFileName = "dummyFile"
     expectedOutputDf = pd.DataFrame(