From 3c0fac4a5fe1a5621bfabe624e27f7c6bc3e2a96 Mon Sep 17 00:00:00 2001 From: CCranney <11773171+CCranney@users.noreply.github.com> Date: Tue, 19 Mar 2024 11:49:17 -0700 Subject: [PATCH 1/2] fixing minor warnings that appeared while running pytest --- src/zodiaq/scoring/quantificationFunctions.py | 3 +-- .../scoring/testFileContentCreators/MaccScoresBreakdown.py | 6 +++--- .../ProteinCosineEvalScoresBreakdown.py | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/zodiaq/scoring/quantificationFunctions.py b/src/zodiaq/scoring/quantificationFunctions.py index b93215e..fe8ba85 100644 --- a/src/zodiaq/scoring/quantificationFunctions.py +++ b/src/zodiaq/scoring/quantificationFunctions.py @@ -2,14 +2,13 @@ import numpy as np from zodiaq.utils import format_protein_string_to_list from .idpickerFunctions import initialize__format_peptide_protein_connections -from scipy import mean from collections import defaultdict from itertools import chain import scipy.linalg as linalg def calculate_ion_count_from_peptides_of_protein(ionCountList): - return mean(ionCountList) + return np.mean(ionCountList) def calculate_ion_count_for_each_protein_in_protein_fdr_df(proteinDf): diff --git a/tests/system/system_tests/scoring/testFileContentCreators/MaccScoresBreakdown.py b/tests/system/system_tests/scoring/testFileContentCreators/MaccScoresBreakdown.py index 15ff944..4cb8f95 100644 --- a/tests/system/system_tests/scoring/testFileContentCreators/MaccScoresBreakdown.py +++ b/tests/system/system_tests/scoring/testFileContentCreators/MaccScoresBreakdown.py @@ -22,11 +22,11 @@ def _create_input_template_for_scoring_module(self): ) duplicatePeptideInputDf["zLIB"] = [2] * len(duplicatePeptideInputDf.index) allDecoyDf = self._create_input_for_all_decoys() - firstDecoyLayerInputDf = allDecoyDf.iloc[:2] + firstDecoyLayerInputDf = allDecoyDf.iloc[:2].copy() firstDecoyLayerInputDf["rank"] = [3] * len(firstDecoyLayerInputDf.index) - secondDecoyLayerInputDf = allDecoyDf.iloc[2:4] + secondDecoyLayerInputDf = allDecoyDf.iloc[2:4].copy() secondDecoyLayerInputDf["rank"] = [4] * len(secondDecoyLayerInputDf.index) - thirdDecoyLayerInputDf = allDecoyDf.iloc[4:] + thirdDecoyLayerInputDf = allDecoyDf.iloc[4:].copy() thirdDecoyLayerInputDf["rank"] = [5] * len(thirdDecoyLayerInputDf.index) extraSpectralInputDf = self._create_input_for_generic_peptides( rank=6, startingRow=200 diff --git a/tests/system/system_tests/scoring/testFileContentCreators/ProteinCosineEvalScoresBreakdown.py b/tests/system/system_tests/scoring/testFileContentCreators/ProteinCosineEvalScoresBreakdown.py index 811918b..9301366 100644 --- a/tests/system/system_tests/scoring/testFileContentCreators/ProteinCosineEvalScoresBreakdown.py +++ b/tests/system/system_tests/scoring/testFileContentCreators/ProteinCosineEvalScoresBreakdown.py @@ -21,9 +21,9 @@ def _create_input_template_for_scoring_module(self): ) peptideInputDf = self._create_input_for_generic_peptides(rank=2, startingRow=2) allDecoyDf = self._create_input_for_all_decoys(numRows=3) - firstDecoyLayerInputDf = allDecoyDf.iloc[:2] + firstDecoyLayerInputDf = allDecoyDf.iloc[:2].copy() firstDecoyLayerInputDf["rank"] = [3] * len(firstDecoyLayerInputDf.index) - secondDecoyLayerInputDf = allDecoyDf.iloc[2:] + secondDecoyLayerInputDf = allDecoyDf.iloc[2:].copy() secondDecoyLayerInputDf["rank"] = [4] * len(secondDecoyLayerInputDf.index) self.inputDf = pd.concat( [ From 65e020880a1d247317966b5a836ca159d8d30e2c Mon Sep 17 00:00:00 2001 From: CCranney <11773171+CCranney@users.noreply.github.com> Date: Tue, 19 Mar 2024 16:43:51 -0700 Subject: [PATCH 2/2] various minor bug fixes --- src/zodiaq/identification/identifier.py | 16 ++++++---------- src/zodiaq/identification/matchingFunctions.py | 5 ++++- .../identification/outputFormattingFunctions.py | 2 ++ .../loaders/library/decoyGenerationFunctions.py | 2 ++ .../loaders/query/queryLoaderStrategyMzxml.py | 1 + src/zodiaq/scoring/fdrCalculationFunctions.py | 6 +++++- src/zodiaq/scoring/scoringFunctions.py | 2 ++ .../test_outputFormattingFunctions.py | 3 +++ 8 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/zodiaq/identification/identifier.py b/src/zodiaq/identification/identifier.py index 9386285..76e4617 100644 --- a/src/zodiaq/identification/identifier.py +++ b/src/zodiaq/identification/identifier.py @@ -9,10 +9,7 @@ calculate_ppm_offset_tolerance, create_ppm_histogram, ) -from zodiaq.scoring import ( - score_library_to_query_matches, - determine_index_of_fdr_cutoff, -) +from zodiaq.scoring import score_library_to_query_matches from zodiaq.identification.outputFormattingFunctions import ( extract_metadata_from_match_and_score_dataframes, format_output_line, @@ -168,6 +165,11 @@ def _apply_correction_to_match_dataframe(self, matchDf): offset, tolerance = calculate_ppm_offset_tolerance( matchDf["ppmDifference"], self._commandLineArgs["correctionDegree"] ) + toleranceMinimumCutoff = 5 + if not self._commandLineArgs["correctionDegree"] and tolerance < toleranceMinimumCutoff: + _, tolerance = calculate_ppm_offset_tolerance( + matchDf["ppmDifference"], 0.5 + ) queryFile = self._queryContext.filePath.split("/")[-1] outFile = os.path.splitext(queryFile)[0] + "_correctionHistogram.png" if self._commandLineArgs["histogram"]: @@ -180,12 +182,6 @@ def _apply_correction_to_match_dataframe(self, matchDf): matchDf = filter_matches_by_ppm_offset_and_tolerance(matchDf, offset, tolerance) return eliminate_low_count_matches(matchDf) - def _apply_correction_to_score_dataframe(self, matchDf, scoreDf): - scoreDf = score_library_to_query_matches(matchDf) - isDecoyArray = identify_all_decoys(self._decoySet, scoreDf) - scoreDfCutoffIdx = determine_index_of_fdr_cutoff(isDecoyArray) - return scoreDf.iloc[:scoreDfCutoffIdx, :] - def _format_identifications_as_dataframe(self, matchDf, scoreDf): """ The final match/score identifications are consolidated into a dataframe. diff --git a/src/zodiaq/identification/matchingFunctions.py b/src/zodiaq/identification/matchingFunctions.py index b613e29..a0132b9 100644 --- a/src/zodiaq/identification/matchingFunctions.py +++ b/src/zodiaq/identification/matchingFunctions.py @@ -154,7 +154,10 @@ def match_library_to_query_pooled_spectra(libraryPeaks, queryPeaks, ppmTolerance libraryArray, queryArray, ppmTolerance, baselineLibraryIdx, baselineQueryIdx ) dataArrays.append(dataArray) - data = np.concatenate(dataArrays, axis=0) + if len(dataArrays) == 0: + data = [] + else: + data = np.concatenate(dataArrays, axis=0) matchDf = pd.DataFrame( data, columns=[ diff --git a/src/zodiaq/identification/outputFormattingFunctions.py b/src/zodiaq/identification/outputFormattingFunctions.py index 1e10c81..9c81422 100644 --- a/src/zodiaq/identification/outputFormattingFunctions.py +++ b/src/zodiaq/identification/outputFormattingFunctions.py @@ -20,6 +20,7 @@ def format_output_line(libMetadata, queMetadata, matchMetadata): queMetadata["CV"], queMetadata["windowWidth"], matchMetadata["exclude_num"], + queMetadata["retentionTime"], ] @@ -69,6 +70,7 @@ def format_output_as_pandas_dataframe(inputFileName, outputData): "CompensationVoltage", "totalWindowWidth", "exclude_num", + "retentionTime", ] outputDf = pd.DataFrame(outputData, columns=columns) outputDf.insert(0, "fileName", [inputFileName] * len(outputDf.index)) diff --git a/src/zodiaq/loaders/library/decoyGenerationFunctions.py b/src/zodiaq/loaders/library/decoyGenerationFunctions.py index 7b0a865..1b62255 100644 --- a/src/zodiaq/loaders/library/decoyGenerationFunctions.py +++ b/src/zodiaq/loaders/library/decoyGenerationFunctions.py @@ -36,6 +36,8 @@ def shuffle_peptide_sequence_with_preserved_cleavage_points(peptide): if char in cleavageAminoAcids ] otherAAs = [char for char in originalPeptide if char not in cleavageAminoAcids] + if len(otherAAs) < 2: + return peptide for i in range(100): shuffledPeptide = shuffle_non_cleavage_amino_acids(otherAAs, i) shuffledPeptide = insert_cleavage_amino_acids_into_shuffled_peptide( diff --git a/src/zodiaq/loaders/query/queryLoaderStrategyMzxml.py b/src/zodiaq/loaders/query/queryLoaderStrategyMzxml.py index 63e0c21..6231071 100644 --- a/src/zodiaq/loaders/query/queryLoaderStrategyMzxml.py +++ b/src/zodiaq/loaders/query/queryLoaderStrategyMzxml.py @@ -42,6 +42,7 @@ def extract_metadata_from_query_scans(self) -> dict: metadataDict["precursorMz"] = spec["precursorMz"][0]["precursorMz"] metadataDict["windowWidth"] = spec["precursorMz"][0]["windowWideness"] metadataDict["peaksCount"] = spec["peaksCount"] + metadataDict["retentionTime"] = spec["retentionTime"] if "nameValue" in spec: for key, value in spec["nameValue"].items(): spec[key] = value diff --git a/src/zodiaq/scoring/fdrCalculationFunctions.py b/src/zodiaq/scoring/fdrCalculationFunctions.py index a82c8ec..ab202d1 100644 --- a/src/zodiaq/scoring/fdrCalculationFunctions.py +++ b/src/zodiaq/scoring/fdrCalculationFunctions.py @@ -11,9 +11,11 @@ def create_spectral_fdr_output_from_full_output_sorted_by_desired_score( fullDf, fdrCutoff=0.01 ): fdrs = calculate_fdr_rates_of_decoy_array(fullDf["isDecoy"]) - scoreDfCutoffIdx = np.argmax(fdrs > fdrCutoff) spectralDf = fullDf.copy() spectralDf["spectralFDR"] = fdrs + if len(fdrs) == 0: + return spectralDf + scoreDfCutoffIdx = np.argmax(fdrs > fdrCutoff) return spectralDf.iloc[:scoreDfCutoffIdx, :] @@ -23,6 +25,8 @@ def create_peptide_fdr_output_from_full_output_sorted_by_desired_score( peptideDf = drop_duplicate_values_from_df_in_given_column(fullDf, "peptide") fdrs = calculate_fdr_rates_of_decoy_array(peptideDf["isDecoy"]) peptideDf["peptideFDR"] = fdrs + if len(fdrs) == 0: + return peptideDf scoreDfCutoffIdx = np.argmax(fdrs > fdrCutoff) if not scoreDfCutoffIdx: return peptideDf diff --git a/src/zodiaq/scoring/scoringFunctions.py b/src/zodiaq/scoring/scoringFunctions.py index 4790a8b..b7b4f63 100644 --- a/src/zodiaq/scoring/scoringFunctions.py +++ b/src/zodiaq/scoring/scoringFunctions.py @@ -32,6 +32,8 @@ def determine_index_of_fdr_cutoff(isDecoyArray, fdrCutoff=1e-2): if 1 not in isDecoyArray: return len(isDecoyArray) fdrs = calculate_fdr_rates_of_decoy_array(isDecoyArray) + if len(fdrs) == 0: + return 0 lastDecoyIdx = np.argmax(fdrs > fdrCutoff) return lastDecoyIdx diff --git a/tests/unit/identification/test_outputFormattingFunctions.py b/tests/unit/identification/test_outputFormattingFunctions.py index 23096b2..ccd4253 100644 --- a/tests/unit/identification/test_outputFormattingFunctions.py +++ b/tests/unit/identification/test_outputFormattingFunctions.py @@ -28,6 +28,7 @@ def identifierOutputData(): 6, 7, 11, + 12, ] @@ -47,6 +48,7 @@ def test__output_formatting_functions__format_output_line(identifierOutputData): "peaksCount": 5, "CV": 6, "windowWidth": 7, + "retentionTime":12, } matchDict = { "cosineSimilarityScore": 8, @@ -172,6 +174,7 @@ def test__output_formatting_functions__format_output_as_pandas_dataframe( "CompensationVoltage", "totalWindowWidth", "exclude_num", + "retentionTime", ] inputFileName = "dummyFile" expectedOutputDf = pd.DataFrame(