Skip to content

Commit

Permalink
Merge pull request #78 from xomicsdatascience/dev
Browse files: browse the repository at this point in the history
various minor bug fixes pushed to main
  • Loading branch information
CCranney authored Mar 19, 2024
2 parents 054075f + ed4ebc4 commit 4313beb
Show file tree
Hide file tree
Showing 11 changed files with 31 additions and 19 deletions.
16 changes: 6 additions & 10 deletions src/zodiaq/identification/identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,7 @@
calculate_ppm_offset_tolerance,
create_ppm_histogram,
)
from zodiaq.scoring import (
score_library_to_query_matches,
determine_index_of_fdr_cutoff,
)
from zodiaq.scoring import score_library_to_query_matches
from zodiaq.identification.outputFormattingFunctions import (
extract_metadata_from_match_and_score_dataframes,
format_output_line,
Expand Down Expand Up @@ -168,6 +165,11 @@ def _apply_correction_to_match_dataframe(self, matchDf):
offset, tolerance = calculate_ppm_offset_tolerance(
matchDf["ppmDifference"], self._commandLineArgs["correctionDegree"]
)
toleranceMinimumCutoff = 5
if not self._commandLineArgs["correctionDegree"] and tolerance < toleranceMinimumCutoff:
_, tolerance = calculate_ppm_offset_tolerance(
matchDf["ppmDifference"], 0.5
)
queryFile = self._queryContext.filePath.split("/")[-1]
outFile = os.path.splitext(queryFile)[0] + "_correctionHistogram.png"
if self._commandLineArgs["histogram"]:
Expand All @@ -180,12 +182,6 @@ def _apply_correction_to_match_dataframe(self, matchDf):
matchDf = filter_matches_by_ppm_offset_and_tolerance(matchDf, offset, tolerance)
return eliminate_low_count_matches(matchDf)

def _apply_correction_to_score_dataframe(self, matchDf, scoreDf):
scoreDf = score_library_to_query_matches(matchDf)
isDecoyArray = identify_all_decoys(self._decoySet, scoreDf)
scoreDfCutoffIdx = determine_index_of_fdr_cutoff(isDecoyArray)
return scoreDf.iloc[:scoreDfCutoffIdx, :]

def _format_identifications_as_dataframe(self, matchDf, scoreDf):
"""
The final match/score identifications are consolidated into a dataframe.
Expand Down
5 changes: 4 additions & 1 deletion src/zodiaq/identification/matchingFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,10 @@ def match_library_to_query_pooled_spectra(libraryPeaks, queryPeaks, ppmTolerance
libraryArray, queryArray, ppmTolerance, baselineLibraryIdx, baselineQueryIdx
)
dataArrays.append(dataArray)
data = np.concatenate(dataArrays, axis=0)
if len(dataArrays) == 0:
data = []
else:
data = np.concatenate(dataArrays, axis=0)
matchDf = pd.DataFrame(
data,
columns=[
Expand Down
2 changes: 2 additions & 0 deletions src/zodiaq/identification/outputFormattingFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def format_output_line(libMetadata, queMetadata, matchMetadata):
queMetadata["CV"],
queMetadata["windowWidth"],
matchMetadata["exclude_num"],
queMetadata["retentionTime"],
]


Expand Down Expand Up @@ -69,6 +70,7 @@ def format_output_as_pandas_dataframe(inputFileName, outputData):
"CompensationVoltage",
"totalWindowWidth",
"exclude_num",
"retentionTime",
]
outputDf = pd.DataFrame(outputData, columns=columns)
outputDf.insert(0, "fileName", [inputFileName] * len(outputDf.index))
Expand Down
2 changes: 2 additions & 0 deletions src/zodiaq/loaders/library/decoyGenerationFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def shuffle_peptide_sequence_with_preserved_cleavage_points(peptide):
if char in cleavageAminoAcids
]
otherAAs = [char for char in originalPeptide if char not in cleavageAminoAcids]
if len(otherAAs) < 2:
return peptide
for i in range(100):
shuffledPeptide = shuffle_non_cleavage_amino_acids(otherAAs, i)
shuffledPeptide = insert_cleavage_amino_acids_into_shuffled_peptide(
Expand Down
1 change: 1 addition & 0 deletions src/zodiaq/loaders/query/queryLoaderStrategyMzxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def extract_metadata_from_query_scans(self) -> dict:
metadataDict["precursorMz"] = spec["precursorMz"][0]["precursorMz"]
metadataDict["windowWidth"] = spec["precursorMz"][0]["windowWideness"]
metadataDict["peaksCount"] = spec["peaksCount"]
metadataDict["retentionTime"] = spec["retentionTime"]
if "nameValue" in spec:
for key, value in spec["nameValue"].items():
spec[key] = value
Expand Down
6 changes: 5 additions & 1 deletion src/zodiaq/scoring/fdrCalculationFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ def create_spectral_fdr_output_from_full_output_sorted_by_desired_score(
fullDf, fdrCutoff=0.01
):
fdrs = calculate_fdr_rates_of_decoy_array(fullDf["isDecoy"])
scoreDfCutoffIdx = np.argmax(fdrs > fdrCutoff)
spectralDf = fullDf.copy()
spectralDf["spectralFDR"] = fdrs
if len(fdrs) == 0:
return spectralDf
scoreDfCutoffIdx = np.argmax(fdrs > fdrCutoff)
return spectralDf.iloc[:scoreDfCutoffIdx, :]


Expand All @@ -23,6 +25,8 @@ def create_peptide_fdr_output_from_full_output_sorted_by_desired_score(
peptideDf = drop_duplicate_values_from_df_in_given_column(fullDf, "peptide")
fdrs = calculate_fdr_rates_of_decoy_array(peptideDf["isDecoy"])
peptideDf["peptideFDR"] = fdrs
if len(fdrs) == 0:
return peptideDf
scoreDfCutoffIdx = np.argmax(fdrs > fdrCutoff)
if not scoreDfCutoffIdx:
return peptideDf
Expand Down
3 changes: 1 addition & 2 deletions src/zodiaq/scoring/quantificationFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
import numpy as np
from zodiaq.utils import format_protein_string_to_list
from .idpickerFunctions import initialize__format_peptide_protein_connections
from scipy import mean
from collections import defaultdict
from itertools import chain
import scipy.linalg as linalg


def calculate_ion_count_from_peptides_of_protein(ionCountList):
return mean(ionCountList)
return np.mean(ionCountList)


def calculate_ion_count_for_each_protein_in_protein_fdr_df(proteinDf):
Expand Down
2 changes: 2 additions & 0 deletions src/zodiaq/scoring/scoringFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def determine_index_of_fdr_cutoff(isDecoyArray, fdrCutoff=1e-2):
if 1 not in isDecoyArray:
return len(isDecoyArray)
fdrs = calculate_fdr_rates_of_decoy_array(isDecoyArray)
if len(fdrs) == 0:
return 0
lastDecoyIdx = np.argmax(fdrs > fdrCutoff)
return lastDecoyIdx

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ def _create_input_template_for_scoring_module(self):
)
duplicatePeptideInputDf["zLIB"] = [2] * len(duplicatePeptideInputDf.index)
allDecoyDf = self._create_input_for_all_decoys()
firstDecoyLayerInputDf = allDecoyDf.iloc[:2]
firstDecoyLayerInputDf = allDecoyDf.iloc[:2].copy()
firstDecoyLayerInputDf["rank"] = [3] * len(firstDecoyLayerInputDf.index)
secondDecoyLayerInputDf = allDecoyDf.iloc[2:4]
secondDecoyLayerInputDf = allDecoyDf.iloc[2:4].copy()
secondDecoyLayerInputDf["rank"] = [4] * len(secondDecoyLayerInputDf.index)
thirdDecoyLayerInputDf = allDecoyDf.iloc[4:]
thirdDecoyLayerInputDf = allDecoyDf.iloc[4:].copy()
thirdDecoyLayerInputDf["rank"] = [5] * len(thirdDecoyLayerInputDf.index)
extraSpectralInputDf = self._create_input_for_generic_peptides(
rank=6, startingRow=200
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ def _create_input_template_for_scoring_module(self):
)
peptideInputDf = self._create_input_for_generic_peptides(rank=2, startingRow=2)
allDecoyDf = self._create_input_for_all_decoys(numRows=3)
firstDecoyLayerInputDf = allDecoyDf.iloc[:2]
firstDecoyLayerInputDf = allDecoyDf.iloc[:2].copy()
firstDecoyLayerInputDf["rank"] = [3] * len(firstDecoyLayerInputDf.index)
secondDecoyLayerInputDf = allDecoyDf.iloc[2:]
secondDecoyLayerInputDf = allDecoyDf.iloc[2:].copy()
secondDecoyLayerInputDf["rank"] = [4] * len(secondDecoyLayerInputDf.index)
self.inputDf = pd.concat(
[
Expand Down
3 changes: 3 additions & 0 deletions tests/unit/identification/test_outputFormattingFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def identifierOutputData():
6,
7,
11,
12,
]


Expand All @@ -47,6 +48,7 @@ def test__output_formatting_functions__format_output_line(identifierOutputData):
"peaksCount": 5,
"CV": 6,
"windowWidth": 7,
"retentionTime":12,
}
matchDict = {
"cosineSimilarityScore": 8,
Expand Down Expand Up @@ -172,6 +174,7 @@ def test__output_formatting_functions__format_output_as_pandas_dataframe(
"CompensationVoltage",
"totalWindowWidth",
"exclude_num",
"retentionTime",
]
inputFileName = "dummyFile"
expectedOutputDf = pd.DataFrame(
Expand Down

0 comments on commit 4313beb

Please sign in to comment.