Add rt length to output ii #373

Open

wants to merge 13 commits into base: add_rt_length_to_output
120 changes: 83 additions & 37 deletions alphadia/outputtransform.py
@@ -1,6 +1,7 @@
# native imports
import logging
import os
from collections import defaultdict
from collections.abc import Iterator

import directlfq.config as lfqconfig
@@ -25,6 +26,7 @@
)
from alphadia.transferlearning.train import FinetuneManager
from alphadia.workflow import manager, peptidecentric
from alphadia.workflow.managers.raw_file_manager import RawFileManager

logger = logging.getLogger()

@@ -938,61 +940,59 @@ def _build_run_stat_df(
Dataframe containing the statistics

"""
optimization_manager_path = os.path.join(
folder, peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PATH
)

calibration_manager_path = os.path.join(
folder, peptidecentric.PeptideCentricWorkflow.CALIBRATION_MANAGER_PATH
)

if channels is None:
channels = [0]
out_df = []
all_stats = []

for channel in channels:
channel_df = run_df[run_df["channel"] == channel]

base_dict = {
stats = {
"run": raw_name,
"channel": channel,
"precursors": len(channel_df),
"proteins": channel_df["pg"].nunique(),
}

stats["fwhm_rt"] = np.nan
if "cycle_fwhm" in channel_df.columns:
base_dict["fwhm_rt"] = np.mean(channel_df["cycle_fwhm"])
stats["fwhm_rt"] = np.mean(channel_df["cycle_fwhm"])

stats["fwhm_mobility"] = np.nan
if "mobility_fwhm" in channel_df.columns:
base_dict["fwhm_mobility"] = np.mean(channel_df["mobility_fwhm"])
stats["fwhm_mobility"] = np.mean(channel_df["mobility_fwhm"])

# collect optimization stats
base_dict["optimization.ms2_error"] = np.nan
base_dict["optimization.ms1_error"] = np.nan
base_dict["optimization.rt_error"] = np.nan
base_dict["optimization.mobility_error"] = np.nan

if os.path.exists(optimization_manager_path):
optimization_stats = defaultdict(lambda: np.nan)
if os.path.exists(
optimization_manager_path := os.path.join(
folder,
peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PKL_NAME,
)
):
optimization_manager = manager.OptimizationManager(
path=optimization_manager_path
)
base_dict["optimization.ms2_error"] = optimization_manager.ms2_error
base_dict["optimization.ms1_error"] = optimization_manager.ms1_error
base_dict["optimization.rt_error"] = optimization_manager.rt_error
base_dict["optimization.mobility_error"] = (
optimization_manager.mobility_error
)

optimization_stats["ms2_error"] = optimization_manager.ms2_error
optimization_stats["ms1_error"] = optimization_manager.ms1_error
optimization_stats["rt_error"] = optimization_manager.rt_error
optimization_stats["mobility_error"] = optimization_manager.mobility_error
else:
logger.warning(f"Error reading optimization manager for {raw_name}")

# collect calibration stats
base_dict["calibration.ms2_median_accuracy"] = np.nan
base_dict["calibration.ms2_median_precision"] = np.nan
base_dict["calibration.ms1_median_accuracy"] = np.nan
base_dict["calibration.ms1_median_precision"] = np.nan
prefix = "optimization."
for key in ["ms2_error", "ms1_error", "rt_error", "mobility_error"]:
stats[f"{prefix}{key}"] = optimization_stats[key]

if os.path.exists(calibration_manager_path):
# collect calibration stats
calibration_stats = defaultdict(lambda: np.nan)
if os.path.exists(
calibration_manager_path := os.path.join(
folder,
peptidecentric.PeptideCentricWorkflow.CALIBRATION_MANAGER_PKL_NAME,
)
):
calibration_manager = manager.CalibrationManager(
path=calibration_manager_path
)
@@ -1002,10 +1002,10 @@ def _build_run_stat_df(
"fragment", "mz"
)
) and (fragment_mz_metrics := fragment_mz_estimator.metrics):
base_dict["calibration.ms2_median_accuracy"] = fragment_mz_metrics[
calibration_stats["ms2_median_accuracy"] = fragment_mz_metrics[
"median_accuracy"
]
base_dict["calibration.ms2_median_precision"] = fragment_mz_metrics[
calibration_stats["ms2_median_precision"] = fragment_mz_metrics[
"median_precision"
]

@@ -1014,19 +1014,65 @@ def _build_run_stat_df(
"precursor", "mz"
)
) and (precursor_mz_metrics := precursor_mz_estimator.metrics):
base_dict["calibration.ms1_median_accuracy"] = precursor_mz_metrics[
calibration_stats["ms1_median_accuracy"] = precursor_mz_metrics[
"median_accuracy"
]
base_dict["calibration.ms1_median_precision"] = precursor_mz_metrics[
calibration_stats["ms1_median_precision"] = precursor_mz_metrics[
"median_precision"
]

else:
logger.warning(f"Error reading calibration manager for {raw_name}")

out_df.append(base_dict)
prefix = "calibration."
for key in [
"ms2_median_accuracy",
"ms2_median_precision",
"ms1_median_accuracy",
"ms1_median_precision",
]:
stats[f"{prefix}{key}"] = calibration_stats[key]

# collect raw stats
raw_stats = defaultdict(lambda: np.nan)
if os.path.exists(
raw_file_manager_path := os.path.join(
folder, peptidecentric.PeptideCentricWorkflow.RAW_FILE_MANAGER_PKL_NAME
)
):
raw_stats = RawFileManager(
path=raw_file_manager_path, load_from_file=True
).stats
else:
logger.warning(f"Error reading raw file manager for {raw_name}")

# deliberately mapping explicitly to avoid coupling raw_stats to the output too tightly
prefix = "raw."

stats[f"{prefix}gradient_min_m"] = raw_stats["rt_limit_min"] / 60
stats[f"{prefix}gradient_max_m"] = raw_stats["rt_limit_max"] / 60
stats[f"{prefix}gradient_length_m"] = (
raw_stats["rt_limit_max"] - raw_stats["rt_limit_min"]
) / 60
for key in [
"cycle_length",
"cycle_duration",
"cycle_number",
"msms_range_min",
"msms_range_max",
]:
stats[f"{prefix}{key}"] = raw_stats[key]

all_stats.append(stats)

return pd.DataFrame(all_stats)


return pd.DataFrame(out_df)
def _get_value_or_nan(d: dict, key: str):
try:
return d[key]
except KeyError:
return np.nan
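
# Illustrative note (not part of the diff): _get_value_or_nan gives plain dicts the same
# NaN fallback behaviour as the defaultdicts used above, e.g. for a hypothetical caller:
#     fwhm_rt = _get_value_or_nan(stats, "fwhm_rt")  # np.nan when the key is absent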


def _build_run_internal_df(
Expand All @@ -1048,7 +1094,7 @@ def _build_run_internal_df(

"""
timing_manager_path = os.path.join(
folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH
folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PKL_NAME
)
raw_name = os.path.basename(folder_path)

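The snippet below is a self-contained sketch of the fallback pattern `_build_run_stat_df` now relies on: per-source stats default to NaN via `defaultdict`, and the manager pickle path is built and checked in a single walrus expression. It is not taken from alphadia; the file name, keys, and values are hypothetical stand-ins.

```python
import os
from collections import defaultdict

import numpy as np


def collect_raw_stats(folder: str) -> dict:
    """Collect raw-file stats, falling back to NaN when the manager pickle is absent."""
    raw_stats = defaultdict(lambda: np.nan)  # unknown keys resolve to NaN

    # Build and check the pickle path in one expression, as the diff does with :=
    if os.path.exists(stats_path := os.path.join(folder, "raw_file_manager.pkl")):
        # The real code would read RawFileManager(path=stats_path, load_from_file=True).stats;
        # dummy values stand in here.
        raw_stats.update({"rt_limit_min": 10.0, "rt_limit_max": 5410.0})

    return {
        "raw.gradient_length_m": (raw_stats["rt_limit_max"] - raw_stats["rt_limit_min"]) / 60,
        "raw.cycle_length": raw_stats["cycle_length"],  # NaN if the pickle was missing
    }


print(collect_raw_stats("/tmp/does_not_exist"))  # both values come out as NaN
```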
22 changes: 11 additions & 11 deletions alphadia/workflow/base.py
@@ -23,12 +23,12 @@ class WorkflowBase:
It also initializes the calibration_manager and fdr_manager for the workflow.
"""

RAW_FILE_MANAGER_PATH = "raw_file_manager.pkl"
CALIBRATION_MANAGER_PATH = "calibration_manager.pkl"
OPTIMIZATION_MANAGER_PATH = "optimization_manager.pkl"
TIMING_MANAGER_PATH = "timing_manager.pkl"
FDR_MANAGER_PATH = "fdr_manager.pkl"
FIGURE_PATH = "figures"
RAW_FILE_MANAGER_PKL_NAME = "raw_file_manager.pkl"
CALIBRATION_MANAGER_PKL_NAME = "calibration_manager.pkl"
OPTIMIZATION_MANAGER_PKL_NAME = "optimization_manager.pkl"
TIMING_MANAGER_PKL_NAME = "timing_manager.pkl"
FDR_MANAGER_PKL_NAME = "fdr_manager.pkl"
FIGURES_FOLDER_NAME = "figures"

def __init__(
self,
@@ -87,7 +87,7 @@ def load(
self.reporter.log_event("loading_data", {"progress": 0})
raw_file_manager = RawFileManager(
self.config,
path=os.path.join(self.path, self.RAW_FILE_MANAGER_PATH),
path=os.path.join(self.path, self.RAW_FILE_MANAGER_PKL_NAME),
reporter=self.reporter,
)

@@ -102,7 +102,7 @@ def load(
# initialize the calibration manager
self._calibration_manager = manager.CalibrationManager(
self.config["calibration_manager"],
path=os.path.join(self.path, self.CALIBRATION_MANAGER_PATH),
path=os.path.join(self.path, self.CALIBRATION_MANAGER_PKL_NAME),
load_from_file=self.config["general"]["reuse_calibration"],
reporter=self.reporter,
)
@@ -115,14 +115,14 @@ def load(
self._optimization_manager = manager.OptimizationManager(
self.config,
gradient_length=self.dia_data.rt_values.max(),
path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PATH),
path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PKL_NAME),
load_from_file=self.config["general"]["reuse_calibration"],
figure_path=os.path.join(self.path, self.FIGURE_PATH),
figure_path=os.path.join(self.path, self.FIGURES_FOLDER_NAME),
reporter=self.reporter,
)

self._timing_manager = manager.TimingManager(
path=os.path.join(self.path, self.TIMING_MANAGER_PATH),
path=os.path.join(self.path, self.TIMING_MANAGER_PKL_NAME),
load_from_file=self.config["general"]["reuse_calibration"],
)

72 changes: 36 additions & 36 deletions alphadia/workflow/manager.py
@@ -83,51 +83,51 @@ def is_fitted(self, value):

def save(self):
"""Save the state to pickle file."""
        if self.path is not None:
            try:
                with open(self.path, "wb") as f:
                    pickle.dump(self, f)
            except Exception as e:
                self.reporter.log_string(
                    f"Failed to save {self.__class__.__name__} to {self.path}: {str(e)}",
                    verbosity="error",
                )
                # Log the full traceback
                self.reporter.log_string(
                    f"Traceback: {traceback.format_exc()}", verbosity="error"
                )
        if self.path is None:
            return

Collaborator: Thanks for the refactor 😄

        try:
            with open(self.path, "wb") as f:
                pickle.dump(self, f)
        except Exception as e:
            self.reporter.log_string(
                f"Failed to save {self.__class__.__name__} to {self.path}: {str(e)}",
                verbosity="error",
            )

            self.reporter.log_string(
                f"Traceback: {traceback.format_exc()}", verbosity="error"
            )

def load(self):
"""Load the state from pickle file."""
        if self.path is not None:
            if os.path.exists(self.path):
                try:
                    with open(self.path, "rb") as f:
                        loaded_state = pickle.load(f)

                    if loaded_state._version == self._version:
                        self.__dict__.update(loaded_state.__dict__)
                        self.is_loaded_from_file = True
                    else:
                        self.reporter.log_string(
                            f"Version mismatch while loading {self.__class__}: {loaded_state._version} != {self._version}. Will not load.",
                            verbosity="warning",
                        )
                except Exception:
                    self.reporter.log_string(
                        f"Failed to load {self.__class__.__name__} from {self.path}",
                        verbosity="error",
                    )
                else:
                    self.reporter.log_string(
                        f"Loaded {self.__class__.__name__} from {self.path}"
                    )
            else:
                self.reporter.log_string(
                    f"{self.__class__.__name__} not found at: {self.path}",
                    verbosity="warning",
                )
        if self.path is None or not os.path.exists(self.path):
            self.reporter.log_string(
                f"{self.__class__.__name__} not found at: {self.path}",
                verbosity="warning",
            )
            return

        try:
            with open(self.path, "rb") as f:
                loaded_state = pickle.load(f)

            if loaded_state._version == self._version:
                self.__dict__.update(loaded_state.__dict__)
                self.is_loaded_from_file = True
                self.reporter.log_string(
                    f"Loaded {self.__class__.__name__} from {self.path}"
                )
            else:
                self.reporter.log_string(
                    f"Version mismatch while loading {self.__class__}: {loaded_state._version} != {self._version}. Will not load.",
                    verbosity="warning",
                )
        except Exception:
            self.reporter.log_string(
                f"Failed to load {self.__class__.__name__} from {self.path}",
                verbosity="error",
            )

def fit(self):
"""Fit the workflow property of the manager."""
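A minimal sketch of the guard-clause and version-check pattern the refactored `save`/`load` follow, using a hypothetical `TinyManager` class and plain `print` in place of the reporter; nothing here is alphadia code.

```python
import os
import pickle


class TinyManager:
    """Hypothetical stand-in for a manager with versioned pickle state."""

    _version = 2

    def __init__(self, path: str | None = None):
        self.path = path
        self.is_loaded_from_file = False

    def load(self) -> None:
        # Guard clause: bail out early instead of nesting the happy path.
        if self.path is None or not os.path.exists(self.path):
            print(f"{self.__class__.__name__} not found at: {self.path}")
            return

        with open(self.path, "rb") as f:
            loaded_state = pickle.load(f)

        # Only adopt the pickled state if the on-disk version matches the current one.
        if loaded_state._version == self._version:
            self.__dict__.update(loaded_state.__dict__)
            self.is_loaded_from_file = True
        else:
            print(f"Version mismatch: {loaded_state._version} != {self._version}. Will not load.")


TinyManager(path=None).load()  # prints the "not found" message and returns early
```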
15 changes: 8 additions & 7 deletions alphadia/workflow/managers/raw_file_manager.py
@@ -24,11 +24,14 @@ def __init__(
self.stats = {} # needs to be before super().__init__ to avoid overwriting loaded values

super().__init__(path=path, load_from_file=load_from_file, **kwargs)
self.reporter.log_string(f"Initializing {self.__class__.__name__}")
self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

self._config: Config = config

# deliberately not saving the actual raw data here to avoid the saved manager file being too large
Collaborator: Is this just a comment explaining why the dia_data goes back into PeptideCentric and doesn't stay here, as one might expect?

Collaborator (author): yes it is

self.reporter.log_string(f"Initializing {self.__class__.__name__}")
self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"})

def get_dia_data_object(
self, dia_data_path: str
) -> bruker.TimsTOFTranspose | alpharaw_wrapper.AlphaRaw:
@@ -121,8 +124,6 @@ def _calc_stats(
stats["rt_limit_min"] = rt_values.min()
stats["rt_limit_max"] = rt_values.max()

stats["rt_duration_sec"] = rt_values.max() - rt_values.min()

cycle_length = cycle.shape[1]
stats["cycle_length"] = cycle_length
stats["cycle_duration"] = np.diff(rt_values[::cycle_length]).mean()
@@ -138,13 +139,13 @@

def _log_stats(self):
"""Log the statistics calculated from the DIA data."""
rt_duration_min = self.stats["rt_duration_sec"] / 60
rt_duration = self.stats["rt_limit_max"] - self.stats["rt_limit_min"]

logger.info(
f"{'RT (min)':<20}: {self.stats['rt_limit_min']/60:.1f} - {self.stats['rt_limit_max']/60:.1f}"
)
logger.info(f"{'RT duration (sec)':<20}: {self.stats['rt_duration_sec']:.1f}")
logger.info(f"{'RT duration (min)':<20}: {rt_duration_min:.1f}")
logger.info(f"{'RT duration (sec)':<20}: {rt_duration:.1f}")
logger.info(f"{'RT duration (min)':<20}: {rt_duration/60:.1f}")

logger.info(f"{'Cycle len (scans)':<20}: {self.stats['cycle_length']:.0f}")
logger.info(f"{'Cycle len (sec)':<20}: {self.stats['cycle_duration']:.2f}")
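A throwaway sanity check of the `_log_stats` change above: the duration is derived from the two stored RT limits rather than a redundant `rt_duration_sec` entry. The retention-time values below are made up.

```python
import numpy as np

rt_values = np.array([12.0, 13.1, 5400.0])  # retention times in seconds (dummy data)
stats = {"rt_limit_min": float(rt_values.min()), "rt_limit_max": float(rt_values.max())}

rt_duration = stats["rt_limit_max"] - stats["rt_limit_min"]
print(f"{'RT duration (sec)':<20}: {rt_duration:.1f}")       # 5388.0
print(f"{'RT duration (min)':<20}: {rt_duration / 60:.1f}")  # 89.8
```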
1 change: 1 addition & 0 deletions docs/methods.md
Expand Up @@ -6,5 +6,6 @@ methods/command-line
methods/configuration
methods/calibration
methods/transfer-learning
methods/output-format
```