Skip to content

Commit

Permalink
Fix logging of step runtime statistics
Browse files Browse the repository at this point in the history
This had to be moved to _ensemble.py because the state inside snapshot.py
only had information about steps that fell into the same batching
window; long-running steps would therefore have start_time=None
  • Loading branch information
berland committed Oct 31, 2024
1 parent c077aad commit b5eb5b7
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 33 deletions.
34 changes: 33 additions & 1 deletion src/ert/ensemble_evaluator/_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@
Union,
)

from _ert.events import Event, Id, event_from_dict, event_to_json
from _ert.events import (
Event,
FMEvent,
Id,
event_from_dict,
event_to_json,
)
from _ert.forward_model_runner.client import Client
from ert.config import ForwardModelStep, QueueConfig
from ert.run_arg import RunArg
Expand Down Expand Up @@ -142,9 +148,35 @@ def _create_snapshot(self) -> EnsembleSnapshot:
def get_successful_realizations(self) -> List[int]:
    """Delegate to the snapshot's ``get_successful_realizations()``.

    Returns:
        Whatever ``self.snapshot.get_successful_realizations()`` returns.
    """
    current_snapshot = self.snapshot
    return current_snapshot.get_successful_realizations()

def _log_completed_fm_step(
    self, event: FMEvent, step_snapshot: FMStepSnapshot
) -> None:
    """Log runtime statistics for a completed forward model step.

    Emits a single warning-level log record containing the event type, the
    step name, the wall-clock time, the CPU seconds, the current memory
    usage, and the step index, realization and ensemble taken from the event.

    Args:
        event: The forward model step event to log. Its ``time`` is used as
            the end of the step when computing the wall-clock time.
        step_snapshot: Snapshot entry for the step. The keys ``name``,
            ``start_time``, ``cpu_seconds`` and ``current_memory_usage`` are
            read from it; missing keys are tolerated.
    """
    step_name = step_snapshot.get("name", "")
    start_time = step_snapshot.get("start_time")
    cpu_seconds = step_snapshot.get("cpu_seconds")
    current_memory_usage = step_snapshot.get("current_memory_usage")

    # Keep walltime a float on every path so the variable has one type.
    # NaN marks a duration that cannot be computed because either the start
    # time or the event time is unknown; it is logged as ``walltime=nan``.
    walltime = float("nan")
    if start_time is not None and event.time is not None:
        walltime = (event.time - start_time).total_seconds()

    logger.warning(
        f"{event.event_type} {step_name} "
        f"{walltime=} "
        f"{cpu_seconds=} "
        f"{current_memory_usage=} "
        f"step_index={event.fm_step} "
        f"real={event.real} "
        f"ensemble={event.ensemble}"
    )

def update_snapshot(self, events: Sequence[Event]) -> EnsembleSnapshot:
snapshot_mutate_event = EnsembleSnapshot()
for event in events:
if event.event_type in [
Id.FORWARD_MODEL_STEP_SUCCESS,
Id.FORWARD_MODEL_STEP_FAILURE,
]:
self._log_completed_fm_step(
event, self.snapshot.reals[event.real]["fm_steps"][event.fm_step]

Check failure on line 178 in src/ert/ensemble_evaluator/_ensemble.py

View workflow job for this annotation

GitHub Actions / type-checking (3.12)

Argument 1 to "_log_completed_fm_step" of "LegacyEnsemble" has incompatible type "ForwardModelStepStart | ForwardModelStepRunning | ForwardModelStepSuccess | ForwardModelStepFailure | ForwardModelStepChecksum | RealizationPending | RealizationRunning | RealizationSuccess | RealizationFailed | RealizationTimeout | RealizationUnknown | RealizationWaiting | EESnapshot | EESnapshotUpdate | EETerminated | EEUserCancel | EEUserDone | EnsembleStarted | EnsembleSucceeded | EnsembleFailed | EnsembleCancelled"; expected "ForwardModelStepStart | ForwardModelStepRunning | ForwardModelStepSuccess | ForwardModelStepFailure"

Check failure on line 178 in src/ert/ensemble_evaluator/_ensemble.py

View workflow job for this annotation

GitHub Actions / type-checking (3.12)

Item "EESnapshot" of "ForwardModelStepStart | ForwardModelStepRunning | ForwardModelStepSuccess | ForwardModelStepFailure | ForwardModelStepChecksum | RealizationPending | RealizationRunning | RealizationSuccess | RealizationFailed | RealizationTimeout | RealizationUnknown | RealizationWaiting | EESnapshot | EESnapshotUpdate | EETerminated | EEUserCancel | EEUserDone | EnsembleStarted | EnsembleSucceeded | EnsembleFailed | EnsembleCancelled" has no attribute "real"

Check failure on line 178 in src/ert/ensemble_evaluator/_ensemble.py

View workflow job for this annotation

GitHub Actions / type-checking (3.12)

Item "EESnapshotUpdate" of "ForwardModelStepStart | ForwardModelStepRunning | ForwardModelStepSuccess | ForwardModelStepFailure | ForwardModelStepChecksum | RealizationPending | RealizationRunning | RealizationSuccess | RealizationFailed | RealizationTimeout | RealizationUnknown | RealizationWaiting | EESnapshot | EESnapshotUpdate | EETerminated | EEUserCancel | EEUserDone | EnsembleStarted | EnsembleSucceeded | EnsembleFailed | EnsembleCancelled" has no attribute "real"

Check failure on line 178 in src/ert/ensemble_evaluator/_ensemble.py

View workflow job for this annotation

GitHub Actions / type-checking (3.12)

Item "EETerminated" of "ForwardModelStepStart | ForwardModelStepRunning | ForwardModelStepSuccess | ForwardModelStepFailure | ForwardModelStepChecksum | RealizationPending | RealizationRunning | RealizationSuccess | RealizationFailed | RealizationTimeout | RealizationUnknown | RealizationWaiting | EESnapshot | EESnapshotUpdate | EETerminated | EEUserCancel | EEUserDone | EnsembleStarted | EnsembleSucceeded | EnsembleFailed | EnsembleCancelled" has no attribute "real"

Check failure on line 178 in src/ert/ensemble_evaluator/_ensemble.py

View workflow job for this annotation

GitHub Actions / type-checking (3.12)

Item "EEUserCancel" of "ForwardModelStepStart | ForwardModelStepRunning | ForwardModelStepSuccess | ForwardModelStepFailure | ForwardModelStepChecksum | RealizationPending | RealizationRunning | RealizationSuccess | RealizationFailed | RealizationTimeout | RealizationUnknown | RealizationWaiting | EESnapshot | EESnapshotUpdate | EETerminated | EEUserCancel | EEUserDone | EnsembleStarted | EnsembleSucceeded | EnsembleFailed | EnsembleCancelled" has no attribute "real"

Check failure on line 178 in src/ert/ensemble_evaluator/_ensemble.py

View workflow job for this annotation

GitHub Actions / type-checking (3.12)

Item "EEUserDone" of "ForwardModelStepStart | ForwardModelStepRunning | ForwardModelStepSuccess | ForwardModelStepFailure | ForwardModelStepChecksum | RealizationPending | RealizationRunning | RealizationSuccess | RealizationFailed | RealizationTimeout | RealizationUnknown | RealizationWaiting | EESnapshot | EESnapshotUpdate | EETerminated | EEUserCancel | EEUserDone | EnsembleStarted | EnsembleSucceeded | EnsembleFailed | EnsembleCancelled" has no attribute "real"

Check failure on line 178 in src/ert/ensemble_evaluator/_ensemble.py

View workflow job for this annotation

GitHub Actions / type-checking (3.12)

Item "EnsembleStarted" of "ForwardModelStepStart | ForwardModelStepRunning | ForwardModelStepSuccess | ForwardModelStepFailure | ForwardModelStepChecksum | RealizationPending | RealizationRunning | RealizationSuccess | RealizationFailed | RealizationTimeout | RealizationUnknown | RealizationWaiting | EESnapshot | EESnapshotUpdate | EETerminated | EEUserCancel | EEUserDone | EnsembleStarted | EnsembleSucceeded | EnsembleFailed | EnsembleCancelled" has no attribute "real"

Check failure on line 178 in src/ert/ensemble_evaluator/_ensemble.py

View workflow job for this annotation

GitHub Actions / type-checking (3.12)

Item "EnsembleSucceeded" of "ForwardModelStepStart | ForwardModelStepRunning | ForwardModelStepSuccess | ForwardModelStepFailure | ForwardModelStepChecksum | RealizationPending | RealizationRunning | RealizationSuccess | RealizationFailed | RealizationTimeout | RealizationUnknown | RealizationWaiting | EESnapshot | EESnapshotUpdate | EETerminated | EEUserCancel | EEUserDone | EnsembleStarted | EnsembleSucceeded | EnsembleFailed | EnsembleCancelled" has no attribute "real"
)
snapshot_mutate_event = snapshot_mutate_event.update_from_event(
event, source_snapshot=self.snapshot
)
Expand Down
32 changes: 0 additions & 32 deletions src/ert/ensemble_evaluator/snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,38 +335,6 @@ def update_from_event(
elif e_type in {ForwardModelStepSuccess, ForwardModelStepFailure}:
end_time = convert_iso8601_to_datetime(timestamp)

try:
start_time = self._fm_step_snapshots[event.real, event.fm_step].get(
"start_time"
)
cpu_seconds = self._fm_step_snapshots[
event.real, event.fm_step
].get("cpu_seconds")
fm_step_name = source_snapshot.reals[event.real]["fm_steps"][
event.fm_step
]["name"]
if start_time is not None:
logger.warning(
f"{event.event_type} {fm_step_name} "
f"walltime={(end_time - start_time).total_seconds()} "
f"cputime={cpu_seconds} "
f"ensemble={event.ensemble} "
f"step_index={event.fm_step} "
f"real={event.real}"
)
else:
logger.warning(
f"Should log fm_step runtime, but start_time was None, "
f"{event.event_type} {fm_step_name=} "
f"endtime={end_time.isoformat()} "
f"cputime={cpu_seconds} "
f"ensemble={event.ensemble} "
f"step_index={event.fm_step} "
f"real={event.real}"
)
except BaseException as e:
logger.warning(f"Should log fm_step runtime, but got exception {e}")

if type(event) is ForwardModelStepFailure:
error = event.error_msg if event.error_msg else ""
else:
Expand Down

0 comments on commit b5eb5b7

Please sign in to comment.