Skip to content

Commit

Permalink
feature: trainer logic
Browse files Browse the repository at this point in the history
adds an exception type that triggers a node restart
  • Loading branch information
denniswittich committed Oct 24, 2024
1 parent c8ac35d commit c4dbe7a
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
12 changes: 11 additions & 1 deletion learning_loop_node/trainer/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
class CriticalError(Exception):
    '''
    Raised when the training cannot be continued.

    In this case the trainer jumps to TrainerState.ReadyForCleanup and tries to upload the latest model.
    '''


class NodeNeedsRestartError(Exception):
    '''
    Signals that the node has to be restarted before work can go on.

    A typical trigger is the GPU no longer being available.
    '''
5 changes: 4 additions & 1 deletion learning_loop_node/trainer/trainer_logic_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
TrainingOut, TrainingStateData)
from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4
from .downloader import TrainingsDownloader
from .exceptions import CriticalError
from .exceptions import CriticalError, NodeNeedsRestartError
from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO

if TYPE_CHECKING:
Expand Down Expand Up @@ -294,6 +294,9 @@ async def _perform_state(self, error_key: str, state_during: TrainerState, state
logger.error('CriticalError in %s - Exception: %s', state_during, e)
self.errors.set(error_key, str(e))
self.training.training_state = TrainerState.ReadyForCleanup
except NodeNeedsRestartError:
logger.error('Node Restart Requested')
sys.exit(0)
except Exception as e:
self.errors.set(error_key, str(e))
logger.exception('Error in %s - Exception: %s', state_during, e)
Expand Down

0 comments on commit c4dbe7a

Please sign in to comment.