Skip to content

Commit

Permalink
automatic trainer shutdown (#27)
Browse files Browse the repository at this point in the history
* add option to automatically shutdown trainer after a defined idle time (used for fly trainers)

* improve logging

* correct env variable name

* move check_idle_timeout into separate function

* make time calculation more accurate

* print only two decimal places

---------

Co-authored-by: Niklas Neugebauer <[email protected]>
  • Loading branch information
denniswittich and NiklasNeugebauer authored Aug 30, 2024
1 parent 5dfe990 commit c0e229c
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 13 deletions.
27 changes: 14 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,20 @@ To start a node you have to implement the logic by inheriting from the correspon

You can configure the connection to our Learning Loop by setting the following environment variables before starting:

| Name | Alias | Purpose | Required by |
| ----------------------- | ------------ | ------------------------------------------------------------ | -------------------- |
| LOOP_HOST | HOST | Learning Loop address (e.g. learning-loop.ai) | all |
| LOOP_USERNAME | USERNAME | Learning Loop user name | all besides Detector |
| LOOP_PASSWORD | PASSWORD | Learning Loop password | all besides Detector |
| LOOP_SSL_CERT_PATH | - | Path to the SSL certificate | all (opt.) |
| LOOP_ORGANIZATION | ORGANIZATION | Organization name | Detector |
| LOOP_PROJECT | PROJECT | Project name | Detector |
| MIN_UNCERTAIN_THRESHOLD | PROJECT | smallest confidence (float) at which auto-upload will happen | Detector |
| MAX_UNCERTAIN_THRESHOLD | PROJECT | largest confidence (float) at which auto-upload will happen | Detector |
| INFERENCE_BATCH_SIZE | - | Batch size of trainer when calculating detections | Trainer (opt.) |
| RESTART_AFTER_TRAINING | - | Restart the trainer after training (set to 1) | Trainer (opt.) |
| KEEP_OLD_TRAININGS | - | Do not delete old trainings (set to 1) | Trainer (opt.) |
| Name | Alias | Purpose | Required by |
| ------------------------ | ------------ | ------------------------------------------------------------ | -------------------- |
| LOOP_HOST | HOST | Learning Loop address (e.g. learning-loop.ai) | all |
| LOOP_USERNAME | USERNAME | Learning Loop user name | all besides Detector |
| LOOP_PASSWORD | PASSWORD | Learning Loop password | all besides Detector |
| LOOP_SSL_CERT_PATH | - | Path to the SSL certificate | all (opt.) |
| LOOP_ORGANIZATION | ORGANIZATION | Organization name | Detector |
| LOOP_PROJECT | PROJECT | Project name | Detector |
| MIN_UNCERTAIN_THRESHOLD | PROJECT | smallest confidence (float) at which auto-upload will happen | Detector |
| MAX_UNCERTAIN_THRESHOLD | PROJECT | largest confidence (float) at which auto-upload will happen | Detector |
| INFERENCE_BATCH_SIZE | - | Batch size of trainer when calculating detections | Trainer (opt.) |
| RESTART_AFTER_TRAINING | - | Restart the trainer after training (set to 1) | Trainer (opt.) |
| KEEP_OLD_TRAININGS | - | Do not delete old trainings (set to 1) | Trainer (opt.) |
| TRAINER_IDLE_TIMEOUT_SEC | - | Automatically shut down the trainer after it has been idle for this many seconds (unset or 0 disables) | Trainer (opt.) |

#### Testing

Expand Down
26 changes: 26 additions & 0 deletions learning_loop_node/trainer/trainer_node.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import sys
import time
from dataclasses import asdict
from typing import Dict, Optional

Expand All @@ -20,6 +23,12 @@ def __init__(self, name: str, trainer_logic: TrainerLogicGeneric, uuid: Optional
self.last_training_io = LastTrainingIO(self.uuid)
self.trainer_logic._last_training_io = self.last_training_io

self.first_idle_time: float | None = None
self.idle_timeout = float(os.environ.get('TRAINER_IDLE_TIMEOUT_SEC', 0))
if self.idle_timeout:
self.log.info(
f'Trainer started with an idle_timeout of {self.idle_timeout} seconds. Note that shutdown does not work if docker container has the restart policy set to always')

self.include_router(controls.router, tags=["controls"])
if use_backdoor_controls:
self.include_router(backdoor_controls.router, tags=["controls"])
Expand All @@ -38,6 +47,7 @@ async def on_repeat(self):
if await self.trainer_logic.try_continue_run_if_incomplete():
return # NOTE: we prevent sending idle status after starting a continuation
await self.send_status()
self.check_idle_timeout()
except exceptions.TimeoutError:
self.log.warning('timeout when sending status to learning loop, reconnecting sio_client')
await self.sio_client.disconnect() # NOTE: reconnect happens in node._on_repeat
Expand Down Expand Up @@ -90,3 +100,19 @@ async def send_status(self):
result = await self.sio_client.call('update_trainer', jsonable_encoder(asdict(status)), timeout=30)
if isinstance(result, Dict) and not result['success']:
self.log.error(f'Error when sending status update: Response from loop was:\n {result}')

def check_idle_timeout(self):
    """Exit the process once the trainer has stayed idle longer than the timeout.

    Meant to be polled repeatedly (``on_repeat`` calls it after each status
    update). The first poll that sees the trainer in the ``'idle'`` state
    starts the idle clock; any poll that sees another state resets it. When
    the accumulated idle time exceeds ``self.idle_timeout`` the process is
    terminated via ``sys.exit(0)``.

    The feature is disabled when ``self.idle_timeout`` is unset, zero or
    negative.

    Raises:
        SystemExit: with exit code 0 when the idle timeout has been exceeded.
    """
    # A non-positive timeout means the feature is switched off. Without the
    # explicit sign check a negative value would be truthy and trigger an
    # immediate shutdown on the first idle poll.
    if not self.idle_timeout or self.idle_timeout < 0:
        return

    if self.trainer_logic.state != 'idle':
        # Any non-idle state interrupts the idle period; start over next time.
        self.first_idle_time = None
        return

    # Use the monotonic clock: it is unaffected by wall-clock adjustments
    # (NTP steps, manual changes), which would otherwise distort the elapsed
    # idle time and could cause a premature or indefinitely delayed shutdown.
    now = time.monotonic()
    if self.first_idle_time is None:
        self.first_idle_time = now

    idle_time = now - self.first_idle_time
    if idle_time > self.idle_timeout:
        self.log.info('Trainer has been idle for %.2f s (with timeout %.2f s). Shutting down.',
                      idle_time, self.idle_timeout)
        sys.exit(0)

    self.log.debug('idle time: %.2f s / %.2f s', idle_time, self.idle_timeout)

0 comments on commit c0e229c

Please sign in to comment.