From ff4721870b5441b7e23d793ddc86040bea6f75ca Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 6 Mar 2024 15:13:18 +0100
Subject: [PATCH 01/62] Start cleanup process. Remove converter code and do
 major cleanup of data_exchanger. No intended API changes so far

---
 learning_loop_node/converter/__init__.py      |   0
 .../converter/converter_logic.py              |  68 -------
 .../converter/converter_node.py               | 125 ------------
 .../converter/tests/test_converter.py         |  55 -----
 learning_loop_node/data_exchanger.py          | 189 ++++++------------
 .../detector/tests/testing_detector.py        |   9 +-
 learning_loop_node/helpers/misc.py            |  36 ++++
 learning_loop_node/node.py                    |  33 ++-
 learning_loop_node/tests/test_downloader.py   |  11 +-
 learning_loop_node/trainer/downloader.py      |   2 +-
 learning_loop_node/trainer/trainer_logic.py   |  16 +-
 11 files changed, 124 insertions(+), 420 deletions(-)
 delete mode 100644 learning_loop_node/converter/__init__.py
 delete mode 100644 learning_loop_node/converter/converter_logic.py
 delete mode 100644 learning_loop_node/converter/converter_node.py
 delete mode 100644 learning_loop_node/converter/tests/test_converter.py

diff --git a/learning_loop_node/converter/__init__.py b/learning_loop_node/converter/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/learning_loop_node/converter/converter_logic.py b/learning_loop_node/converter/converter_logic.py
deleted file mode 100644
index cef82eff..00000000
--- a/learning_loop_node/converter/converter_logic.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import json
-import os
-import shutil
-from abc import abstractmethod
-from typing import List, Optional
-
-from ..data_classes import ModelInformation
-from ..node import Node
-
-
-class ConverterLogic():
-
-    def __init__(
-            self, source_format: str, target_format: str):
-        self.source_format = source_format
-        self.target_format = target_format
-        self._node: Optional[Node] = None
-        self.model_folder: Optional[str] = None
-
-    def init(self, node: Node) -> None:
-        self._node = node
-
-    @property
-    def node(self) -> Node:
-        if self._node is None:
-            raise Exception('ConverterLogic not initialized')
-        return self._node
-
-    async def convert(self, model_information: ModelInformation) -> None:
-        project_folder = Node.create_project_folder(model_information.context)
-
-        self.model_folder = ConverterLogic.create_model_folder(project_folder, model_information.id)
-        await self.node.data_exchanger.download_model(self.model_folder,
-                                                      model_information.context,
-                                                      model_information.id,
-                                                      self.source_format)
-
-        with open(f'{self.model_folder}/model.json', 'r') as f:
-            content = json.load(f)
-            if 'resolution' in content:
-                model_information.resolution = content['resolution']
-
-        await self._convert(model_information)
-
-    async def upload_model(self, context, model_id: str) -> None:
-        files = self.get_converted_files(model_id)
-        await self.node.data_exchanger.upload_model(context, files, model_id, self.target_format)
-
-    @abstractmethod
-    async def _convert(self, model_information: ModelInformation) -> None:
-        """Converts the model in self.model_folder to the target format."""
-
-    @abstractmethod
-    def get_converted_files(self, model_id) -> List[str]:
-        """Returns a list of files that should be uploaded to the server."""
-
-    @staticmethod
-    def create_convert_folder(project_folder: str) -> str:
-        image_folder = f'{project_folder}/images'
-        os.makedirs(image_folder, exist_ok=True)
-        return image_folder
-
-    @staticmethod
-    def create_model_folder(project_folder: str, model_id: str) -> str:
-        model_folder = f'{project_folder}/{model_id}'
-        shutil.rmtree(model_folder, ignore_errors=True)  # cleanup
-        os.makedirs(model_folder, exist_ok=True)
-        return model_folder
diff --git a/learning_loop_node/converter/converter_node.py b/learning_loop_node/converter/converter_node.py
deleted file mode 100644
index f23dd26e..00000000
--- a/learning_loop_node/converter/converter_node.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import logging
-from dataclasses import asdict
-from http import HTTPStatus
-from typing import List, Optional
-
-from dacite import from_dict
-from fastapi.encoders import jsonable_encoder
-from fastapi_utils.tasks import repeat_every
-from socketio import AsyncClient
-
-from ..data_classes import Category, ModelInformation, NodeState
-from ..node import Node
-from .converter_logic import ConverterLogic
-
-
-class ConverterNode(Node):
-    converter: ConverterLogic
-    skip_check_state: bool = False
-    bad_model_ids: List[str] = []
-
-    def __init__(self, name: str, converter: ConverterLogic, uuid: Optional[str] = None):
-        super().__init__(name, uuid)
-        self.converter = converter
-        converter.init(self)
-
-        @self.on_event("startup")
-        @repeat_every(seconds=60, raise_exceptions=True, wait_first=False)
-        async def check_state():
-            if not self.skip_check_state:
-                try:
-                    await self.check_state()
-                except Exception:
-                    logging.error('could not check state. Is loop reachable?')
-
-    async def convert_model(self, model_information: ModelInformation):
-        if model_information.id in self.bad_model_ids:
-            logging.info(
-                f'skipping bad model model {model_information.id} for {model_information.context.organization}/{model_information.context.project}.')
-            return
-        try:
-            logging.info(
-                f'converting model {jsonable_encoder(asdict(model_information))}')
-            await self.converter.convert(model_information)
-            logging.info('uploading model ')
-            await self.converter.upload_model(model_information.context, model_information.id)
-        except Exception as e:
-            self.bad_model_ids.append(model_information.id)
-            logging.error(
-                f'could not convert model {model_information.id} for {model_information.context.organization}/{model_information.context.project}. Details: {str(e)}.')
-
-    async def check_state(self):
-        logging.info(f'checking state: {self.status.state}')
-
-        if self.status.state == NodeState.Running:
-            return
-        self.status.state = NodeState.Running
-        try:
-            await self.convert_models()
-        except Exception as exc:
-            logging.error(str(exc))
-
-        self.status.state = NodeState.Idle
-
-    async def convert_models(self) -> None:
-        try:
-            response = await self.loop_communicator.get('/projects')
-            assert response.status_code == 200, f'Assert statuscode 200, but was {response.status_code}.'
-            content = response.json()
-            projects = content['projects']
-
-            for project in projects:
-                organization_id = project['organization_id']
-                project_id = project['project_id']
-
-                response = await self.loop_communicator.get(f'{project["resource"]}')
-                if response.status_code != HTTPStatus.OK:
-                    logging.error(f'got bad response for {response.url}: {str(response.status_code)}')
-                    continue
-
-                project_categories = [from_dict(data_class=Category, data=c) for c in response.json()['categories']]
-
-                path = f'{project["resource"]}/models'
-                models_response = await self.loop_communicator.get(path)
-                assert models_response.status_code == 200
-                content = models_response.json()
-                models = content['models']
-
-                for model in models:
-                    if (model['version']
-                                and self.converter.source_format in model['formats']
-                                and self.converter.target_format not in model['formats']
-                            ):
-                        # if self.converter.source_format in model['formats'] and project_id == 'drawingbot' and model['version'] == "6.0":
-                        model_information = ModelInformation(
-                            host=self.loop_communicator.base_url,
-                            organization=organization_id,
-                            project=project_id,
-                            id=model['id'],
-                            categories=project_categories,
-                            version=model['version'],
-                        )
-                        await self.convert_model(model_information)
-        except Exception:
-            logging.exception('could not convert models')
-
-    async def send_status(self):
-        pass
-
-    async def on_startup(self):
-        pass
-
-    async def on_shutdown(self):
-        pass
-
-    async def on_repeat(self):
-        pass
-
-    def register_sio_events(self, sio_client: AsyncClient):
-        pass
-
-    async def get_state(self):
-        return NodeState.Idle  # NOTE unused for this node type
-
-    def get_node_type(self):
-        return 'converter'
diff --git a/learning_loop_node/converter/tests/test_converter.py b/learning_loop_node/converter/tests/test_converter.py
deleted file mode 100644
index 7328806f..00000000
--- a/learning_loop_node/converter/tests/test_converter.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import logging
-from typing import List
-
-import pytest
-
-from learning_loop_node.converter.converter_logic import ConverterLogic
-from learning_loop_node.converter.converter_node import ConverterNode
-from learning_loop_node.data_classes import ModelInformation
-from learning_loop_node.loop_communication import LoopCommunicator
-from learning_loop_node.tests import test_helper
-
-
-class TestConverter(ConverterLogic):
-    __test__ = False  # hint for pytest
-
-    def __init__(self, source_format: str, target_format: str,  models: List[ModelInformation]):
-        super().__init__(source_format, target_format)
-        self.models = models
-
-    async def _convert(self, model_information: ModelInformation) -> None:
-        self.models.append(model_information)
-
-    def get_converted_files(self, model_id) -> List[str]:
-        return []  # test: test_meta_information fails because model cannot be uploaded
-
-
-@pytest.mark.asyncio
-@pytest.fixture()
-async def setup_converter_test_project(glc: LoopCommunicator):
-    await glc.delete("/zauberzeug/projects/pytest_conv?keep_images=true")
-    project_configuration = {
-        'project_name': 'pytest_conv', 'box_categories': 1, 'point_categories': 1, 'inbox': 0, 'annotate': 0, 'review': 0,
-        'complete': 0, 'image_style': 'plain', 'thumbs': False, 'trainings': 1}
-    r = await glc.post("/zauberzeug/projects/generator", json=project_configuration)
-    assert r.status_code == 200
-    yield
-    await glc.delete("/zauberzeug/projects/pytest?keep_images=true")
-
-
-# pylint: disable=redefined-outer-name, unused-argument
-@pytest.mark.asyncio
-async def test_meta_information(setup_converter_test_project):
-    model_id = await test_helper.get_latest_model_id(project='pytest_conv')
-
-    converter = TestConverter(source_format='mocked', target_format='test', models=[])
-    node = ConverterNode(name='test', converter=converter)
-    await node.convert_models()
-
-    pytest_project_model = [m for m in converter.models if m.id == model_id][0]
-
-    categories = pytest_project_model.categories
-    assert len(categories) == 2
-    category_types = [category.type for category in categories]
-    assert 'box' in category_types
-    assert 'point' in category_types
diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py
index 23f19976..361f66cf 100644
--- a/learning_loop_node/data_exchanger.py
+++ b/learning_loop_node/data_exchanger.py
@@ -2,23 +2,18 @@
 import logging
 import os
 import shutil
-import time
 import zipfile
 from glob import glob
 from http import HTTPStatus
 from io import BytesIO
-from time import perf_counter
 from typing import Dict, List, Optional
 
 import aiofiles
-from tqdm.asyncio import tqdm
 
 from .data_classes import Context
-from .helpers.misc import create_resource_paths, create_task
+from .helpers.misc import create_resource_paths, create_task, is_valid_image
 from .loop_communication import LoopCommunicator
 
-check_jpeg = shutil.which('jpeginfo') is not None
-
 
 class DownloadError(Exception):
 
@@ -30,120 +25,81 @@ def __init__(self, cause: str, *args: object) -> None:
 class DataExchanger():
 
     def __init__(self, context: Optional[Context], loop_communicator: LoopCommunicator):
-        self.context = context
+        self.set_context(context)
+        self.progress = 0.0
         self.loop_communicator = loop_communicator
+
+        self.check_jpeg = shutil.which('jpeginfo') is not None
+        if self.check_jpeg:
+            logging.info('Detected command line tool "jpeginfo". Images will be checked for validity')
+        else:
+            logging.error('Missing command line tool "jpeginfo". We cannot check for validity of images.')
+
+    def set_context(self, context: Optional[Context]) -> None:
+        self._context = context
         self.progress = 0.0
 
-    def set_context(self, context: Context):
-        self.context = context
+    @property
+    def context(self) -> Context:
+        assert self._context, 'DataExchanger: Context was not set yet.. call set_context() first.'
+        return self._context
 
-    async def fetch_image_ids(self, query_params: Optional[str] = '') -> List[str]:
-        if self.context is None:
-            logging.warning('context was not set yet')
-            return []
+    # ---------------------------- END OF INIT ----------------------------
+
+    async def fetch_image_uuids(self, query_params: Optional[str] = '') -> List[str]:
+        """Fetch image uuids from the learning loop data endpoint."""
+        logging.info(f'Fetching image uuids for {self.context.organization}/{self.context.project}..')
 
         response = await self.loop_communicator.get(f'/{self.context.organization}/projects/{self.context.project}/data?{query_params}')
         assert response.status_code == 200, response
         return (response.json())['image_ids']
 
-    async def download_images_data(self, ids: List[str]) -> List[Dict]:
-        '''Download image annotations etc.'''
-        if self.context is None:
-            logging.warning('context was not set yet')
-            return []
-
-        return await self._download_images_data(self.context.organization, self.context.project, ids)
-
-    async def download_images(self, image_ids: List[str], image_folder: str) -> None:
-        '''Download images. Will skip existing images'''
-        if self.context is None:
-            logging.warning('context was not set yet')
-            return
-
-        new_image_ids = await asyncio.get_event_loop().run_in_executor(None, DataExchanger.filter_existing_images, image_ids, image_folder)
-        paths, ids = create_resource_paths(self.context.organization, self.context.project, new_image_ids)
-        await self._download_images(paths, ids, image_folder)
-
-    @staticmethod
-    async def delete_corrupt_images(image_folder: str) -> None:
-        logging.info('deleting corrupt images')
-        n_deleted = 0
-        for image in glob(f'{image_folder}/*.jpg'):
-            if not await DataExchanger.is_valid_image(image):
-                logging.debug(f'  deleting image {image}')
-                os.remove(image)
-                n_deleted += 1
-
-        logging.info(f'deleted {n_deleted} images')
-
-    @staticmethod
-    def filter_existing_images(all_image_ids, image_folder) -> List[str]:
-        logging.info(f'### Going to filter {len(all_image_ids)} images ids')
-        start = perf_counter()
-        ids = [os.path.splitext(os.path.basename(image))[0]
-               for image in glob(f'{image_folder}/*.jpg')]
-        logging.info(f'found {len(ids)} images on disc')
-        result = [id for id in all_image_ids if id not in ids]
-        end = perf_counter()
-        logging.info(f'calculated {len(result)} new image ids, which took {end-start:0.2f} seconds')
-        return result
-
-    def jepeg_check_info(self):
-        if check_jpeg:
-            logging.info('Detected command line tool "jpeginfo". Images will be checked for validity')
-        else:
-            logging.error('Missing command line tool "jpeginfo". We can not check for validity of images.')
+    async def download_images_data(self, image_uuids: List[str], chunk_size: int = 100) -> List[Dict]:
+        """Download image annotations, tags, set and other information for the given image uuids."""
+        logging.info(f'Fetching annotations, tags, sets, etc. for {len(image_uuids)} images..')
 
-    async def _download_images_data(self, organization: str, project: str, image_ids: List[str], chunk_size: int = 100) -> List[Dict]:
-        logging.info('fetching annotations and other image data')
-        num_image_ids = len(image_ids)
-        self.jepeg_check_info()
-        images_data = []
+        num_image_ids = len(image_uuids)
         if num_image_ids == 0:
             logging.info('got empty list. No images were downloaded')
-            return images_data
-        starttime = time.time()
+            return []
+
         progress_factor = 0.5 / num_image_ids  # 50% of progress is for downloading data
-        for i in tqdm(range(0, num_image_ids, chunk_size), position=0, leave=True):
+        images_data: List[Dict] = []
+        for i in range(0, num_image_ids, chunk_size):
             self.progress = i * progress_factor
-            chunk_ids = image_ids[i:i+chunk_size]
-            response = await self.loop_communicator.get(f'/{organization}/projects/{project}/images?ids={",".join(chunk_ids)}')
+            chunk_ids = image_uuids[i:i+chunk_size]
+            response = await self.loop_communicator.get(f'/{self.context.organization}/projects/{self.context.project}/images?ids={",".join(chunk_ids)}')
             if response.status_code != 200:
-                logging.error(
-                    f'Error during downloading list of images. Statuscode is {response.status_code}')
+                logging.error(f'Error {response.status_code} during downloading image data. Continue with next batch..')
                 continue
             images_data += response.json()['images']
-            total_time = round(time.time() - starttime, 1)
-            if images_data:
-                per100 = total_time / len(images_data) * 100
-                logging.debug(f'[+] Performance: {total_time} sec total. Per 100 : {per100:.1f} sec')
-            else:
-                logging.debug(f'[+] Performance: {total_time} sec total.')
+
         return images_data
 
-    async def _download_images(self, paths: List[str], image_ids: List[str], image_folder: str, chunk_size: int = 10) -> None:
-        num_image_ids = len(image_ids)
-        if num_image_ids == 0:
-            logging.debug('got empty list. No images were downloaded')
+    async def download_images(self, image_uuids: List[str], image_folder: str, chunk_size: int = 10) -> None:
+        """Download the image files for image_uuids into image_folder, skipping ids that already exist as <id>.jpg."""
+        logging.info(f'Downloading {len(image_uuids)} images (actual image data).. skipping existing images.')
+        if not image_uuids:
             return
-        logging.info('fetching image files')
-        starttime = time.time()
+
+        existing_uuids = {os.path.splitext(os.path.basename(image))[0] for image in glob(f'{image_folder}/*.jpg')}
+        new_image_uuids = [id for id in image_uuids if id not in existing_uuids]
+
+        paths, ids = create_resource_paths(self.context.organization, self.context.project, new_image_uuids)
+        num_image_ids = len(image_uuids)
         os.makedirs(image_folder, exist_ok=True)
 
         progress_factor = 0.5 / num_image_ids  # second 50% of progress is for downloading images
-        for i in tqdm(range(0, num_image_ids, chunk_size), position=0, leave=True):
+        for i in range(0, num_image_ids, chunk_size):
             self.progress = 0.5 + i * progress_factor
             chunk_paths = paths[i:i+chunk_size]
-            chunk_ids = image_ids[i:i+chunk_size]
+            chunk_ids = ids[i:i+chunk_size]  # slice the filtered ids so every id stays aligned with its path
             tasks = []
             for j, chunk_j in enumerate(chunk_paths):
-                tasks.append(create_task(self.download_one_image(chunk_j, chunk_ids[j], image_folder)))
+                tasks.append(create_task(self._download_one_image(chunk_j, chunk_ids[j], image_folder)))
             await asyncio.gather(*tasks)
-            total_time = round(time.time() - starttime, 1)
-            per100 = total_time / (i + len(tasks)) * 100
-            logging.debug(f'[+] Performance (image files): {total_time} sec total. Per 100 : {per100:.1f}')
 
-    async def download_one_image(self, path: str, image_id: str, image_folder: str) -> None:
+    async def _download_one_image(self, path: str, image_id: str, image_folder: str) -> None:
         response = await self.loop_communicator.get(path)
         if response.status_code != HTTPStatus.OK:
             logging.error(f'bad status code {response.status_code} for {path}')
@@ -151,41 +107,25 @@ async def download_one_image(self, path: str, image_id: str, image_folder: str)
         filename = f'{image_folder}/{image_id}.jpg'
         async with aiofiles.open(filename, 'wb') as f:
             await f.write(response.content)
-        if not await self.is_valid_image(filename):
+        if not await is_valid_image(filename, self.check_jpeg):
             os.remove(filename)
 
-    @staticmethod
-    async def is_valid_image(filename: str) -> bool:
-        if not os.path.isfile(filename) or os.path.getsize(filename) == 0:
-            return False
-        if not check_jpeg:
-            return True
-
-        info = await asyncio.create_subprocess_shell(
-            f'jpeginfo -c {filename}',
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.PIPE)
-        out, _ = await info.communicate()
-        return "OK" in out.decode()
-
     async def download_model(self, target_folder: str, context: Context, model_id: str, model_format: str) -> List[str]:
+        """Downloads a model and returns the paths of the downloaded files."""
+        logging.info(f'Downloading model {model_id} to {target_folder}..')
+
         path = f'/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
         response = await self.loop_communicator.get(path, requires_login=False)
         if response.status_code != 200:
             content = response.json()
-            logging.error(
-                f'could not download {self.loop_communicator.base_url}/{path}: {response.status_code}, content: {content}')
+            logging.error(f'could not download loop/{path}: {response.status_code}, content: {content}')
             raise DownloadError(content['detail'])
         try:
             provided_filename = response.headers.get(
                 "Content-Disposition").split("filename=")[1].strip('"')
             content = response.content
         except:
-            logging.error(f'Error during downloading model {path}:')
-            try:
-                logging.exception(response.json())
-            except Exception:
-                pass
+            logging.exception(f'Error during downloading model {path}:')
             raise
 
         # unzip and place downloaded model
@@ -194,29 +134,20 @@ async def download_model(self, target_folder: str, context: Context, model_id: s
         with zipfile.ZipFile(BytesIO(content), 'r') as zip_:
             zip_.extractall(tmp_path)
 
-        logging.info(f'---- downloaded model {model_id} to {tmp_path}.')
-
         created_files = []
-        files = glob(f'{tmp_path}/**/*', recursive=True)
-        for file in files:
+        for file in glob(f'{tmp_path}/**/*', recursive=True):
             new_file = shutil.move(file, target_folder)
-            logging.info(f'moved model file {os.path.basename(file)} to {new_file}.')
             created_files.append(new_file)
-        return created_files
 
-    async def upload_model(self, context: Context, files: List[str], model_id: str, mformat: str) -> None:
-        response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/models/{model_id}/{mformat}/file', files=files)
-        if response.status_code != 200:
-            msg = f'---- could not upload model with id {model_id} and format {mformat}. Details: {response.text}'
-            raise Exception(msg)
-        logging.info(f'---- uploaded model with id {model_id} and format {mformat}.')
+        logging.info(f'---- downloaded model {model_id}/{model_format} to {tmp_path}. Moved to {target_folder}.')
+        return created_files
 
-    async def upload_model_for_training(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]:
-        """Returns the new model uuid to use for detection."""
+    async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]:
+        """Used by the trainers. Function returns the new model uuid to use for detection."""
         response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/trainings/{training_number}/models/latest/{mformat}/file', files=files)
         if response.status_code != 200:
-            msg = f'---- could not upload model for training {training_number} and format {mformat}. Details: {response.text}'
-            logging.error(msg)
+            logging.error(
+                f'---- could not upload model for training {training_number} and format {mformat}. Details: {response.text}')
             response.raise_for_status()
             return None
         else:
diff --git a/learning_loop_node/detector/tests/testing_detector.py b/learning_loop_node/detector/tests/testing_detector.py
index ed710824..95dd1300 100644
--- a/learning_loop_node/detector/tests/testing_detector.py
+++ b/learning_loop_node/detector/tests/testing_detector.py
@@ -4,7 +4,7 @@
 
 from learning_loop_node import DetectorLogic
 from learning_loop_node.conftest import get_dummy_detections
-from learning_loop_node.data_classes import Category, Detections, ModelInformation
+from learning_loop_node.data_classes import Detections
 
 
 class TestingDetectorLogic(DetectorLogic):
@@ -20,10 +20,3 @@ def init(self) -> None:
     def evaluate(self, image: np.ndarray) -> Detections:
         logging.info('evaluating')
         return self.det_to_return
-
-        # return Detections(
-        #     box_detections=[BoxDetection(category_name='some_category_name', x=1, y=2, height=3, width=4,
-        #                                  model_name='some_model', confidence=.42, category_id='some_id')],
-        #     point_detections=[PointDetection(category_name='some_category_name_2', x=10, y=12,
-        #                                      model_name='some_model', confidence=.42, category_id='some_id')]
-        # )
diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py
index 3eda99c5..b7e7d18f 100644
--- a/learning_loop_node/helpers/misc.py
+++ b/learning_loop_node/helpers/misc.py
@@ -4,7 +4,9 @@
 import logging
 import os
 from dataclasses import asdict
+from glob import glob
 from typing import Any, Coroutine, List, Optional, Tuple, TypeVar
+from uuid import UUID
 
 import pynvml
 
@@ -56,6 +58,32 @@ def get_free_memory_mb() -> float:  # TODO check if this is used
     return free
 
 
+async def is_valid_image(filename: str, check_jpeg: bool) -> bool:
+    """Return True if filename is a non-empty file that (when check_jpeg is set) passes `jpeginfo -c`."""
+    if not os.path.isfile(filename) or os.path.getsize(filename) == 0:
+        return False
+    if not check_jpeg:
+        return True
+    # Run jpeginfo directly (no shell) so spaces or metacharacters in the path cannot alter the command.
+    info = await asyncio.create_subprocess_exec(
+        'jpeginfo', '-c', filename, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
+    out, _ = await info.communicate()
+    return "OK" in out.decode()
+
+
+async def delete_corrupt_images(image_folder: str, check_jpeg: bool = False) -> None:
+    """Delete every '*.jpg' file in image_folder for which is_valid_image() returns False."""
+    logging.info('deleting corrupt images')
+    n_deleted = 0
+    for image in glob(f'{image_folder}/*.jpg'):
+        if not await is_valid_image(image, check_jpeg):
+            logging.debug(f'  deleting image {image}')
+            os.remove(image)
+            n_deleted += 1
+
+    logging.info(f'deleted {n_deleted} images')
+
+
 def create_resource_paths(organization_name: str, project_name: str, image_ids: List[str]) -> Tuple[List[str], List[str]]:
     # TODO: experimental:
     return [f'/{organization_name}/projects/{project_name}/images/{id}/main' for id in image_ids], image_ids
@@ -107,3 +135,11 @@ async def wrapper_ensure_socket_response(*args, **kwargs):
             return asdict(SocketResponse.for_failure(str(e)))
 
     return wrapper_ensure_socket_response
+
+
+def is_valid_uuid4(val):
+    """Return True only if str(val) parses as a UUID whose version field is 4."""
+    try:
+        return UUID(str(val)).version == 4
+    except ValueError:
+        return False
diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py
index ffce72f7..00e0313b 100644
--- a/learning_loop_node/node.py
+++ b/learning_loop_node/node.py
@@ -24,34 +24,38 @@
 
 class Node(FastAPI):
 
-    def __init__(self, name: str, uuid: Optional[str] = None):
+    def __init__(self, name: str, uuid: Optional[str] = None, needs_login: bool = True):
         """Base class for all nodes. A node is a process that communicates with the zauberzeug learning loop.
 
         Args:
             name (str): The name of the node. This name is used to generate a uuid.
             uuid (Optional[str]): The uuid of the node. If None, a uuid is generated based on the name 
                 and stored in f'{GLOBALS.data_folder}/uuids.json'. 
-                From the second run, the uuid is recovered based on the name of the node. Defaults to None.
+                From the second run, the uuid is recovered based on the name of the node.
+            needs_login (bool): If True, the node will try to login to the learning loop.
         """
 
         super().__init__()
         log_conf.init()
 
+        self.name = name
+        self.uuid = uuid or self.read_or_create_uuid(self.name)
+        self.needs_login = needs_login
+
         self.log = logging.getLogger()
         self.loop_communicator = LoopCommunicator()
         self.data_exchanger = DataExchanger(None, self.loop_communicator)
 
-        host = environment_reader.host(default='learning-loop.ai')
-        self.ws_url = f'ws{"s" if "learning-loop.ai" in host else ""}://' + host
+        loop_url = environment_reader.host(default='learning-loop.ai')
+        self.websocket_url = f'ws{"s" if "learning-loop.ai" in loop_url else ""}://' + loop_url
 
-        self.name = name
-        self.uuid = self.read_or_create_uuid(self.name) if uuid is None else uuid
         self.startup_time = datetime.now()
         self._sio_client: Optional[AsyncClient] = None
         self.status = NodeStatus(id=self.uuid, name=self.name)
-        # NOTE this is can be set to False for Nodes which do not need to authenticate with the backend (like the DetectorNode)
-        self.needs_login = True
-        self._setup_sio_headers()
+
+        self.sio_headers = {'organization': self.loop_communicator.organization,
+                            'project': self.loop_communicator.project,
+                            'nodeType': self.get_node_type()}
         self._register_lifecycle_events()
 
     @property
@@ -82,11 +86,6 @@ def read_or_create_uuid(self, identifier: str) -> str:
                 json.dump(uuids, f)
         return uuid
 
-    def _setup_sio_headers(self) -> None:
-        self.sio_headers = {'organization': self.loop_communicator.organization,
-                            'project': self.loop_communicator.project,
-                            'nodeType': self.get_node_type()}
-
     # --------------------------------------------------- APPLICATION LIFECYCLE ---------------------------------------------------
 
     def _register_lifecycle_events(self):
@@ -176,14 +175,14 @@ async def connect_sio(self):
         except Exception:
             pass
 
-        self.log.info(f'(re)connecting to Learning Loop at {self.ws_url}')
+        self.log.info(f'(re)connecting to Learning Loop at {self.websocket_url}')
         try:
-            await self.sio_client.connect(f"{self.ws_url}", headers=self.sio_headers, socketio_path="/ws/socket.io")
+            await self.sio_client.connect(f"{self.websocket_url}", headers=self.sio_headers, socketio_path="/ws/socket.io")
             self.log.info('connected to Learning Loop')
         except socketio.exceptions.ConnectionError:  # type: ignore
             self.log.warning('connection error')
         except Exception:
-            self.log.exception(f'error while connecting to "{self.ws_url}". Exception:')
+            self.log.exception(f'error while connecting to "{self.websocket_url}". Exception:')
 
     async def _update_send_state(self, state: NodeState):
         self.status.state = state
diff --git a/learning_loop_node/tests/test_downloader.py b/learning_loop_node/tests/test_downloader.py
index bf2e10e8..7b2143d1 100644
--- a/learning_loop_node/tests/test_downloader.py
+++ b/learning_loop_node/tests/test_downloader.py
@@ -1,3 +1,4 @@
+from ..helpers.misc import delete_corrupt_images
 import os
 import shutil
 
@@ -33,26 +34,26 @@ async def test_download_model(data_exchanger: DataExchanger):
 
 # pylint: disable=redefined-outer-name
 async def test_fetching_image_ids(data_exchanger: DataExchanger):
-    ids = await data_exchanger.fetch_image_ids()
+    ids = await data_exchanger.fetch_image_uuids()
     assert len(ids) == 3
 
 
 async def test_download_images(data_exchanger: DataExchanger):
     _, image_folder, _ = test_helper.create_needed_folders()
-    image_ids = await data_exchanger.fetch_image_ids()
+    image_ids = await data_exchanger.fetch_image_uuids()
     await data_exchanger.download_images(image_ids, image_folder)
     files = test_helper.get_files_in_folder(GLOBALS.data_folder)
     assert len(files) == 3
 
 
 async def test_download_training_data(data_exchanger: DataExchanger):
-    image_ids = await data_exchanger.fetch_image_ids()
+    image_ids = await data_exchanger.fetch_image_uuids()
     image_data = await data_exchanger.download_images_data(image_ids)
     assert len(image_data) == 3
 
 
 async def test_removal_of_corrupted_images(data_exchanger: DataExchanger):
-    image_ids = await data_exchanger.fetch_image_ids()
+    image_ids = await data_exchanger.fetch_image_uuids()
 
     shutil.rmtree('/tmp/img_folder', ignore_errors=True)
     os.makedirs('/tmp/img_folder', exist_ok=True)
@@ -65,7 +66,7 @@ async def test_removal_of_corrupted_images(data_exchanger: DataExchanger):
     with open('/tmp/img_folder/c1.jpg', 'w') as f:
         f.write('I am no image')
 
-    await data_exchanger.delete_corrupt_images('/tmp/img_folder')
+    await delete_corrupt_images('/tmp/img_folder', True)
 
     assert len(os.listdir('/tmp/img_folder')) == num_images if check_jpeg else num_images - 1
     shutil.rmtree('/tmp/img_folder', ignore_errors=True)
diff --git a/learning_loop_node/trainer/downloader.py b/learning_loop_node/trainer/downloader.py
index 94cd0516..7deb59cf 100644
--- a/learning_loop_node/trainer/downloader.py
+++ b/learning_loop_node/trainer/downloader.py
@@ -12,7 +12,7 @@ def __init__(self, data_exchanger: DataExchanger, data_query_params: Optional[st
         self.data_exchanger = data_exchanger
 
     async def download_training_data(self, image_folder: str) -> Tuple[List[Dict], int]:
-        image_ids = await self.data_exchanger.fetch_image_ids(query_params=self.data_query_params)
+        image_ids = await self.data_exchanger.fetch_image_uuids(query_params=self.data_query_params)
         image_data, skipped_image_count = await self.download_images_and_annotations(image_ids, image_folder)
         return (image_data, skipped_image_count)
 
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 1b11b4e3..06eac0aa 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -19,7 +19,7 @@
 
 from ..data_classes import (BasicModel, Category, Context, Detections, Errors, Hyperparameter, ModelInformation,
                             PretrainedModel, Training, TrainingData, TrainingError, TrainingState)
-from ..helpers.misc import create_image_folder
+from ..helpers.misc import create_image_folder, delete_corrupt_images, is_valid_uuid4
 from ..node import Node
 from . import training_syncronizer
 from .downloader import TrainingsDownloader
@@ -30,14 +30,6 @@
     from .trainer_node import TrainerNode
 
 
-def is_valid_uuid4(val):
-    try:
-        _ = UUID(str(val)).version
-        return True
-    except ValueError:
-        return False
-
-
 class TrainerLogic():
 
     def __init__(self, model_format: str) -> None:
@@ -371,7 +363,7 @@ async def _upload_model_return_new_id(self, context: Context) -> Optional[str]:
             # model.json was mandatory in previous versions. Now its forbidden to provide an own model.json file.
             assert not any(f for f in _files if 'model.json' in f), "Upload 'model.json' not allowed (added automatically)."
             _files.append(model_json_path)
-            new_id = await self.node.data_exchanger.upload_model_for_training(context, _files, self.training.training_number, file_format)
+            new_id = await self.node.data_exchanger.upload_model_get_uuid(context, _files, self.training.training_number, file_format)
             if new_id is None:
                 return None
 
@@ -420,12 +412,12 @@ async def _do_detections(self) -> None:
         for state, p in zip(['inbox', 'annotate', 'review', 'complete'], [0.1, 0.2, 0.3, 0.4]):
             self.detection_progress = p
             logging.info(f'fetching image ids of {state}')
-            new_ids = await self.node.data_exchanger.fetch_image_ids(query_params=f'state={state}')
+            new_ids = await self.node.data_exchanger.fetch_image_uuids(query_params=f'state={state}')
             image_ids += new_ids
             logging.info(f'downloading {len(new_ids)} images')
             await self.node.data_exchanger.download_images(new_ids, image_folder)
         self.detection_progress = 0.42
-        await self.node.data_exchanger.delete_corrupt_images(image_folder)
+        # await delete_corrupt_images(image_folder)
 
         images = await asyncio.get_event_loop().run_in_executor(None, TrainerLogic.images_for_ids, image_ids, image_folder)
         num_images = len(images)

From edb451077f0c135ee5e5a64ebcc204ac8f58ccf4 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 6 Mar 2024 15:36:22 +0100
Subject: [PATCH 02/62] proceed with cleanup, refactoring. Use lifespan
 contextmanager

---
 learning_loop_node/data_exchanger.py         | 14 +++-
 learning_loop_node/detector/detector_node.py |  2 +-
 learning_loop_node/helpers/misc.py           | 21 ++++++
 learning_loop_node/loop_communication.py     |  9 ++-
 learning_loop_node/node.py                   | 70 ++++++++------------
 5 files changed, 66 insertions(+), 50 deletions(-)

diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py
index 361f66cf..6bb30e6d 100644
--- a/learning_loop_node/data_exchanger.py
+++ b/learning_loop_node/data_exchanger.py
@@ -8,7 +8,7 @@
 from io import BytesIO
 from typing import Dict, List, Optional
 
-import aiofiles
+import aiofiles  # type: ignore
 
 from .data_classes import Context
 from .helpers.misc import create_resource_paths, create_task, is_valid_image
@@ -25,6 +25,15 @@ def __init__(self, cause: str, *args: object) -> None:
 class DataExchanger():
 
     def __init__(self, context: Optional[Context], loop_communicator: LoopCommunicator):
+        """Exchanges data with the learning loop via the loop_communicator (rest api).
+
+        Args:
+            context (Optional[Context]): The context of the node. This is the organization and project name.
+            loop_communicator (LoopCommunicator): The loop_communicator to use for communication with the learning loop.
+
+        Note:
+            The context can be set later with the set_context method.
+        """
         self.set_context(context)
         self.progress = 0.0
         self.loop_communicator = loop_communicator
@@ -85,7 +94,7 @@ async def download_images(self, image_uuids: List[str], image_folder: str, chunk
         existing_uuids = {os.path.splitext(os.path.basename(image))[0] for image in glob(f'{image_folder}/*.jpg')}
         new_image_uuids = [id for id in image_uuids if id not in existing_uuids]
 
-        paths, ids = create_resource_paths(self.context.organization, self.context.project, new_image_uuids)
+        paths, _ = create_resource_paths(self.context.organization, self.context.project, new_image_uuids)
         num_image_ids = len(image_uuids)
         os.makedirs(image_folder, exist_ok=True)
 
@@ -128,7 +137,6 @@ async def download_model(self, target_folder: str, context: Context, model_id: s
             logging.exception(f'Error during downloading model {path}:')
             raise
 
-        # unzip and place downloaded model
         tmp_path = f'/tmp/{os.path.splitext(provided_filename)[0]}'
         shutil.rmtree(tmp_path, ignore_errors=True)
         with zipfile.ZipFile(BytesIO(content), 'r') as zip_:
diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py
index 785a10fe..7a19142b 100644
--- a/learning_loop_node/detector/detector_node.py
+++ b/learning_loop_node/detector/detector_node.py
@@ -256,7 +256,7 @@ async def send_status(self) -> Union[str, Literal[False]]:
             name=self.name,
             state=self.status.state,
             errors=self.status.errors,
-            uptime=int((datetime.now() - self.startup_time).total_seconds()),
+            uptime=int((datetime.now() - self.startup_datetime).total_seconds()),
             operation_mode=self.operation_mode,
             current_model=current_model,
             target_model=self.target_model,
diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py
index b7e7d18f..81cfc284 100644
--- a/learning_loop_node/helpers/misc.py
+++ b/learning_loop_node/helpers/misc.py
@@ -1,4 +1,6 @@
 """original copied from https://quantlane.com/blog/ensure-asyncio-task-exceptions-get-logged/"""
+import json
+from uuid import uuid4
 import asyncio
 import functools
 import logging
@@ -11,6 +13,7 @@
 import pynvml
 
 from ..data_classes import SocketResponse
+from ..globals import GLOBALS
 
 T = TypeVar('T')
 
@@ -102,6 +105,24 @@ def create_image_folder(project_folder: str) -> str:
     return image_folder
 
 
+def read_or_create_uuid(identifier: str) -> str:
+    identifier = identifier.lower().replace(' ', '_')
+    uuids = {}
+    os.makedirs(GLOBALS.data_folder, exist_ok=True)
+    file_path = f'{GLOBALS.data_folder}/uuids.json'
+    if os.path.exists(file_path):
+        with open(file_path, 'r') as f:
+            uuids = json.load(f)
+
+    uuid = uuids.get(identifier, None)
+    if not uuid:
+        uuid = str(uuid4())
+        uuids[identifier] = uuid
+        with open(file_path, 'w') as f:
+            json.dump(uuids, f)
+    return uuid
+
+
 def ensure_socket_response(func):
     """Decorator to ensure that the return value of a socket.io event handler is a SocketResponse.
 
diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index d4b3dadf..9ba7519b 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -27,9 +27,8 @@ def __init__(self) -> None:
 
         logging.info(f'Loop interface initialized with base_url: {self.base_url} / user: {self.username}')
 
-    # @property
-    # def project_path(self):  # TODO: remove?
-    #     return f'/{self.organization}/projects/{self.project}'
+    def websocket_url(self) -> str:
+        return f'ws{"s" if "learning-loop.ai" in self.host else ""}://' + self.host
 
     async def ensure_login(self) -> None:
         """aiohttp client session needs to be created on the event loop"""
@@ -75,12 +74,12 @@ async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/
             await self.ensure_login()
         return await self.async_client.get(api_prefix+path)
 
-    async def put(self, path, files: Optional[List[str]]=None, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response:
+    async def put(self, path, files: Optional[List[str]] = None, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response:
         if requires_login:
             await self.ensure_login()
         if files is None:
             return await self.async_client.put(api_prefix+path, **kwargs)
-        
+
         file_list = [('files', open(f, 'rb')) for f in files]  # TODO: does this properly close the files after upload?
         return await self.async_client.put(api_prefix+path, files=file_list)
 
diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py
index 00e0313b..ba9fe464 100644
--- a/learning_loop_node/node.py
+++ b/learning_loop_node/node.py
@@ -1,12 +1,11 @@
 import asyncio
-import json
 import logging
 import os
 import sys
 from abc import abstractmethod
+from contextlib import asynccontextmanager
 from datetime import datetime
 from typing import Optional
-from uuid import uuid4
 
 import aiohttp
 import socketio
@@ -17,8 +16,8 @@
 from .data_classes import Context, NodeState, NodeStatus
 from .data_exchanger import DataExchanger
 from .globals import GLOBALS
-from .helpers import environment_reader, log_conf
-from .helpers.misc import ensure_socket_response
+from .helpers import log_conf
+from .helpers.misc import ensure_socket_response, read_or_create_uuid
 from .loop_communication import LoopCommunicator
 
 
@@ -35,28 +34,29 @@ def __init__(self, name: str, uuid: Optional[str] = None, needs_login: bool = Tr
             needs_login (bool): If True, the node will try to login to the learning loop.
         """
 
-        super().__init__()
+        super().__init__(lifespan=self.lifespan)
         log_conf.init()
 
         self.name = name
-        self.uuid = uuid or self.read_or_create_uuid(self.name)
+        self.uuid = uuid or read_or_create_uuid(self.name)
         self.needs_login = needs_login
 
         self.log = logging.getLogger()
         self.loop_communicator = LoopCommunicator()
+        self.websocket_url = self.loop_communicator.websocket_url()
         self.data_exchanger = DataExchanger(None, self.loop_communicator)
 
-        loop_url = environment_reader.host(default='learning-loop.ai')
-        self.websocket_url = f'ws{"s" if "learning-loop.ai" in loop_url else ""}://' + loop_url
-
-        self.startup_time = datetime.now()
+        self.startup_datetime = datetime.now()
         self._sio_client: Optional[AsyncClient] = None
         self.status = NodeStatus(id=self.uuid, name=self.name)
 
         self.sio_headers = {'organization': self.loop_communicator.organization,
                             'project': self.loop_communicator.project,
                             'nodeType': self.get_node_type()}
-        self._register_lifecycle_events()
+
+        @repeat_every(seconds=5, raise_exceptions=False, wait_first=False)
+        async def ensure_connected() -> None:
+            await self._on_repeat()
 
     @property
     def sio_client(self) -> AsyncClient:
@@ -67,40 +67,27 @@ def sio_client(self) -> AsyncClient:
     def sio_is_initialized(self) -> bool:
         return self._sio_client is not None
 
-    # --------------------------------------------------- INIT ---------------------------------------------------
-
-    def read_or_create_uuid(self, identifier: str) -> str:
-        identifier = identifier.lower().replace(' ', '_')
-        uuids = {}
-        os.makedirs(GLOBALS.data_folder, exist_ok=True)
-        file_path = f'{GLOBALS.data_folder}/uuids.json'
-        if os.path.exists(file_path):
-            with open(file_path, 'r') as f:
-                uuids = json.load(f)
-
-        uuid = uuids.get(identifier, None)
-        if not uuid:
-            uuid = str(uuid4())
-            uuids[identifier] = uuid
-            with open(file_path, 'w') as f:
-                json.dump(uuids, f)
-        return uuid
-
     # --------------------------------------------------- APPLICATION LIFECYCLE ---------------------------------------------------
 
-    def _register_lifecycle_events(self):
-        @self.on_event("startup")
-        async def startup():
-            await self._on_startup()
+    @asynccontextmanager
+    async def lifespan(self, app: FastAPI):
+        await self.on_startup()
+        yield
+        await self.on_shutdown()
 
-        @self.on_event("shutdown")  # NOTE only used for developent ?!
-        async def shutdown():
-            await self._on_shutdown()
+    # def _register_lifecycle_events(self):
+    #     @self.on_event("startup")
+    #     async def startup():
+    #         await self._on_startup()
 
-        @self.on_event("startup")
-        @repeat_every(seconds=5, raise_exceptions=False, wait_first=False)
-        async def ensure_connected() -> None:
-            await self._on_repeat()
+    #     @self.on_event("shutdown")  # NOTE only used for developent ?!
+    #     async def shutdown():
+    #         await self._on_shutdown()
+
+    #     @self.on_event("startup")
+    #     @repeat_every(seconds=5, raise_exceptions=False, wait_first=False)
+    #     async def ensure_connected() -> None:
+    #         await self._on_repeat()
 
     async def _on_startup(self):
         self.log.info('received "startup" lifecycle-event')
@@ -122,6 +109,7 @@ async def _on_shutdown(self):
         self.log.info('successfully disconnected from loop.')
         await self.on_shutdown()
 
+    @repeat_every(seconds=5, raise_exceptions=False, wait_first=False)
     async def _on_repeat(self):
         while not self.sio_is_initialized():
             self.log.info('Waiting for sio client to be initialized')

From fda0bdbaa4705ff0ae375aa852855feac33b9b38 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 6 Mar 2024 15:38:18 +0100
Subject: [PATCH 03/62] remove mock converter

---
 mock_converter.dockerfile                     |  7 ---
 mock_converter/app_code/__init__.py           |  0
 mock_converter/app_code/backdoor_controls.py  | 55 -------------------
 .../app_code/mock_converter_logic.py          | 18 ------
 mock_converter/app_code/restart/restart.py    |  5 --
 mock_converter/app_code/tests/.gitkeep        |  0
 mock_converter/app_code/tests/test_dummy.py   |  2 -
 mock_converter/main.py                        | 24 --------
 mock_converter/pytest.ini                     |  8 ---
 mock_converter/start.sh                       |  3 -
 10 files changed, 122 deletions(-)
 delete mode 100644 mock_converter.dockerfile
 delete mode 100644 mock_converter/app_code/__init__.py
 delete mode 100644 mock_converter/app_code/backdoor_controls.py
 delete mode 100644 mock_converter/app_code/mock_converter_logic.py
 delete mode 100644 mock_converter/app_code/restart/restart.py
 delete mode 100644 mock_converter/app_code/tests/.gitkeep
 delete mode 100644 mock_converter/app_code/tests/test_dummy.py
 delete mode 100644 mock_converter/main.py
 delete mode 100644 mock_converter/pytest.ini
 delete mode 100755 mock_converter/start.sh

diff --git a/mock_converter.dockerfile b/mock_converter.dockerfile
deleted file mode 100644
index 42c883c8..00000000
--- a/mock_converter.dockerfile
+++ /dev/null
@@ -1,7 +0,0 @@
-FROM base_node:latest
-
-COPY ./mock_converter/ /app
-ENV PYTHONPATH "${PYTHONPATH}:/app:/usr/local/lib/python3.11/site-packages:/learning_loop_node/learning_loop_node"
-ENV TZ=Europe/Amsterdam
-
-EXPOSE 80
diff --git a/mock_converter/app_code/__init__.py b/mock_converter/app_code/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/mock_converter/app_code/backdoor_controls.py b/mock_converter/app_code/backdoor_controls.py
deleted file mode 100644
index 6472d4b2..00000000
--- a/mock_converter/app_code/backdoor_controls.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""These restful endpoints are only to be used for testing purposes and are not part of the 'offical' trainer behavior."""
-
-import asyncio
-import logging
-
-from fastapi import APIRouter, HTTPException, Request
-
-from learning_loop_node.data_classes import NodeState
-
-router = APIRouter()
-
-
-@router.put("/socketio")
-async def put_socketio(request: Request):
-    '''
-    Example Usage
-
-        curl -X PUT -d "on" http://localhost:8005/socketio
-    '''
-    state = str(await request.body(), 'utf-8')
-    if state == 'off':
-        if request.app.status.state != NodeState.Offline:
-            logging.info('turning socketio off')
-            asyncio.create_task(request.app.sio.disconnect())
-    if state == 'on':
-        if request.app.status.state == NodeState.Offline:
-            logging.info('turning socketio on')
-            asyncio.create_task(request.app.connect())
-
-
-@router.put("/check_state")
-async def put_check_state(request: Request):
-    value = str(await request.body(), 'utf-8')
-    print(f'turning automatically check_state {value}', flush=True)
-
-    if value == 'off':
-        request.app.skip_check_state = True
-        for _ in range(5):
-            if request.app.status.state != NodeState.Idle:
-                await asyncio.sleep(0.5)
-            else:
-                break
-        if request.app.status.state != NodeState.Idle:
-            raise HTTPException(status_code=409, detail="Could not skip auto checking. State is still not idle")
-
-    if value == 'on':
-        request.app.skip_check_state = False
-
-
-@router.post("/step")
-async def add_steps(request: Request):
-    if request.app.status.state == NodeState.Running:
-        raise HTTPException(status_code=409, detail="converter is already running")
-
-    await request.app.check_state()
diff --git a/mock_converter/app_code/mock_converter_logic.py b/mock_converter/app_code/mock_converter_logic.py
deleted file mode 100644
index 7fc68579..00000000
--- a/mock_converter/app_code/mock_converter_logic.py
+++ /dev/null
@@ -1,18 +0,0 @@
-
-import asyncio
-from typing import List
-
-from learning_loop_node.converter.converter_logic import ConverterLogic
-from learning_loop_node.data_classes import ModelInformation
-
-
-class MockConverterLogic(ConverterLogic):
-
-    async def _convert(self, model_information: ModelInformation) -> None:
-        await asyncio.sleep(1)
-
-    def get_converted_files(self, model_id: str) -> List[str]:
-        fake_converted_file = '/tmp/converted_weightfile.converted'
-        with open(fake_converted_file, 'wb') as f:
-            f.write(b'\x42')
-        return [fake_converted_file]
diff --git a/mock_converter/app_code/restart/restart.py b/mock_converter/app_code/restart/restart.py
deleted file mode 100644
index f7203baa..00000000
--- a/mock_converter/app_code/restart/restart.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# add 'reload_dirs=['./app_code/restart'] to uvicorn call in main.py
-# save this file to trigger uvicorn restart
-
-
-# TODO raus nehmen
diff --git a/mock_converter/app_code/tests/.gitkeep b/mock_converter/app_code/tests/.gitkeep
deleted file mode 100644
index e69de29b..00000000
diff --git a/mock_converter/app_code/tests/test_dummy.py b/mock_converter/app_code/tests/test_dummy.py
deleted file mode 100644
index 1f00624b..00000000
--- a/mock_converter/app_code/tests/test_dummy.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def test_always_succeed_to_ensure_ci_of_loop_will_not_fail():
-    assert True
diff --git a/mock_converter/main.py b/mock_converter/main.py
deleted file mode 100644
index b8bdb907..00000000
--- a/mock_converter/main.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import logging
-import os
-
-import uvicorn
-from app_code import backdoor_controls
-from app_code.mock_converter_logic import MockConverterLogic
-
-from learning_loop_node.converter.converter_node import ConverterNode
-
-logging.basicConfig(level=logging.DEBUG)
-
-mock_converter = MockConverterLogic(source_format='mocked', target_format='mocked_converted')
-node = ConverterNode(uuid='85ef1a58-308d-4c80-8931-43d1f752f4f3', name='mocked converter', converter=mock_converter)
-node.skip_check_state = True  # do not check states auotmatically for this mock
-
-# setting up backdoor_controls
-node.include_router(backdoor_controls.router, prefix="")
-
-
-if __name__ == "__main__":
-    reload_dirs = ['./app_code/restart'] if os.environ.get('MANUAL_RESTART', None) \
-        else ['./app_code', './learning-loop-node', '/usr/local/lib/python3.11/site-packages/learning_loop_node']
-    uvicorn.run("main:node", host="0.0.0.0", port=80, lifespan='on',
-                reload=True, use_colors=True, reload_dirs=reload_dirs)
diff --git a/mock_converter/pytest.ini b/mock_converter/pytest.ini
deleted file mode 100644
index 0d20a612..00000000
--- a/mock_converter/pytest.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-[pytest]
-# NOTE: changing default location of pytest_cache because the uvicorn file watcher somehow triggered to many reloads
-cache_dir = /tmp/pytest_cache 
-python_files = test_*.py
-asyncio_mode = auto
-
-testpaths = tests
-    
\ No newline at end of file
diff --git a/mock_converter/start.sh b/mock_converter/start.sh
deleted file mode 100755
index 125eee97..00000000
--- a/mock_converter/start.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/bin/env bash
-
-uvicorn main:node --host 0.0.0.0 --port 80 --reload --lifespan on --reload-dir /app
\ No newline at end of file

From 4925fe36d32b269cdb3ce57db11c50e4db000b2b Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 6 Mar 2024 15:39:53 +0100
Subject: [PATCH 04/62] remove converter from __init__

---
 learning_loop_node/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/learning_loop_node/__init__.py b/learning_loop_node/__init__.py
index b8f0f5cd..5f4433bc 100644
--- a/learning_loop_node/__init__.py
+++ b/learning_loop_node/__init__.py
@@ -2,7 +2,6 @@
 import os
 import sys
 
-from .converter.converter_node import ConverterNode
 # from . import log_conf
 from .detector.detector_logic import DetectorLogic
 from .detector.detector_node import DetectorNode

From f238acbcc723932f1825affd68ce347401ae929c Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 6 Mar 2024 16:57:52 +0100
Subject: [PATCH 05/62] change deprecated way of app lifecycle handling

---
 learning_loop_node/detector/detector_node.py  |  2 +
 learning_loop_node/detector/rest/about.py     |  1 +
 .../tests/test_client_communication.py        |  3 +-
 learning_loop_node/node.py                    | 49 ++++++++++---------
 learning_loop_node/tests/test_downloader.py   |  6 +--
 .../tests/states/test_state_upload_model.py   |  2 +-
 6 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py
index 7a19142b..e2c532ed 100644
--- a/learning_loop_node/detector/detector_node.py
+++ b/learning_loop_node/detector/detector_node.py
@@ -170,6 +170,8 @@ async def _upload(sid, data: Dict) -> Optional[Dict]:
         def _connect(sid, environ, auth) -> None:
             self.connected_clients.append(sid)
 
+        print('>>>>>>>>>>>>>>>>>>>>>>> setting up sio server', flush=True)
+
         self.sio_server = SocketManager(app=self)
         self.sio_server.on('detect', _detect)
         self.sio_server.on('info', _info)
diff --git a/learning_loop_node/detector/rest/about.py b/learning_loop_node/detector/rest/about.py
index c464b999..9f1e407e 100644
--- a/learning_loop_node/detector/rest/about.py
+++ b/learning_loop_node/detector/rest/about.py
@@ -16,6 +16,7 @@ async def get_about(request: Request):
         curl http://localhost/about
     '''
     app: 'DetectorNode' = request.app
+
     return {
         'operation_mode': app.operation_mode.value,
         'state': app.status.state,
diff --git a/learning_loop_node/detector/tests/test_client_communication.py b/learning_loop_node/detector/tests/test_client_communication.py
index be3d2d4b..16f0fa6b 100644
--- a/learning_loop_node/detector/tests/test_client_communication.py
+++ b/learning_loop_node/detector/tests/test_client_communication.py
@@ -5,7 +5,7 @@
 import requests
 
 from learning_loop_node import DetectorNode
-from learning_loop_node.data_classes import Category, ModelInformation
+from learning_loop_node.data_classes import ModelInformation
 from learning_loop_node.detector.tests.conftest import get_outbox_files
 from learning_loop_node.globals import GLOBALS
 
@@ -94,6 +94,7 @@ async def test_about_endpoint(test_detector_node: DetectorNode):
 
     assert response.status_code == 200
     response_dict = json.loads(response.content)
+    assert response_dict['model_info']
     model_information = ModelInformation.from_dict(response_dict['model_info'])
 
     assert response_dict['operation_mode'] == 'idle'
diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py
index ba9fe464..bbf7bd77 100644
--- a/learning_loop_node/node.py
+++ b/learning_loop_node/node.py
@@ -5,12 +5,11 @@
 from abc import abstractmethod
 from contextlib import asynccontextmanager
 from datetime import datetime
-from typing import Optional
+from typing import Any, Optional
 
 import aiohttp
 import socketio
 from fastapi import FastAPI
-from fastapi_utils.tasks import repeat_every
 from socketio import AsyncClient
 
 from .data_classes import Context, NodeState, NodeStatus
@@ -54,9 +53,7 @@ def __init__(self, name: str, uuid: Optional[str] = None, needs_login: bool = Tr
                             'project': self.loop_communicator.project,
                             'nodeType': self.get_node_type()}
 
-        @repeat_every(seconds=5, raise_exceptions=False, wait_first=False)
-        async def ensure_connected() -> None:
-            await self._on_repeat()
+        self.repeat_task: Any = None
 
     @property
     def sio_client(self) -> AsyncClient:
@@ -68,26 +65,21 @@ def sio_is_initialized(self) -> bool:
         return self._sio_client is not None
 
     # --------------------------------------------------- APPLICATION LIFECYCLE ---------------------------------------------------
-
     @asynccontextmanager
     async def lifespan(self, app: FastAPI):
-        await self.on_startup()
-        yield
-        await self.on_shutdown()
-
-    # def _register_lifecycle_events(self):
-    #     @self.on_event("startup")
-    #     async def startup():
-    #         await self._on_startup()
-
-    #     @self.on_event("shutdown")  # NOTE only used for developent ?!
-    #     async def shutdown():
-    #         await self._on_shutdown()
 
-    #     @self.on_event("startup")
-    #     @repeat_every(seconds=5, raise_exceptions=False, wait_first=False)
-    #     async def ensure_connected() -> None:
-    #         await self._on_repeat()
+        try:
+            self.repeat_task = asyncio.create_task(self.repeat_loop())
+            await self._on_startup()
+            yield
+        finally:
+            await self._on_shutdown()
+            if self.repeat_task is not None:
+                self.repeat_task.cancel()
+                try:
+                    await self.repeat_task
+                except asyncio.CancelledError:
+                    pass
 
     async def _on_startup(self):
         self.log.info('received "startup" lifecycle-event')
@@ -109,8 +101,19 @@ async def _on_shutdown(self):
         self.log.info('successfully disconnected from loop.')
         await self.on_shutdown()
 
-    @repeat_every(seconds=5, raise_exceptions=False, wait_first=False)
+    async def repeat_loop(self) -> None:
+        while True:
+            try:
+                await self._on_repeat()
+            except asyncio.CancelledError:
+                return
+            except Exception as e:
+                self.log.exception(f'error in repeat loop: {e}')
+            await asyncio.sleep(5)
+
     async def _on_repeat(self):
+        print('received "repeat" lifecycle-event', flush=True)
+        logging.info('received "repeat" lifecycle-event')
         while not self.sio_is_initialized():
             self.log.info('Waiting for sio client to be initialized')
             await asyncio.sleep(1)
diff --git a/learning_loop_node/tests/test_downloader.py b/learning_loop_node/tests/test_downloader.py
index 7b2143d1..43ee4c6f 100644
--- a/learning_loop_node/tests/test_downloader.py
+++ b/learning_loop_node/tests/test_downloader.py
@@ -1,11 +1,11 @@
-from ..helpers.misc import delete_corrupt_images
 import os
 import shutil
 
 from learning_loop_node.data_classes import Context
-from learning_loop_node.data_exchanger import DataExchanger, check_jpeg
+from learning_loop_node.data_exchanger import DataExchanger
 from learning_loop_node.globals import GLOBALS
 
+from ..helpers.misc import delete_corrupt_images
 from . import test_helper
 
 
@@ -68,5 +68,5 @@ async def test_removal_of_corrupted_images(data_exchanger: DataExchanger):
 
     await delete_corrupt_images('/tmp/img_folder', True)
 
-    assert len(os.listdir('/tmp/img_folder')) == num_images if check_jpeg else num_images - 1
+    assert len(os.listdir('/tmp/img_folder')) == num_images if data_exchanger.check_jpeg else num_images - 1
     shutil.rmtree('/tmp/img_folder', ignore_errors=True)
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
index 05eaa8ed..41a5a4a8 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
@@ -84,4 +84,4 @@ async def test_mock_loop_response_example(mocker: MockerFixture, test_initialize
 
 
 def mock_upload_model_for_training(mocker, return_value):
-    mocker.patch('learning_loop_node.data_exchanger.DataExchanger.upload_model_for_training', return_value=return_value)
+    mocker.patch('learning_loop_node.data_exchanger.DataExchanger.upload_model_get_uuid', return_value=return_value)

From 421f529640f553ef388e27760b73292334a7b1b1 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 6 Mar 2024 17:54:19 +0100
Subject: [PATCH 06/62] remove unused states from node base class

---
 .../annotation/annotator_node.py              |  9 +-
 learning_loop_node/data_classes/general.py    |  2 +-
 learning_loop_node/detector/detector_node.py  |  4 +-
 learning_loop_node/helpers/misc.py            | 69 +++++++++++++-
 learning_loop_node/loop_communication.py      |  2 +-
 learning_loop_node/node.py                    | 95 ++++---------------
 learning_loop_node/tests/test_helper.py       |  4 +-
 learning_loop_node/trainer/trainer_logic.py   | 14 +--
 learning_loop_node/trainer/trainer_node.py    |  5 -
 .../app_code/tests/test_detections.py         |  4 +-
 10 files changed, 98 insertions(+), 110 deletions(-)

diff --git a/learning_loop_node/annotation/annotator_node.py b/learning_loop_node/annotation/annotator_node.py
index b1781b73..474f28e6 100644
--- a/learning_loop_node/annotation/annotator_node.py
+++ b/learning_loop_node/annotation/annotator_node.py
@@ -8,7 +8,7 @@
 from ..data_classes import AnnotationNodeStatus, Context, NodeState, UserInput
 from ..data_classes.socket_response import SocketResponse
 from ..data_exchanger import DataExchanger
-from ..helpers.misc import create_image_folder
+from ..helpers.misc import create_image_folder, create_project_folder
 from ..node import Node
 from .annotator_logic import AnnotatorLogic
 
@@ -50,8 +50,6 @@ async def _handle_user_input(self, user_input_dict: Dict) -> str:
             raise
 
         if tool_result.annotation:
-            if not self.sio_is_initialized():
-                raise Exception('Socket client waas not initialized')
             await self.sio_client.call('update_segmentation_annotation', (user_input.data.context.organization,
                                                                           user_input.data.context.project,
                                                                           jsonable_encoder(asdict(tool_result.annotation))), timeout=30)
@@ -85,15 +83,12 @@ async def send_status(self):
             self.log.error(f'Error for updating: Response from loop was : {asdict(response)}')
 
     async def download_image(self, context: Context, uuid: str):
-        project_folder = Node.create_project_folder(context)
+        project_folder = create_project_folder(context)
         images_folder = create_image_folder(project_folder)
 
         downloader = DataExchanger(context=context, loop_communicator=self.loop_communicator)
         await downloader.download_images([uuid], images_folder)
 
-    async def get_state(self):
-        return NodeState.Online
-
     def get_node_type(self):
         return 'annotation_node'
 
diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py
index 8404ab22..9d5c893e 100644
--- a/learning_loop_node/data_classes/general.py
+++ b/learning_loop_node/data_classes/general.py
@@ -121,7 +121,7 @@ class NodeState(str, Enum):
 class NodeStatus():
     id: str
     name: str
-    state: Optional[NodeState] = NodeState.Offline
+    state: Optional[NodeState] = NodeState.Online
     uptime: Optional[int] = 0
     errors: Dict = field(default_factory=dict)
     capabilities: List[str] = field(default_factory=list)
diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py
index e2c532ed..00271f64 100644
--- a/learning_loop_node/detector/detector_node.py
+++ b/learning_loop_node/detector/detector_node.py
@@ -274,13 +274,11 @@ async def send_status(self) -> Union[str, Literal[False]]:
             return False
 
         assert socket_response.payload is not None
+        # TODO This is weird because target_model_version is stored in self and target_model_id is returned
         self.target_model = socket_response.payload['target_model_version']
         self.log.info(f'After sending status. Target_model is {self.target_model}')
         return socket_response.payload['target_model_id']
 
-    async def get_state(self):
-        return NodeState.Online  # NOTE At the moment only trainer-nodes use a meaningful state
-
     async def set_operation_mode(self, mode: OperationMode):
         self.operation_mode = mode
         await self.send_status()
diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py
index 81cfc284..1f2e297d 100644
--- a/learning_loop_node/helpers/misc.py
+++ b/learning_loop_node/helpers/misc.py
@@ -1,18 +1,20 @@
 """original copied from https://quantlane.com/blog/ensure-asyncio-task-exceptions-get-logged/"""
-import json
-from uuid import uuid4
 import asyncio
 import functools
+import json
 import logging
 import os
+import shutil
+import sys
 from dataclasses import asdict
 from glob import glob
+from time import perf_counter
 from typing import Any, Coroutine, List, Optional, Tuple, TypeVar
-from uuid import UUID
+from uuid import UUID, uuid4
 
 import pynvml
 
-from ..data_classes import SocketResponse
+from ..data_classes import Context, SocketResponse, Training
 from ..globals import GLOBALS
 
 T = TypeVar('T')
@@ -164,3 +166,62 @@ def is_valid_uuid4(val):
         return True
     except ValueError:
         return False
+
+
+def create_project_folder(context: Context) -> str:
+    project_folder = f'{GLOBALS.data_folder}/{context.organization}/{context.project}'
+    os.makedirs(project_folder, exist_ok=True)
+    return project_folder
+
+
+def activate_asyncio_warnings() -> None:
+    '''Produce warnings for coroutines which take too long on the main loop and hence clog the event loop'''
+    try:
+        if sys.version_info.major >= 3 and sys.version_info.minor >= 7:  # most
+            loop = asyncio.get_running_loop()
+        else:
+            loop = asyncio.get_event_loop()
+
+        loop.set_debug(True)
+        loop.slow_callback_duration = 0.2
+        logging.info('activated asyncio warnings')
+    except Exception:
+        logging.exception('could not activate asyncio warnings. Exception:')
+
+
+@staticmethod
+def images_for_ids(image_ids, image_folder) -> List[str]:
+    logging.info(f'### Going to get images for {len(image_ids)} images ids')
+    start = perf_counter()
+    images = [img for img in glob(f'{image_folder}/**/*.*', recursive=True)
+              if os.path.splitext(os.path.basename(img))[0] in image_ids]
+    end = perf_counter()
+    logging.info(f'found {len(images)} images for {len(image_ids)} image ids, which took {end-start:0.2f} seconds')
+    return images
+
+
+@staticmethod
+def generate_training(project_folder: str, context: Context) -> Training:
+    training_uuid = str(uuid4())
+    return Training(
+        id=training_uuid,
+        context=context,
+        project_folder=project_folder,
+        images_folder=create_image_folder(project_folder),
+        training_folder=create_training_folder(project_folder, training_uuid)
+    )
+
+
+@staticmethod
+def delete_all_training_folders(project_folder: str):
+    if not os.path.exists(f'{project_folder}/trainings'):
+        return
+    for uuid in os.listdir(f'{project_folder}/trainings'):
+        shutil.rmtree(f'{project_folder}/trainings/{uuid}', ignore_errors=True)
+
+
+@staticmethod
+def create_training_folder(project_folder: str, trainings_id: str) -> str:
+    training_folder = f'{project_folder}/trainings/{trainings_id}'
+    os.makedirs(training_folder, exist_ok=True)
+    return training_folder
diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index 9ba7519b..75c57189 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -50,7 +50,7 @@ async def logout(self) -> None:
             logging.info(f'Logout failed with response: {response}')
             raise LoopCommunicationException('Logout failed with response: ' + str(response))
 
-    async def get_cookies(self) -> Cookies:
+    def get_cookies(self) -> Cookies:
         return self.async_client.cookies
 
     async def shutdown(self):
diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py
index bbf7bd77..5e7adf33 100644
--- a/learning_loop_node/node.py
+++ b/learning_loop_node/node.py
@@ -1,6 +1,5 @@
 import asyncio
 import logging
-import os
 import sys
 from abc import abstractmethod
 from contextlib import asynccontextmanager
@@ -12,11 +11,10 @@
 from fastapi import FastAPI
 from socketio import AsyncClient
 
-from .data_classes import Context, NodeState, NodeStatus
+from .data_classes import NodeStatus
 from .data_exchanger import DataExchanger
-from .globals import GLOBALS
 from .helpers import log_conf
-from .helpers.misc import ensure_socket_response, read_or_create_uuid
+from .helpers.misc import activate_asyncio_warnings, ensure_socket_response, read_or_create_uuid
 from .loop_communication import LoopCommunicator
 
 
@@ -61,16 +59,12 @@ def sio_client(self) -> AsyncClient:
             raise Exception('sio_client not yet initialized')
         return self._sio_client
 
-    def sio_is_initialized(self) -> bool:
-        return self._sio_client is not None
-
     # --------------------------------------------------- APPLICATION LIFECYCLE ---------------------------------------------------
     @asynccontextmanager
     async def lifespan(self, app: FastAPI):
-
         try:
-            self.repeat_task = asyncio.create_task(self.repeat_loop())
             await self._on_startup()
+            self.repeat_task = asyncio.create_task(self.repeat_loop())
             yield
         finally:
             await self._on_shutdown()
@@ -83,7 +77,7 @@ async def lifespan(self, app: FastAPI):
 
     async def _on_startup(self):
         self.log.info('received "startup" lifecycle-event')
-        Node._activate_asyncio_warnings()
+        activate_asyncio_warnings()
         if self.needs_login:
             await self.loop_communicator.backend_ready()
             self.log.info('ensuring login')
@@ -102,6 +96,7 @@ async def _on_shutdown(self):
         await self.on_shutdown()
 
     async def repeat_loop(self) -> None:
+        """NOTE: with the lifespan approach, we cannot use @repeat_every anymore :("""
         while True:
             try:
                 await self._on_repeat()
@@ -112,11 +107,6 @@ async def repeat_loop(self) -> None:
             await asyncio.sleep(5)
 
     async def _on_repeat(self):
-        print('received "repeat" lifecycle-event', flush=True)
-        logging.info('received "repeat" lifecycle-event')
-        while not self.sio_is_initialized():
-            self.log.info('Waiting for sio client to be initialized')
-            await asyncio.sleep(1)
         if not self.sio_client.connected:
             self.log.info('Reconnecting to loop via sio')
             await self.connect_sio()
@@ -128,8 +118,11 @@ async def _on_repeat(self):
     # --------------------------------------------------- SOCKET.IO ---------------------------------------------------
 
     async def create_sio_client(self):
-        cookies = await self.loop_communicator.get_cookies()
-        self._sio_client = AsyncClient(request_timeout=20, http_session=aiohttp.ClientSession(cookies=cookies))
+        """Create a socket.io client that communicates with the learning loop and register the events.
+        Note: The method is called in startup and soft restart of detector, so the _sio_client should always be available."""
+
+        self._sio_client = AsyncClient(request_timeout=20,
+                                       http_session=aiohttp.ClientSession(cookies=self.loop_communicator.get_cookies()))
 
         # pylint: disable=protected-access
         self.sio_client._trigger_event = ensure_socket_response(self.sio_client._trigger_event)
@@ -137,30 +130,19 @@ async def create_sio_client(self):
         @self._sio_client.event
         async def connect():
             self.log.info('received "connect" via sio from loop.')
-            self.status = NodeStatus(id=self.uuid, name=self.name)
-            state = await self.get_state()
-            try:
-                await self._update_send_state(state)
-            except:
-                self.log.exception('Error sending state. Exception:')
-                raise
 
         @self._sio_client.event
         async def disconnect():
             self.log.info('received "disconnect" via sio from loop.')
-            await self._update_send_state(NodeState.Offline)
 
         @self._sio_client.event
         async def restart():
-            self.log.info('received "restart" via sio from loop.')
-            self.restart()
+            self.log.info('received "restart" via sio from loop -> restarting node.')
+            sys.exit(0)
 
         self.register_sio_events(self._sio_client)
 
     async def connect_sio(self):
-        if not self.sio_is_initialized():
-            self.log.warning('sio client not yet initialized')
-            return
         try:
             await self.sio_client.disconnect()
         except Exception:
@@ -175,30 +157,11 @@ async def connect_sio(self):
         except Exception:
             self.log.exception(f'error while connecting to "{self.websocket_url}". Exception:')
 
-    async def _update_send_state(self, state: NodeState):
-        self.status.state = state
-        if self.status.state != NodeState.Offline:
-            await self.send_status()
-
     # --------------------------------------------------- ABSTRACT METHODS ---------------------------------------------------
 
-    @abstractmethod
-    def register_sio_events(self, sio_client: AsyncClient):
-        """Register socket.io events for the communication with the learning loop.
-        The events: connect and disconnect are already registered and should not be overwritten."""
-
-    @abstractmethod
-    async def send_status(self):
-        """Send the current status to the learning loop.
-        Note that currently this method is also used to react to the response of the learning loop."""
-
-    @abstractmethod
-    async def get_state(self) -> NodeState:
-        """Return the current state of the node."""
-
     @abstractmethod
     def get_node_type(self):
-        pass
+        """Return the type of the node. This is used to register the node at the learning loop."""
 
     @abstractmethod
     async def on_startup(self):
@@ -211,32 +174,8 @@ async def on_shutdown(self):
     @abstractmethod
     async def on_repeat(self):
         """This method is called every 10 seconds."""
-    # --------------------------------------------------- SHARED FUNCTIONS ---------------------------------------------------
-
-    def restart(self):
-        """Restart the node."""
-        self.log.info('restarting node')
-        sys.exit(0)
-
-    # --------------------------------------------------- HELPER ---------------------------------------------------
 
-    @staticmethod
-    def create_project_folder(context: Context) -> str:
-        project_folder = f'{GLOBALS.data_folder}/{context.organization}/{context.project}'
-        os.makedirs(project_folder, exist_ok=True)
-        return project_folder
-
-    @staticmethod
-    def _activate_asyncio_warnings() -> None:
-        '''Produce warnings for coroutines which take too long on the main loop and hence clog the event loop'''
-        try:
-            if sys.version_info.major >= 3 and sys.version_info.minor >= 7:  # most
-                loop = asyncio.get_running_loop()
-            else:
-                loop = asyncio.get_event_loop()
-
-            loop.set_debug(True)
-            loop.slow_callback_duration = 0.2
-            logging.info('activated asyncio warnings')
-        except Exception:
-            logging.exception('could not activate asyncio warnings. Exception:')
+    @abstractmethod
+    def register_sio_events(self, sio_client: AsyncClient):
+        """Register (additional) socket.io events for the communication with the learning loop.
+        The events: connect, disconnect and restart are already registered and should not be overwritten."""
diff --git a/learning_loop_node/tests/test_helper.py b/learning_loop_node/tests/test_helper.py
index 88a94af2..1f485506 100644
--- a/learning_loop_node/tests/test_helper.py
+++ b/learning_loop_node/tests/test_helper.py
@@ -7,7 +7,7 @@
 from typing import Callable
 
 from learning_loop_node.data_classes import Context
-from learning_loop_node.helpers.misc import create_image_folder
+from learning_loop_node.helpers.misc import create_image_folder, create_project_folder
 from learning_loop_node.loop_communication import LoopCommunicator
 from learning_loop_node.node import Node
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
@@ -65,7 +65,7 @@ def _update_attribute_dict(obj: dict, **kwargs) -> None:
 
 
 def create_needed_folders(training_uuid: str = 'some_uuid'):  # pylint: disable=unused-argument
-    project_folder = Node.create_project_folder(
+    project_folder = create_project_folder(
         Context(organization='zauberzeug', project='pytest'))
     image_folder = create_image_folder(project_folder)
     training_folder = TrainerLogic.create_training_folder(project_folder, training_uuid)
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 06eac0aa..2cff2d6f 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import shutil
+import sys
 import time
 from abc import abstractmethod
 from dataclasses import asdict
@@ -10,7 +11,7 @@
 from glob import glob
 from time import perf_counter
 from typing import TYPE_CHECKING, Coroutine, Dict, List, Optional, Union
-from uuid import UUID, uuid4
+from uuid import uuid4
 
 import socketio
 from dacite import from_dict
@@ -19,7 +20,7 @@
 
 from ..data_classes import (BasicModel, Category, Context, Detections, Errors, Hyperparameter, ModelInformation,
                             PretrainedModel, Training, TrainingData, TrainingError, TrainingState)
-from ..helpers.misc import create_image_folder, delete_corrupt_images, is_valid_uuid4
+from ..helpers.misc import create_image_folder, create_project_folder, generate_training, is_valid_uuid4
 from ..node import Node
 from . import training_syncronizer
 from .downloader import TrainingsDownloader
@@ -80,11 +81,11 @@ def init_new_training(self, context: Context, details: Dict) -> None:
         Note that details needs the entries 'categories' and 'training_number'"""
 
         try:
-            project_folder = Node.create_project_folder(context)
+            project_folder = create_project_folder(context)
             if not self.keep_old_trainings:
                 # NOTE: We delete all existing training folders because they are not needed anymore.
                 TrainerLogic.delete_all_training_folders(project_folder)
-            self._training = TrainerLogic.generate_training(project_folder, context)
+            self._training = generate_training(project_folder, context)
             self._training.data = TrainingData(categories=Category.from_list(details['categories']))
             self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details)
             self._training.training_number = details['training_number']
@@ -405,7 +406,7 @@ async def _do_detections(self) -> None:
             content = json.load(f)
             model_information = from_dict(data_class=ModelInformation, data=content)
 
-        project_folder = Node.create_project_folder(context)
+        project_folder = create_project_folder(context)
         image_folder = create_image_folder(project_folder)
         self.node.data_exchanger.set_context(context)
         image_ids = []
@@ -528,8 +529,7 @@ def get_log(self) -> str:
     def may_restart(self) -> None:
         if self.restart_after_training:
             logging.info('restarting')
-            assert self._node is not None
-            self._node.restart()
+            sys.exit(0)
         else:
             logging.info('not restarting')
 
diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py
index d2ae3249..d26831a2 100644
--- a/learning_loop_node/trainer/trainer_node.py
+++ b/learning_loop_node/trainer/trainer_node.py
@@ -126,11 +126,6 @@ async def continue_run_if_incomplete(self) -> bool:
             return True
         return False
 
-    async def get_state(self):
-        if self.trainer_logic._executor is not None and self.trainer_logic._executor.is_process_running():  # pylint: disable=protected-access
-            return NodeState.Running
-        return NodeState.Idle
-
     def get_node_type(self):
         return 'trainer'
 
diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py
index 47781d3d..df6e1292 100644
--- a/mock_trainer/app_code/tests/test_detections.py
+++ b/mock_trainer/app_code/tests/test_detections.py
@@ -5,8 +5,8 @@
 
 from learning_loop_node.data_classes import Category, Context
 from learning_loop_node.globals import GLOBALS
+from learning_loop_node.helpers.misc import create_project_folder
 from learning_loop_node.loop_communication import LoopCommunicator
-from learning_loop_node.node import Node
 from learning_loop_node.tests import test_helper
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 from learning_loop_node.trainer.trainer_node import TrainerNode
@@ -32,7 +32,7 @@ async def test_all(setup_test_project1, glc: LoopCommunicator):  # pylint: disab
     trainer._node = node  # pylint: disable=protected-access
     trainer.init_new_training(context=context, details=details)
 
-    project_folder = Node.create_project_folder(context)
+    project_folder = create_project_folder(context)
     training = TrainerLogic.generate_training(project_folder, context)
     training.model_id_for_detecting = latest_model_id
     trainer._training = training  # pylint: disable=protected-access

From f38e8468cac2b9ef2290937cd0b05cf087f67df6 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 6 Mar 2024 18:12:41 +0100
Subject: [PATCH 07/62] Simplify declaration of node_type

---
 .../annotation/annotator_node.py              |  5 +-
 learning_loop_node/data_classes/__init__.py   | 16 ++---
 learning_loop_node/data_classes/training.py   |  7 +-
 learning_loop_node/detector/detector_node.py  |  8 +--
 learning_loop_node/node.py                    |  9 +--
 .../tests/states/test_state_detecting.py      |  4 +-
 .../trainer/tests/test_trainer_states.py      |  7 +-
 learning_loop_node/trainer/trainer_logic.py   | 67 ++++++++++---------
 learning_loop_node/trainer/trainer_node.py    | 61 +++++++----------
 9 files changed, 82 insertions(+), 102 deletions(-)

diff --git a/learning_loop_node/annotation/annotator_node.py b/learning_loop_node/annotation/annotator_node.py
index 474f28e6..d12bcc0f 100644
--- a/learning_loop_node/annotation/annotator_node.py
+++ b/learning_loop_node/annotation/annotator_node.py
@@ -18,7 +18,7 @@
 class AnnotatorNode(Node):
 
     def __init__(self, name: str, annotator_logic: AnnotatorLogic, uuid: Optional[str] = None):
-        super().__init__(name, uuid)
+        super().__init__(name, uuid, 'annotation_node')
         self.tool = annotator_logic
         self.histories: Dict = {}
         annotator_logic.init(self)
@@ -89,9 +89,6 @@ async def download_image(self, context: Context, uuid: str):
         downloader = DataExchanger(context=context, loop_communicator=self.loop_communicator)
         await downloader.download_images([uuid], images_folder)
 
-    def get_node_type(self):
-        return 'annotation_node'
-
     async def on_startup(self):
         pass
 
diff --git a/learning_loop_node/data_classes/__init__.py b/learning_loop_node/data_classes/__init__.py
index bc2980cd..0e0a10e9 100644
--- a/learning_loop_node/data_classes/__init__.py
+++ b/learning_loop_node/data_classes/__init__.py
@@ -1,12 +1,8 @@
-from .annotations import (AnnotationData, AnnotationEventType,
-                          SegmentationAnnotation, ToolOutput, UserInput)
-from .detections import (BoxDetection, ClassificationDetection, Detections,
-                         Observation, Point, PointDetection,
+from .annotations import AnnotationData, AnnotationEventType, SegmentationAnnotation, ToolOutput, UserInput
+from .detections import (BoxDetection, ClassificationDetection, Detections, Observation, Point, PointDetection,
                          SegmentationDetection, Shape)
-from .general import (AnnotationNodeStatus, Category, CategoryType, Context,
-                      DetectionStatus, ErrorConfiguration, ModelInformation,
-                      NodeState, NodeStatus)
+from .general import (AnnotationNodeStatus, Category, CategoryType, Context, DetectionStatus, ErrorConfiguration,
+                      ModelInformation, NodeState, NodeStatus)
 from .socket_response import SocketResponse
-from .training import (BasicModel, Errors, Hyperparameter, Model,
-                       PretrainedModel, Training, TrainingData, TrainingError,
-                       TrainingOut, TrainingState, TrainingStatus)
+from .training import (BasicModel, Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData,
+                       TrainingError, TrainingOut, TrainingStatus)
diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index 49432925..b78190ef 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -41,7 +41,8 @@ class PretrainedModel():
     description: str
 
 
-class TrainingState(str, Enum):
+class TrainerState(str, Enum):
+    Idle = 'idle'
     Initialized = 'initialized'
     Preparing = 'preparing'
     DataDownloading = 'data_downloading'
@@ -64,7 +65,7 @@ class TrainingState(str, Enum):
 class TrainingStatus():
     id: str  # TODO this must not be changed, but tests wont detect it -> update tests!
     name: str
-    state: Union[Optional[TrainingState], str]
+    state: Union[Optional[TrainerState], str]
     errors: Optional[Dict]
     uptime: Optional[float]
     progress: Optional[float]
@@ -98,7 +99,7 @@ class Training():
     base_model_id: Optional[str] = None
     data: Optional[TrainingData] = None
     training_number: Optional[int] = None
-    training_state: Optional[Union[TrainingState, str]] = None
+    training_state: Optional[Union[TrainerState, str]] = None
     model_id_for_detecting: Optional[str] = None
     hyperparameters: Optional[Dict] = None
 
diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py
index 00271f64..18b8ab6c 100644
--- a/learning_loop_node/detector/detector_node.py
+++ b/learning_loop_node/detector/detector_node.py
@@ -14,7 +14,7 @@
 from fastapi_socketio import SocketManager
 from socketio import AsyncClient
 
-from ..data_classes import Category, Context, Detections, DetectionStatus, ModelInformation, NodeState, Shape
+from ..data_classes import Category, Context, Detections, DetectionStatus, ModelInformation, Shape
 from ..data_classes.socket_response import SocketResponse
 from ..data_exchanger import DataExchanger, DownloadError
 from ..globals import GLOBALS
@@ -34,9 +34,8 @@
 class DetectorNode(Node):
 
     def __init__(self, name: str, detector: DetectorLogic, uuid: Optional[str] = None, use_backdoor_controls: bool = False) -> None:
-        super().__init__(name, uuid)
+        super().__init__(name, uuid, 'detector', False)
         self.detector_logic = detector
-        self.needs_login = False
         self.organization = environment_reader.organization()
         self.project = environment_reader.project()
         assert self.organization and self.project, 'Detector node needs an organization and an project'
@@ -353,9 +352,6 @@ def find_category_id_by_name(categories: List[Category], category_name: str):
             classification_detection.category_id = category_id
         return detections
 
-    def get_node_type(self):
-        return 'detector'
-
     def register_sio_events(self, sio_client: AsyncClient):
         pass
 
diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py
index 5e7adf33..85a81fd6 100644
--- a/learning_loop_node/node.py
+++ b/learning_loop_node/node.py
@@ -20,8 +20,9 @@
 
 class Node(FastAPI):
 
-    def __init__(self, name: str, uuid: Optional[str] = None, needs_login: bool = True):
+    def __init__(self, name: str, uuid: Optional[str] = None, node_type: str = 'node', needs_login: bool = True):
         """Base class for all nodes. A node is a process that communicates with the zauberzeug learning loop.
+        This class provides the basic functionality to connect to the learning loop via socket.io and to exchange data.
 
         Args:
             name (str): The name of the node. This name is used to generate a uuid.
@@ -49,7 +50,7 @@ def __init__(self, name: str, uuid: Optional[str] = None, needs_login: bool = Tr
 
         self.sio_headers = {'organization': self.loop_communicator.organization,
                             'project': self.loop_communicator.project,
-                            'nodeType': self.get_node_type()}
+                            'nodeType': node_type}
 
         self.repeat_task: Any = None
 
@@ -159,10 +160,6 @@ async def connect_sio(self):
 
     # --------------------------------------------------- ABSTRACT METHODS ---------------------------------------------------
 
-    @abstractmethod
-    def get_node_type(self):
-        """Return the type of the node. This is used to register the node at the learning loop."""
-
     @abstractmethod
     async def on_startup(self):
         """This method is called when the node is started."""
diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py
index d571a665..a0ad04d7 100644
--- a/learning_loop_node/trainer/tests/states/test_state_detecting.py
+++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py
@@ -1,7 +1,7 @@
 import asyncio
 
 from learning_loop_node.conftest import get_dummy_detections
-from learning_loop_node.data_classes import TrainingState
+from learning_loop_node.data_classes import TrainerState
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
@@ -31,7 +31,7 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi
 
 async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
-    create_active_training_file(trainer, training_state=TrainingState.TrainModelUploaded)
+    create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded)
     trainer.init_from_last_training()
     trainer.training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242'
 
diff --git a/learning_loop_node/trainer/tests/test_trainer_states.py b/learning_loop_node/trainer/tests/test_trainer_states.py
index c6e449b7..74e630d1 100644
--- a/learning_loop_node/trainer/tests/test_trainer_states.py
+++ b/learning_loop_node/trainer/tests/test_trainer_states.py
@@ -1,10 +1,9 @@
 
 from uuid import uuid4
 
-from learning_loop_node.data_classes import Context, Training, TrainingState
+from learning_loop_node.data_classes import Context, TrainerState, Training
 from learning_loop_node.trainer.io_helpers import LastTrainingIO
-from learning_loop_node.trainer.tests.testing_trainer_logic import \
-    TestingTrainerLogic
+from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_node import TrainerNode
 
 
@@ -27,7 +26,7 @@ def test_fixture_trainer_node(test_initialized_trainer_node):
 def test_save_load_training():
     training = create_training()
     last_training_io = LastTrainingIO('00000000-0000-0000-0000-000000000000')
-    training.training_state = TrainingState.Preparing
+    training.training_state = TrainerState.Preparing
     last_training_io.save(training)
 
     training = last_training_io.load()
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 2cff2d6f..dedaa9e6 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -19,9 +19,8 @@
 from tqdm import tqdm
 
 from ..data_classes import (BasicModel, Category, Context, Detections, Errors, Hyperparameter, ModelInformation,
-                            PretrainedModel, Training, TrainingData, TrainingError, TrainingState)
+                            PretrainedModel, TrainerState, Training, TrainingData, TrainingError)
 from ..helpers.misc import create_image_folder, create_project_folder, generate_training, is_valid_uuid4
-from ..node import Node
 from . import training_syncronizer
 from .downloader import TrainingsDownloader
 from .executor import Executor
@@ -76,6 +75,14 @@ def is_initialized(self) -> bool:
         """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'"""
         return self._training is not None and self._active_training_io is not None and self._node is not None
 
+    @property
+    def state(self) -> str:
+        if (not self.is_initialized) or (self.training.training_state is None):
+            return TrainerState.Idle.value
+        else:
+            state = self.training.training_state
+            return state.value if isinstance(state, TrainerState) else state
+
     def init_new_training(self, context: Context, details: Dict) -> None:
         """Called on `begin_training` event from the Learning Loop.
         Note that details needs the entries 'categories' and 'training_number'"""
@@ -90,7 +97,7 @@ def init_new_training(self, context: Context, details: Dict) -> None:
             self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details)
             self._training.training_number = details['training_number']
             self._training.base_model_id = details['id']
-            self._training.training_state = TrainingState.Initialized
+            self._training.training_state = TrainerState.Initialized
             self._active_training_io = ActiveTrainingIO(self._training.training_folder)
             logging.info(f'init training: {self._training}')
         except Exception:
@@ -112,7 +119,7 @@ async def run(self) -> None:
         except asyncio.CancelledError:
             if not self.shutdown_event.is_set():
                 logging.info('training task was cancelled but not by shutdown event')
-                self.training.training_state = TrainingState.ReadyForCleanup
+                self.training.training_state = TrainerState.ReadyForCleanup
                 self.node.last_training_io.save(self.training)
                 await self.clear_training()
 
@@ -134,27 +141,27 @@ async def _run(self) -> None:
             tstate = self.training.training_state
             logging.info(f'STATE LOOP: {tstate}')
             await asyncio.sleep(0.6)  # Note: Required for pytests!
-            if tstate == TrainingState.Initialized:  # -> DataDownloading -> DataDownloaded
+            if tstate == TrainerState.Initialized:  # -> DataDownloading -> DataDownloaded
                 await self.prepare()
-            elif tstate == TrainingState.DataDownloaded:  # -> TrainModelDownloading -> TrainModelDownloaded
+            elif tstate == TrainerState.DataDownloaded:  # -> TrainModelDownloading -> TrainModelDownloaded
                 await self.download_model()
-            elif tstate == TrainingState.TrainModelDownloaded:  # -> TrainingRunning -> TrainingFinished
+            elif tstate == TrainerState.TrainModelDownloaded:  # -> TrainingRunning -> TrainingFinished
                 await self.train()
-            elif tstate == TrainingState.TrainingFinished:  # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced
+            elif tstate == TrainerState.TrainingFinished:  # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced
                 await self.ensure_confusion_matrix_synced()
-            elif tstate == TrainingState.ConfusionMatrixSynced:  # -> TrainModelUploading -> TrainModelUploaded
+            elif tstate == TrainerState.ConfusionMatrixSynced:  # -> TrainModelUploading -> TrainModelUploaded
                 await self.upload_model()
-            elif tstate == TrainingState.TrainModelUploaded:  # -> Detecting -> Detected
+            elif tstate == TrainerState.TrainModelUploaded:  # -> Detecting -> Detected
                 await self.do_detections()
-            elif tstate == TrainingState.Detected:  # -> DetectionUploading -> ReadyForCleanup
+            elif tstate == TrainerState.Detected:  # -> DetectionUploading -> ReadyForCleanup
                 await self.upload_detections()
-            elif tstate == TrainingState.ReadyForCleanup:  # -> RESTART or TrainingFinished
+            elif tstate == TrainerState.ReadyForCleanup:  # -> RESTART or TrainingFinished
                 await self.clear_training()
                 self.may_restart()
 
     async def prepare(self) -> None:
         previous_state = self.training.training_state
-        self.training.training_state = TrainingState.DataDownloading
+        self.training.training_state = TrainerState.DataDownloading
         error_key = 'prepare'
         try:
             await self._prepare()
@@ -167,7 +174,7 @@ async def prepare(self) -> None:
             self.errors.set(error_key, str(e))
         else:
             self.errors.reset(error_key)
-            self.training.training_state = TrainingState.DataDownloaded
+            self.training.training_state = TrainerState.DataDownloaded
             self.node.last_training_io.save(self.training)
 
     async def _prepare(self) -> None:
@@ -181,7 +188,7 @@ async def _prepare(self) -> None:
     async def download_model(self) -> None:
         logging.info('Downloading model')
         previous_state = self.training.training_state
-        self.training.training_state = TrainingState.TrainModelDownloading
+        self.training.training_state = TrainerState.TrainModelDownloading
         error_key = 'download_model'
         try:
             await self._download_model()
@@ -195,7 +202,7 @@ async def download_model(self) -> None:
         else:
             self.errors.reset(error_key)
             logging.info('download_model_task finished')
-            self.training.training_state = TrainingState.TrainModelDownloaded
+            self.training.training_state = TrainerState.TrainModelDownloaded
             self.node.last_training_io.save(self.training)
 
     async def _download_model(self) -> None:
@@ -218,7 +225,7 @@ async def train(self) -> None:
         self.errors.reset(error_key)
         previous_state = self.training.training_state
         self._executor = Executor(self.training.training_folder)
-        self.training.training_state = TrainingState.TrainingRunning
+        self.training.training_state = TrainerState.TrainingRunning
         try:
             await self._start_training()
 
@@ -263,7 +270,7 @@ async def train(self) -> None:
             self.training.training_state = previous_state
             logging.exception('Error in run_training')
         else:
-            self.training.training_state = TrainingState.TrainingFinished
+            self.training.training_state = TrainerState.TrainingFinished
             self.node.last_training_io.save(self.training)
 
     async def _start_training(self):
@@ -283,7 +290,7 @@ async def _start_training(self):
     async def ensure_confusion_matrix_synced(self):
         logging.info('Ensure syncing confusion matrix')
         previous_state = self.training.training_state
-        self.training.training_state = TrainingState.ConfusionMatrixSyncing
+        self.training.training_state = TrainerState.ConfusionMatrixSyncing
         try:
             await self.sync_confusion_matrix()
         except asyncio.CancelledError:
@@ -293,7 +300,7 @@ async def ensure_confusion_matrix_synced(self):
             logging.exception('Error in ensure_confusion_matrix_synced')
             self.training.training_state = previous_state
         else:
-            self.training.training_state = TrainingState.ConfusionMatrixSynced
+            self.training.training_state = TrainerState.ConfusionMatrixSynced
             self.node.last_training_io.save(self.training)
 
     async def sync_confusion_matrix(self):
@@ -315,11 +322,11 @@ async def sync_confusion_matrix(self):
     async def upload_model(self) -> None:
         error_key = 'upload_model'
         previous_state = self.training.training_state
-        self.training.training_state = TrainingState.TrainModelUploading
+        self.training.training_state = TrainerState.TrainModelUploading
         try:
             new_model_id = await self._upload_model_return_new_id(self.training.context)
             if new_model_id is None:
-                self.training.training_state = TrainingState.ReadyForCleanup
+                self.training.training_state = TrainerState.ReadyForCleanup
                 logging.error('could not upload model - maybe training failed.. cleaning up')
                 return
             assert new_model_id is not None, 'uploaded_model must be set'
@@ -335,7 +342,7 @@ async def upload_model(self) -> None:
             # self.training.training_state = TrainingState.ReadyForCleanup
         else:
             self.errors.reset(error_key)
-            self.training.training_state = TrainingState.TrainModelUploaded
+            self.training.training_state = TrainerState.TrainModelUploaded
             self.node.last_training_io.save(self.training)
 
     async def _upload_model_return_new_id(self, context: Context) -> Optional[str]:
@@ -377,7 +384,7 @@ async def do_detections(self):
         error_key = 'detecting'
         previous_state = self.training.training_state
         try:
-            self.training.training_state = TrainingState.Detecting
+            self.training.training_state = TrainerState.Detecting
             await self._do_detections()
         except asyncio.CancelledError:
             logging.warning('CancelledError in do_detections')
@@ -388,7 +395,7 @@ async def do_detections(self):
             self.training.training_state = previous_state
         else:
             self.errors.reset(error_key)
-            self.training.training_state = TrainingState.Detected
+            self.training.training_state = TrainerState.Detected
             self.node.last_training_io.save(self.training)
 
     async def _do_detections(self) -> None:
@@ -439,7 +446,7 @@ async def _do_detections(self) -> None:
     async def upload_detections(self):
         error_key = 'upload_detections'
         previous_state = self.training.training_state
-        self.training.training_state = TrainingState.DetectionUploading
+        self.training.training_state = TrainerState.DetectionUploading
         await asyncio.sleep(0.1)  # NOTE needed for tests
         try:
             json_files = self.active_training_io.get_detection_file_names()
@@ -460,7 +467,7 @@ async def upload_detections(self):
             self.training.training_state = previous_state
         else:
             self.errors.reset(error_key)
-            self.training.training_state = TrainingState.ReadyForCleanup
+            self.training.training_state = TrainerState.ReadyForCleanup
             self.node.last_training_io.save(self.training)
 
     async def _upload_detections_batched(self, context: Context, detections: List[Detections]):
@@ -540,11 +547,11 @@ def general_progress(self) -> Optional[float]:
             return None
 
         t_state = self.training.training_state
-        if t_state == TrainingState.DataDownloading:
+        if t_state == TrainerState.DataDownloading:
             return self.node.data_exchanger.progress
-        if t_state == TrainingState.TrainingRunning:
+        if t_state == TrainerState.TrainingRunning:
             return self.training_progress
-        if t_state == TrainingState.Detecting:
+        if t_state == TrainerState.Detecting:
             return self.detection_progress
 
         return None
diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py
index d26831a2..219a48ca 100644
--- a/learning_loop_node/trainer/trainer_node.py
+++ b/learning_loop_node/trainer/trainer_node.py
@@ -7,7 +7,7 @@
 from fastapi.encoders import jsonable_encoder
 from socketio import AsyncClient
 
-from ..data_classes import Context, NodeState, TrainingState, TrainingStatus
+from ..data_classes import Context, NodeState, TrainerState, TrainingStatus
 from ..data_classes.socket_response import SocketResponse
 from ..node import Node
 from .io_helpers import LastTrainingIO
@@ -18,7 +18,7 @@
 class TrainerNode(Node):
 
     def __init__(self, name: str, trainer_logic: TrainerLogic, uuid: Optional[str] = None, use_backdoor_controls: bool = False):
-        super().__init__(name, uuid)
+        super().__init__(name, uuid, 'trainer')
         trainer_logic._node = self  # pylint: disable=protected-access
         self.trainer_logic = trainer_logic
         self.last_training_io = LastTrainingIO(self.uuid)
@@ -65,7 +65,6 @@ def register_sio_events(self, sio_client: AsyncClient):
 
         @sio_client.event
         async def begin_training(organization: str, project: str, details: Dict):
-            assert self._sio_client is not None
             self.log.info('received begin_training from server')
             self.trainer_logic.init_new_training(Context(organization=organization, project=project), details)
             asyncio.get_event_loop().create_task(self.trainer_logic.run())
@@ -81,20 +80,13 @@ async def stop_training():
             return True
 
     async def send_status(self):
-        if self._sio_client is None or not self._sio_client.connected:
+        if not self.sio_client.connected:
             self.log.warning('cannot send status - not connected to the Learning Loop')
             return
 
-        if not self.trainer_logic.is_initialized:
-            state_for_learning_loop = str(NodeState.Idle.value)
-        else:
-            assert self.trainer_logic.training.training_state is not None
-            state_for_learning_loop = TrainerNode.state_for_learning_loop(
-                self.trainer_logic.training.training_state)
-
         status = TrainingStatus(id=self.uuid,
                                 name=self.name,
-                                state=state_for_learning_loop,
+                                state=self.trainer_logic.state,
                                 errors={},
                                 uptime=self.training_uptime,
                                 progress=self.progress)
@@ -111,12 +103,10 @@ async def send_status(self):
             status.context = self.trainer_logic.training.context
 
         self.log.info(f'sending status: {status.short_str()}')
-        result = await self._sio_client.call('update_trainer', jsonable_encoder(asdict(status)), timeout=30)
+        result = await self.sio_client.call('update_trainer', jsonable_encoder(asdict(status)), timeout=30)
         assert isinstance(result, Dict)
-        response = from_dict(data_class=SocketResponse, data=result)
-
-        if not response.success:
-            self.log.error(f'Error when sending status update: Response from loop was:\n {asdict(response)}')
+        if not result['success']:
+            self.log.error(f'Error when sending status update: Response from loop was:\n {result}')
 
     async def continue_run_if_incomplete(self) -> bool:
         if not self.trainer_logic.is_initialized and self.last_training_io.exists():
@@ -126,43 +116,40 @@ async def continue_run_if_incomplete(self) -> bool:
             return True
         return False
 
-    def get_node_type(self):
-        return 'trainer'
-
     # --------------------------------------------------- HELPER ---------------------------------------------------
 
     @staticmethod
-    def state_for_learning_loop(trainer_state: Union[TrainingState, str]) -> str:
-        if trainer_state == TrainingState.Initialized:
+    def state_for_learning_loop(trainer_state: Union[TrainerState, str]) -> str:
+        if trainer_state == TrainerState.Initialized:
             return 'Training is initialized'
-        if trainer_state == TrainingState.DataDownloading:
+        if trainer_state == TrainerState.DataDownloading:
             return 'Downloading data'
-        if trainer_state == TrainingState.DataDownloaded:
+        if trainer_state == TrainerState.DataDownloaded:
             return 'Data downloaded'
-        if trainer_state == TrainingState.TrainModelDownloading:
+        if trainer_state == TrainerState.TrainModelDownloading:
             return 'Downloading model'
-        if trainer_state == TrainingState.TrainModelDownloaded:
+        if trainer_state == TrainerState.TrainModelDownloaded:
             return 'Model downloaded'
-        if trainer_state == TrainingState.TrainingRunning:
+        if trainer_state == TrainerState.TrainingRunning:
             return NodeState.Running
-        if trainer_state == TrainingState.TrainingFinished:
+        if trainer_state == TrainerState.TrainingFinished:
             return 'Training finished'
-        if trainer_state == TrainingState.Detecting:
+        if trainer_state == TrainerState.Detecting:
             return NodeState.Detecting
-        if trainer_state == TrainingState.ConfusionMatrixSyncing:
+        if trainer_state == TrainerState.ConfusionMatrixSyncing:
             return 'Syncing confusion matrix'
-        if trainer_state == TrainingState.ConfusionMatrixSynced:
+        if trainer_state == TrainerState.ConfusionMatrixSynced:
             return 'Confusion matrix synced'
-        if trainer_state == TrainingState.TrainModelUploading:
+        if trainer_state == TrainerState.TrainModelUploading:
             return 'Uploading trained model'
-        if trainer_state == TrainingState.TrainModelUploaded:
+        if trainer_state == TrainerState.TrainModelUploaded:
             return 'Trained model uploaded'
-        if trainer_state == TrainingState.Detecting:
+        if trainer_state == TrainerState.Detecting:
             return 'calculating detections'
-        if trainer_state == TrainingState.Detected:
+        if trainer_state == TrainerState.Detected:
             return 'Detections calculated'
-        if trainer_state == TrainingState.DetectionUploading:
+        if trainer_state == TrainerState.DetectionUploading:
             return 'Uploading detections'
-        if trainer_state == TrainingState.ReadyForCleanup:
+        if trainer_state == TrainerState.ReadyForCleanup:
             return 'Cleaning training'
         return 'unknown state'

From 00482753112ea41f5ed4112bd27e3b90b9216c8b Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Thu, 7 Mar 2024 13:26:51 +0100
Subject: [PATCH 08/62] Further cleanup and make annotation node send status
 once.

---
 .../annotation/annotator_node.py              | 17 ++++++--
 learning_loop_node/node.py                    |  3 +-
 learning_loop_node/trainer/trainer_node.py    | 42 +------------------
 mock_annotator/app_code/restart/restart.py    |  2 +
 mock_annotator/start.sh                       |  2 +-
 5 files changed, 19 insertions(+), 47 deletions(-)
 create mode 100644 mock_annotator/app_code/restart/restart.py

diff --git a/learning_loop_node/annotation/annotator_node.py b/learning_loop_node/annotation/annotator_node.py
index d12bcc0f..94848506 100644
--- a/learning_loop_node/annotation/annotator_node.py
+++ b/learning_loop_node/annotation/annotator_node.py
@@ -22,6 +22,7 @@ def __init__(self, name: str, annotator_logic: AnnotatorLogic, uuid: Optional[st
         self.tool = annotator_logic
         self.histories: Dict = {}
         annotator_logic.init(self)
+        self.status_sent = False
 
     def register_sio_events(self, sio_client: AsyncClient):
 
@@ -65,6 +66,9 @@ def get_history(self, frontend_id: str) -> Dict:
         return self.histories.setdefault(frontend_id, self.tool.create_empty_history())
 
     async def send_status(self):
+        if self.status_sent:
+            return
+
         status = AnnotationNodeStatus(
             id=self.uuid,
             name=self.name,
@@ -73,14 +77,19 @@ async def send_status(self):
         )
 
         self.log.info(f'Sending status {status}')
-        if self._sio_client is None:
-            raise Exception('No socket client')
-        result = await self._sio_client.call('update_annotation_node', jsonable_encoder(asdict(status)), timeout=10)
+        try:
+            result = await self.sio_client.call('update_annotation_node', jsonable_encoder(asdict(status)), timeout=10)
+        except Exception as e:
+            self.log.error(f'Error for updating: {str(e)}')
+            return
+
         assert isinstance(result, Dict)
         response = from_dict(data_class=SocketResponse, data=result)
 
         if not response.success:
             self.log.error(f'Error for updating: Response from loop was : {asdict(response)}')
+        else:
+            self.status_sent = True
 
     async def download_image(self, context: Context, uuid: str):
         project_folder = create_project_folder(context)
@@ -96,4 +105,4 @@ async def on_shutdown(self):
         pass
 
     async def on_repeat(self):
-        pass
+        await self.send_status()
diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py
index 85a81fd6..38742fa4 100644
--- a/learning_loop_node/node.py
+++ b/learning_loop_node/node.py
@@ -162,7 +162,8 @@ async def connect_sio(self):
 
     @abstractmethod
     async def on_startup(self):
-        """This method is called when the node is started."""
+        """This method is called when the node is started.
+        Note: In this method the sio connection is not yet established!"""
 
     @abstractmethod
     async def on_shutdown(self):
diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py
index 219a48ca..ae8f2527 100644
--- a/learning_loop_node/trainer/trainer_node.py
+++ b/learning_loop_node/trainer/trainer_node.py
@@ -3,12 +3,10 @@
 from dataclasses import asdict
 from typing import Dict, Optional, Union
 
-from dacite import from_dict
 from fastapi.encoders import jsonable_encoder
 from socketio import AsyncClient
 
-from ..data_classes import Context, NodeState, TrainerState, TrainingStatus
-from ..data_classes.socket_response import SocketResponse
+from ..data_classes import Context, TrainingStatus
 from ..node import Node
 from .io_helpers import LastTrainingIO
 from .rest import backdoor_controls, controls
@@ -115,41 +113,3 @@ async def continue_run_if_incomplete(self) -> bool:
             asyncio.get_event_loop().create_task(self.trainer_logic.run())
             return True
         return False
-
-    # --------------------------------------------------- HELPER ---------------------------------------------------
-
-    @staticmethod
-    def state_for_learning_loop(trainer_state: Union[TrainerState, str]) -> str:
-        if trainer_state == TrainerState.Initialized:
-            return 'Training is initialized'
-        if trainer_state == TrainerState.DataDownloading:
-            return 'Downloading data'
-        if trainer_state == TrainerState.DataDownloaded:
-            return 'Data downloaded'
-        if trainer_state == TrainerState.TrainModelDownloading:
-            return 'Downloading model'
-        if trainer_state == TrainerState.TrainModelDownloaded:
-            return 'Model downloaded'
-        if trainer_state == TrainerState.TrainingRunning:
-            return NodeState.Running
-        if trainer_state == TrainerState.TrainingFinished:
-            return 'Training finished'
-        if trainer_state == TrainerState.Detecting:
-            return NodeState.Detecting
-        if trainer_state == TrainerState.ConfusionMatrixSyncing:
-            return 'Syncing confusion matrix'
-        if trainer_state == TrainerState.ConfusionMatrixSynced:
-            return 'Confusion matrix synced'
-        if trainer_state == TrainerState.TrainModelUploading:
-            return 'Uploading trained model'
-        if trainer_state == TrainerState.TrainModelUploaded:
-            return 'Trained model uploaded'
-        if trainer_state == TrainerState.Detecting:
-            return 'calculating detections'
-        if trainer_state == TrainerState.Detected:
-            return 'Detections calculated'
-        if trainer_state == TrainerState.DetectionUploading:
-            return 'Uploading detections'
-        if trainer_state == TrainerState.ReadyForCleanup:
-            return 'Cleaning training'
-        return 'unknown state'
diff --git a/mock_annotator/app_code/restart/restart.py b/mock_annotator/app_code/restart/restart.py
new file mode 100644
index 00000000..915175ed
--- /dev/null
+++ b/mock_annotator/app_code/restart/restart.py
@@ -0,0 +1,2 @@
+# add 'reload_dirs=['./app_code/restart'] to uvicorn call in main.py
+# save this file to trigger uvicorn restart
diff --git a/mock_annotator/start.sh b/mock_annotator/start.sh
index e6d3aaac..7814999d 100755
--- a/mock_annotator/start.sh
+++ b/mock_annotator/start.sh
@@ -5,5 +5,5 @@ if [[ $1 = "debug" ]]; then
 elif [[ $1 = "profile" ]]; then
     kernprof -l /app/main.py
 else
-   python3 /app/main.py
+    uvicorn main:node --host 0.0.0.0 --port 80 --reload --lifespan on --reload-dir /app/app_code/restart
 fi
\ No newline at end of file

From 17eb503bbfb3017982f9fb134d37ac4242b8b3d2 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Thu, 7 Mar 2024 15:24:54 +0100
Subject: [PATCH 09/62] Change value of TrainerState.TrainingRunning back to
 the old value 'running', so old trainers still work in the loop

---
 learning_loop_node/data_classes/training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index b78190ef..dd9fb624 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -49,7 +49,7 @@ class TrainerState(str, Enum):
     DataDownloaded = 'data_downloaded'
     TrainModelDownloading = 'train_model_downloading'
     TrainModelDownloaded = 'train_model_downloaded'
-    TrainingRunning = 'training_running'
+    TrainingRunning = 'running'
     TrainingFinished = 'training_finished'
     ConfusionMatrixSyncing = 'confusion_matrix_syncing'
     ConfusionMatrixSynced = 'confusion_matrix_synced'

From 68a0eb56120df6ebf127b9f53573fb68a6f0e332 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Thu, 7 Mar 2024 16:48:44 +0100
Subject: [PATCH 10/62] Refactoring: use enums

---
 learning_loop_node/data_classes/training.py   |  4 +-
 .../tests/states/test_state_detecting.py      |  6 +-
 .../states/test_state_download_train_model.py | 11 +--
 .../tests/states/test_state_prepare.py        | 12 +--
 .../test_state_sync_confusion_matrix.py       | 25 ++++---
 .../trainer/tests/states/test_state_train.py  | 25 ++++---
 .../states/test_state_upload_detections.py    | 29 +++----
 .../tests/states/test_state_upload_model.py   | 18 ++---
 .../trainer/tests/test_trainer_states.py      |  2 +-
 learning_loop_node/trainer/trainer_logic.py   | 61 +++++++++------
 .../trainer/trainer_logic_abstraction.py      | 75 +++++++++++++++++++
 learning_loop_node/trainer/trainer_node.py    | 57 +++++---------
 12 files changed, 200 insertions(+), 125 deletions(-)
 create mode 100644 learning_loop_node/trainer/trainer_logic_abstraction.py

diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index dd9fb624..9a41928f 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -65,7 +65,7 @@ class TrainerState(str, Enum):
 class TrainingStatus():
     id: str  # TODO this must not be changed, but tests wont detect it -> update tests!
     name: str
-    state: Union[Optional[TrainerState], str]
+    state: Optional[TrainerState]
     errors: Optional[Dict]
     uptime: Optional[float]
     progress: Optional[float]
@@ -99,7 +99,7 @@ class Training():
     base_model_id: Optional[str] = None
     data: Optional[TrainingData] = None
     training_number: Optional[int] = None
-    training_state: Optional[Union[TrainerState, str]] = None
+    training_state: Optional[TrainerState] = None
     model_id_for_detecting: Optional[str] = None
     hyperparameters: Optional[Dict] = None
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py
index a0ad04d7..5d7583fe 100644
--- a/learning_loop_node/trainer/tests/states/test_state_detecting.py
+++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py
@@ -24,7 +24,7 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi
     await assert_training_state(trainer.training, 'detected', timeout=10, interval=0.001)
 
     assert trainer_has_error(trainer) is False
-    assert trainer.training.training_state == 'detected'
+    assert trainer.training.training_state == TrainerState.Detected
     assert trainer.node.last_training_io.load() == trainer.training
     assert trainer.active_training_io.detections_exist()
 
@@ -48,7 +48,7 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer
 
 async def test_model_not_downloadable_error(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
-    create_active_training_file(trainer, training_state='train_model_uploaded',
+    create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded,
                                 model_id_for_detecting='00000000-0000-0000-0000-000000000000')  # bad model id
     trainer.init_from_last_training()
 
@@ -58,7 +58,7 @@ async def test_model_not_downloadable_error(test_initialized_trainer: TestingTra
     await assert_training_state(trainer.training, 'train_model_uploaded', timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == 'train_model_uploaded'
+    assert trainer.training.training_state == TrainerState.TrainModelUploaded
     assert trainer.training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000'
     assert trainer.node.last_training_io.load() == trainer.training
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
index 687e5060..5785e5fa 100644
--- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
@@ -2,13 +2,14 @@
 import asyncio
 import os
 
+from learning_loop_node.data_classes import TrainerState
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 
 
 async def test_downloading_is_successful(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
-    create_active_training_file(trainer, training_state='data_downloaded')
+    create_active_training_file(trainer, training_state=TrainerState.DataDownloaded)
 
     trainer.model_format = 'mocked'
     trainer.init_from_last_training()
@@ -17,7 +18,7 @@ async def test_downloading_is_successful(test_initialized_trainer: TestingTraine
     await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
     await assert_training_state(trainer.training, 'train_model_downloaded', timeout=1, interval=0.001)
 
-    assert trainer.training.training_state == 'train_model_downloaded'
+    assert trainer.training.training_state == TrainerState.TrainModelDownloaded
     assert trainer.node.last_training_io.load() == trainer.training
 
     # file on disk
@@ -43,15 +44,15 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogi
 
 async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
-    create_active_training_file(trainer, training_state='data_downloaded',
+    create_active_training_file(trainer, training_state=TrainerState.DataDownloaded,
                                 base_model_id='00000000-0000-0000-0000-000000000000')  # bad model id)
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
     await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, 'data_downloaded', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.DataDownloaded, timeout=1, interval=0.001)
 
     assert trainer.errors.has_error_for('download_model')
     assert trainer._training is not None  # pylint: disable=protected-access
-    assert trainer.training.training_state == 'data_downloaded'
+    assert trainer.training.training_state == TrainerState.DataDownloaded
     assert trainer.node.last_training_io.load() == trainer.training
diff --git a/learning_loop_node/trainer/tests/states/test_state_prepare.py b/learning_loop_node/trainer/tests/states/test_state_prepare.py
index 9d2eedcc..261fbb70 100644
--- a/learning_loop_node/trainer/tests/states/test_state_prepare.py
+++ b/learning_loop_node/trainer/tests/states/test_state_prepare.py
@@ -1,6 +1,6 @@
 import asyncio
 
-from learning_loop_node.data_classes import Context
+from learning_loop_node.data_classes import Context, TrainerState
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
@@ -19,7 +19,7 @@ async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerL
 
     await trainer.prepare()
     assert trainer_has_error(trainer) is False
-    assert trainer.training.training_state == 'data_downloaded'
+    assert trainer.training.training_state == TrainerState.DataDownloaded
     assert trainer.training.data is not None
     assert trainer.node.last_training_io.load() == trainer.training
 
@@ -30,7 +30,7 @@ async def test_abort_preparing(test_initialized_trainer: TestingTrainerLogic):
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, 'data_downloading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -46,10 +46,10 @@ async def test_request_error(test_initialized_trainer: TestingTrainerLogic):
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, 'data_downloading', timeout=3, interval=0.001)
-    await assert_training_state(trainer.training, 'initialized', timeout=3, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=3, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Initialized, timeout=3, interval=0.001)
 
     assert trainer_has_error(trainer)
     assert trainer._training is not None  # pylint: disable=protected-access
-    assert trainer.training.training_state == 'initialized'
+    assert trainer.training.training_state == TrainerState.Initialized
     assert trainer.node.last_training_io.load() == trainer.training
diff --git a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
index b6cce7c2..51fec3ff 100644
--- a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
+++ b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
@@ -3,6 +3,7 @@
 
 from pytest_mock import MockerFixture  # pip install pytest-mock
 
+from learning_loop_node.data_classes import TrainerState
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 from learning_loop_node.trainer.trainer_node import TrainerNode
 
@@ -21,14 +22,14 @@ async def test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic):
 
     # TODO this requires trainer to have _training
     # trainer.load_active_training()
-    create_active_training_file(trainer, training_state='training_finished')
+    create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'confusion_matrix_synced', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
     assert trainer_has_error(trainer) is False
-    assert trainer.training.training_state == 'confusion_matrix_synced'
+    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
     assert trainer.node.last_training_io.load() == trainer.training
 
 
@@ -37,16 +38,16 @@ async def test_unsynced_model_available__sync_successful(test_initialized_traine
     assert isinstance(trainer, TestingTrainerLogic)
 
     await mock_socket_io_call(mocker, test_initialized_trainer_node, {'success': True})
-    create_active_training_file(trainer, training_state='training_finished')
+    create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
 
     trainer.init_from_last_training()
     trainer.has_new_model = True
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, 'confusion_matrix_synced', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer) is False
-#    assert trainer.training.training_state == 'confusion_matrix_synced'
+#    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
     assert trainer.node.last_training_io.load() == trainer.training
 
 
@@ -54,7 +55,7 @@ async def test_unsynced_model_available__sio_not_connected(test_initialized_trai
     trainer = test_initialized_trainer_node.trainer_logic
     assert isinstance(trainer, TestingTrainerLogic)
 
-    create_active_training_file(trainer, training_state='training_finished')
+    create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
 
     assert test_initialized_trainer_node.sio_client.connected is False
     trainer.has_new_model = True
@@ -62,10 +63,10 @@ async def test_unsynced_model_available__sio_not_connected(test_initialized_trai
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
     await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, 'training_finished', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == 'training_finished'
+    assert trainer.training.training_state == TrainerState.TrainingFinished
     assert trainer.node.last_training_io.load() == trainer.training
 
 
@@ -75,16 +76,16 @@ async def test_unsynced_model_available__request_is_not_successful(test_initiali
 
     await mock_socket_io_call(mocker, test_initialized_trainer_node, {'success': False})
 
-    create_active_training_file(trainer, training_state='training_finished')
+    create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
 
     trainer.has_new_model = True
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
     await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, 'training_finished', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == 'training_finished'
+    assert trainer.training.training_state == TrainerState.TrainingFinished
     assert trainer.node.last_training_io.load() == trainer.training
 
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py
index c46294ba..9d6b31f4 100644
--- a/learning_loop_node/trainer/tests/states/test_state_train.py
+++ b/learning_loop_node/trainer/tests/states/test_state_train.py
@@ -1,5 +1,6 @@
 import asyncio
 
+from learning_loop_node.data_classes import TrainerState
 from learning_loop_node.tests.test_helper import condition
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
@@ -8,41 +9,41 @@
 async def test_successful_training(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
-    create_active_training_file(trainer, training_state='train_model_downloaded')
+    create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'start_training'
 
     # pylint: disable=protected-access
     assert trainer._executor is not None
     trainer._executor.stop()  # NOTE normally a training terminates itself
-    await assert_training_state(trainer.training, 'training_finished', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.training.training_state == 'training_finished'
+    assert trainer.training.training_state == TrainerState.TrainingFinished
     assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_stop_running_training(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
-    create_active_training_file(trainer, training_state='train_model_downloaded')
+    create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
     await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)  # pylint: disable=protected-access
-    await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'start_training'
 
     await trainer.stop()
-    await assert_training_state(trainer.training, 'training_finished', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.training.training_state == 'training_finished'
+    assert trainer.training.training_state == TrainerState.TrainingFinished
     assert trainer.node.last_training_io.load() == trainer.training
 
 
@@ -50,21 +51,21 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain
     trainer = test_initialized_trainer
 
     # NOTE e.g. when a node-computer is restarted
-    create_active_training_file(trainer, training_state='train_model_downloaded')
+    create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
     trainer.init_from_last_training()
     trainer._can_resume = True  # pylint: disable=protected-access
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
     await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)  # pylint: disable=protected-access
-    await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'resume'
 
     # pylint: disable=protected-access
     assert trainer._executor is not None
     trainer._executor.stop()  # NOTE normally a training terminates itself e.g
-    await assert_training_state(trainer.training, 'training_finished', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.training.training_state == 'training_finished'
+    assert trainer.training.training_state == TrainerState.TrainingFinished
     assert trainer.node.last_training_io.load() == trainer.training
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
index ca6912d1..a6f69c56 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
@@ -1,3 +1,4 @@
+from learning_loop_node.data_classes import TrainerState
 import asyncio
 
 import pytest
@@ -43,13 +44,13 @@ async def create_valid_detection_file(trainer: TrainerLogic, number_of_entries:
 @pytest.mark.asyncio
 async def test_upload_successful(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
-    create_active_training_file(trainer, training_state='detected')
+    create_active_training_file(trainer, training_state=TrainerState.Detected)
     trainer.init_from_last_training()
 
     await create_valid_detection_file(trainer)
     await trainer.upload_detections()
 
-    assert trainer.training.training_state == 'ready_for_cleanup'
+    assert trainer.training.training_state == TrainerState.ReadyForCleanup
     assert trainer.node.last_training_io.load() == trainer.training
 
 
@@ -57,7 +58,7 @@ async def test_upload_successful(test_initialized_trainer: TestingTrainerLogic):
 async def test_detection_upload_progress_is_stored(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
-    create_active_training_file(trainer, training_state='detected')
+    create_active_training_file(trainer, training_state=TrainerState.Detected)
     trainer.init_from_last_training()
 
     await create_valid_detection_file(trainer)
@@ -72,7 +73,7 @@ async def test_detection_upload_progress_is_stored(test_initialized_trainer: Tes
 async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
-    create_active_training_file(trainer, training_state='detected')
+    create_active_training_file(trainer, training_state=TrainerState.Detected)
     trainer.init_from_last_training()
 
     await create_valid_detection_file(trainer, 2, 0)
@@ -114,17 +115,17 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test
 async def test_bad_status_from_LearningLoop(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
-    create_active_training_file(trainer, training_state='detected', context=Context(
+    create_active_training_file(trainer, training_state=TrainerState.Detected, context=Context(
         organization='zauberzeug', project='some_bad_project'))
     trainer.init_from_last_training()
     trainer.active_training_io.save_detections([get_dummy_detections()])
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, 'detection_uploading', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, 'detected', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == 'detected'
+    assert trainer.training.training_state == TrainerState.Detected
     assert trainer.node.last_training_io.load() == trainer.training
 
 
@@ -132,28 +133,28 @@ async def test_other_errors(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
     # e.g. missing detection file
-    create_active_training_file(trainer, training_state='detected')
+    create_active_training_file(trainer, training_state=TrainerState.Detected)
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, 'detection_uploading', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, 'detected', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == 'detected'
+    assert trainer.training.training_state == TrainerState.Detected
     assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
-    create_active_training_file(trainer, training_state='detected')
+    create_active_training_file(trainer, training_state=TrainerState.Detected)
     trainer.init_from_last_training()
     await create_valid_detection_file(trainer)
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'detection_uploading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
index 41a5a4a8..efc40010 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
@@ -2,7 +2,7 @@
 
 from pytest_mock import MockerFixture
 
-from learning_loop_node.data_classes import Context
+from learning_loop_node.data_classes import Context, TrainerState
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
@@ -23,11 +23,11 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer
 
     train_task = asyncio.get_running_loop().create_task(trainer.upload_model())
 
-    await assert_training_state(trainer.training, 'train_model_uploading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
     await train_task
 
     assert trainer_has_error(trainer) is False
-    assert trainer.training.training_state == 'train_model_uploaded'
+    assert trainer.training.training_state == TrainerState.TrainModelUploaded
     assert trainer.training.model_id_for_detecting is not None
     assert trainer.node.last_training_io.load() == trainer.training
 
@@ -35,12 +35,12 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer
 async def test_abort_upload_model(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
-    create_active_training_file(trainer, training_state='confusion_matrix_synced')
+    create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced)
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'train_model_uploading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -55,17 +55,17 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai
     The training should be aborted and the training state should be set to confusion_matrix_synced."""
     trainer = test_initialized_trainer
 
-    create_active_training_file(trainer, training_state='confusion_matrix_synced')
+    create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced)
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'train_model_uploading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
     # TODO goes to finished because of the error
-    await assert_training_state(trainer.training, 'confusion_matrix_synced', timeout=2, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == 'confusion_matrix_synced'
+    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
     assert trainer.training.model_id_for_detecting is None
     assert trainer.node.last_training_io.load() == trainer.training
 
diff --git a/learning_loop_node/trainer/tests/test_trainer_states.py b/learning_loop_node/trainer/tests/test_trainer_states.py
index 74e630d1..c5f2d04e 100644
--- a/learning_loop_node/trainer/tests/test_trainer_states.py
+++ b/learning_loop_node/trainer/tests/test_trainer_states.py
@@ -30,4 +30,4 @@ def test_save_load_training():
     last_training_io.save(training)
 
     training = last_training_io.load()
-    assert training.training_state == 'preparing'
+    assert training.training_state == TrainerState.Preparing
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index dedaa9e6..717e9c8f 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -25,12 +25,10 @@
 from .downloader import TrainingsDownloader
 from .executor import Executor
 from .io_helpers import ActiveTrainingIO
+from .trainer_logic_abstraction import TrainerLogicAbstraction
 
-if TYPE_CHECKING:
-    from .trainer_node import TrainerNode
 
-
-class TrainerLogic():
+class TrainerLogic(TrainerLogicAbstraction):
 
     def __init__(self, model_format: str) -> None:
         self.model_format: str = model_format
@@ -44,12 +42,15 @@ def __init__(self, model_format: str) -> None:
 
         self._training: Optional[Training] = None
         self._active_training_io: Optional[ActiveTrainingIO] = None
-        self._node: Optional[TrainerNode] = None
         self.restart_after_training = os.environ.get('RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1']
         self.keep_old_trainings = os.environ.get('KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1']
         self.inference_batch_size = int(os.environ.get('INFERENCE_BATCH_SIZE', '10'))
         logging.info(f'INFERENCE_BATCH_SIZE: {self.inference_batch_size}')
 
+    @property
+    def training_uptime(self) -> Union[float, None]:
+        return time.time() - self.start_time if self.start_time else None
+
     @property
     def executor(self) -> Executor:
         assert self._executor is not None, 'executor must be set, call `run_training` first'
@@ -66,22 +67,28 @@ def active_training_io(self) -> ActiveTrainingIO:
         return self._active_training_io
 
     @property
-    def node(self) -> 'TrainerNode':
-        assert self._node is not None, 'node should be set by TrainerNodes before initialization'
-        return self._node
-
-    @property
-    def is_initialized(self) -> bool:
+    def training_active(self) -> bool:
         """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'"""
         return self._training is not None and self._active_training_io is not None and self._node is not None
 
     @property
-    def state(self) -> str:
-        if (not self.is_initialized) or (self.training.training_state is None):
-            return TrainerState.Idle.value
+    def state(self) -> TrainerState:
+        if (not self.training_active) or (self.training.training_state is None):
+            return TrainerState.Idle
         else:
-            state = self.training.training_state
-            return state.value if isinstance(state, TrainerState) else state
+            return self.training.training_state
+
+    @property
+    def training_data(self) -> TrainingData | None:
+        if self.training_active and self.training.data:
+            return self.training.data
+        return None
+
+    @property
+    def training_context(self) -> Context | None:
+        if self.training_active:
+            return self.training.context
+        return None
 
     def init_new_training(self, context: Context, details: Dict) -> None:
         """Called on `begin_training` event from the Learning Loop.
@@ -108,13 +115,25 @@ def init_from_last_training(self) -> None:
         assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder'
         self._active_training_io = ActiveTrainingIO(self._training.training_folder)
 
+    async def continue_run_if_incomplete(self) -> bool:
+        if not self.training_active and self.node.last_training_io.exists():
+            logging.info('found incomplete training, continuing now.')
+            self.init_from_last_training()
+            asyncio.get_event_loop().create_task(self.run())
+            return True
+        return False
+
+    async def begin_training(self, organization: str, project: str, details: Dict) -> None:
+        self.init_new_training(Context(organization=organization, project=project), details)
+        asyncio.get_event_loop().create_task(self.run())
+
     async def run(self) -> None:
         """Called on `begin_training` event from the Learning Loop."""
 
         self.start_time = time.time()
         self.errors.reset_all()
         try:
-            self.training_task = asyncio.get_running_loop().create_task(self._run())
+            self.training_task = asyncio.get_running_loop().create_task(self._run_training_loop())
             await self.training_task  # Object is used to potentially cancel the task
         except asyncio.CancelledError:
             if not self.shutdown_event.is_set():
@@ -130,10 +149,10 @@ async def run(self) -> None:
 
     # ---------------------------------------- TRAINING STATES ----------------------------------------
 
-    async def _run(self) -> None:
+    async def _run_training_loop(self) -> None:
         """asyncio.CancelledError is catched in train"""
 
-        if not self.is_initialized:
+        if not self.training_active:
             logging.error('could not start training - trainer is not initialized')
             return
 
@@ -511,7 +530,7 @@ async def clear_training(self):
 
     async def stop(self) -> None:
         """If executor is running, stop it. Else cancel training task."""
-        if not self.is_initialized:
+        if not self.training_active:
             return
         if self._executor and self._executor.is_process_running():
             self.executor.stop()
@@ -543,7 +562,7 @@ def may_restart(self) -> None:
     @property
     def general_progress(self) -> Optional[float]:
         """Represents the progress for different states."""
-        if not self.is_initialized:
+        if not self.training_active:
             return None
 
         t_state = self.training.training_state
diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py
new file mode 100644
index 00000000..a4d1f39b
--- /dev/null
+++ b/learning_loop_node/trainer/trainer_logic_abstraction.py
@@ -0,0 +1,75 @@
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, List, Optional
+
+from ..data_classes import Context, Errors, PretrainedModel, TrainerState, TrainingData
+
+if TYPE_CHECKING:
+    from .trainer_node import TrainerNode
+
+
+class TrainerLogicAbstraction(ABC):
+
+    def __init__(self):
+        self._node: Optional['TrainerNode'] = None  # type: ignore
+        self.errors = Errors()
+
+    @property
+    def node(self) -> 'TrainerNode':
+        assert self._node is not None, 'node should be set by TrainerNodes before initialization'
+        return self._node
+
+    @property
+    @abstractmethod
+    def state(self) -> TrainerState:
+        """Returns the current state of the training logic"""
+
+    @property
+    @abstractmethod
+    def training_uptime(self) -> float | None:
+        """Returns the time in seconds since the training started or None if idle"""
+
+    @property
+    @abstractmethod
+    def general_progress(self) -> float | None:
+        """Returns the general progress of the training per state or None if idle"""
+
+    @property
+    @abstractmethod
+    def provided_pretrained_models(self) -> List[PretrainedModel]:
+        """Returns the list of provided pretrained models"""
+
+    @property
+    @abstractmethod
+    def model_architecture(self) -> str:
+        """Returns the architecture name of the model"""
+
+    @property
+    @abstractmethod
+    def hyperparameters(self) -> dict | None:
+        """Returns the hyperparameters if available"""
+
+    @property
+    @abstractmethod
+    def training_data(self) -> TrainingData | None:
+        """Returns the training data if available"""
+
+    @property
+    @abstractmethod
+    def training_context(self) -> Context | None:
+        """Returns the training context if available"""
+
+    @abstractmethod
+    async def begin_training(self, organization: str, project: str, details: dict):
+        """Starts the training process"""
+
+    @abstractmethod
+    async def stop(self):
+        """Stops the training process"""
+
+    @abstractmethod
+    async def shutdown(self):
+        """Stops the training process and releases resources"""
+
+    @abstractmethod
+    async def continue_run_if_incomplete(self) -> bool:
+        """Continues the training if it is incomplete"""
diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py
index ae8f2527..fb191a9d 100644
--- a/learning_loop_node/trainer/trainer_node.py
+++ b/learning_loop_node/trainer/trainer_node.py
@@ -1,21 +1,20 @@
 import asyncio
-import time
 from dataclasses import asdict
-from typing import Dict, Optional, Union
+from typing import Dict, Optional
 
 from fastapi.encoders import jsonable_encoder
 from socketio import AsyncClient
 
-from ..data_classes import Context, TrainingStatus
+from ..data_classes import TrainingStatus
 from ..node import Node
 from .io_helpers import LastTrainingIO
 from .rest import backdoor_controls, controls
-from .trainer_logic import TrainerLogic
+from .trainer_logic_abstraction import TrainerLogicAbstraction
 
 
 class TrainerNode(Node):
 
-    def __init__(self, name: str, trainer_logic: TrainerLogic, uuid: Optional[str] = None, use_backdoor_controls: bool = False):
+    def __init__(self, name: str, trainer_logic: TrainerLogicAbstraction, uuid: Optional[str] = None, use_backdoor_controls: bool = False):
         super().__init__(name, uuid, 'trainer')
         trainer_logic._node = self  # pylint: disable=protected-access
         self.trainer_logic = trainer_logic
@@ -24,18 +23,7 @@ def __init__(self, name: str, trainer_logic: TrainerLogic, uuid: Optional[str] =
         if use_backdoor_controls:
             self.include_router(backdoor_controls.router, tags=["controls"])
 
-    # --------------------------------------------------- STATUS ---------------------------------------------------
-
-    @property
-    def progress(self) -> Union[float, None]:
-        return self.trainer_logic.general_progress if (self.trainer_logic is not None and
-                                                       hasattr(self.trainer_logic, 'general_progress')) else None
-
-    @property
-    def training_uptime(self) -> Union[float, None]:
-        return time.time() - self.trainer_logic.start_time if self.trainer_logic.start_time else None
-
-    # ----------------------------------- LIVECYCLE: ABSTRACT NODE METHODS --------------------------
+    # ----------------------------------- NODE LIFECYCLE METHODS --------------------------
 
     async def on_startup(self):
         pass
@@ -46,26 +34,24 @@ async def on_shutdown(self):
 
     async def on_repeat(self):
         try:
-            if await self.continue_run_if_incomplete():
+            if await self.trainer_logic.continue_run_if_incomplete():
                 return  # NOTE: we prevent sending idle status after starting a continuation
             await self.send_status()
         except Exception as e:
             if isinstance(e, asyncio.TimeoutError):
                 self.log.warning('timeout when sending status to learning loop, reconnecting sio_client')
-                await self.sio_client.disconnect()
-                # NOTE: reconnect happens in node._on_repeat
+                await self.sio_client.disconnect()  # NOTE: reconnect happens in node._on_repeat
             else:
                 self.log.exception(f'could not send status state: {e}')
 
-    # ---------------------------------------------- NODE ABSTRACT METHODS ---------------------------------------------------
+    # ---------------------------------------------- NODE METHODS ---------------------------------------------------
 
     def register_sio_events(self, sio_client: AsyncClient):
 
         @sio_client.event
         async def begin_training(organization: str, project: str, details: Dict):
             self.log.info('received begin_training from server')
-            self.trainer_logic.init_new_training(Context(organization=organization, project=project), details)
-            asyncio.get_event_loop().create_task(self.trainer_logic.run())
+            await self.trainer_logic.begin_training(organization, project, details)
             return True
 
         @sio_client.event
@@ -86,30 +72,21 @@ async def send_status(self):
                                 name=self.name,
                                 state=self.trainer_logic.state,
                                 errors={},
-                                uptime=self.training_uptime,
-                                progress=self.progress)
+                                uptime=self.trainer_logic.training_uptime,
+                                progress=self.trainer_logic.general_progress)
 
         status.pretrained_models = self.trainer_logic.provided_pretrained_models
         status.architecture = self.trainer_logic.model_architecture
 
-        if self.trainer_logic.is_initialized and self.trainer_logic.training.data:
-            status.train_image_count = self.trainer_logic.training.data.train_image_count()
-            status.test_image_count = self.trainer_logic.training.data.test_image_count()
-            status.skipped_image_count = self.trainer_logic.training.data.skipped_image_count
+        if data := self.trainer_logic.training_data:
+            status.train_image_count = data.train_image_count()
+            status.test_image_count = data.test_image_count()
+            status.skipped_image_count = data.skipped_image_count
             status.hyperparameters = self.trainer_logic.hyperparameters
             status.errors = self.trainer_logic.errors.errors
-            status.context = self.trainer_logic.training.context
+            status.context = self.trainer_logic.training_context
 
         self.log.info(f'sending status: {status.short_str()}')
         result = await self.sio_client.call('update_trainer', jsonable_encoder(asdict(status)), timeout=30)
-        assert isinstance(result, Dict)
-        if not result['success']:
+        if isinstance(result, Dict) and not result['success']:
             self.log.error(f'Error when sending status update: Response from loop was:\n {result}')
-
-    async def continue_run_if_incomplete(self) -> bool:
-        if not self.trainer_logic.is_initialized and self.last_training_io.exists():
-            self.log.info('found incomplete training, continuing now.')
-            self.trainer_logic.init_from_last_training()
-            asyncio.get_event_loop().create_task(self.trainer_logic.run())
-            return True
-        return False

From 3b908b31058f2166ce7018ed85a06504c3538acd Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Thu, 7 Mar 2024 16:50:37 +0100
Subject: [PATCH 11/62] introduce trainer_logic_abstraction

---
 learning_loop_node/trainer/trainer_logic.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 717e9c8f..b08f1c87 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -10,7 +10,7 @@
 from datetime import datetime
 from glob import glob
 from time import perf_counter
-from typing import TYPE_CHECKING, Coroutine, Dict, List, Optional, Union
+from typing import Coroutine, Dict, List, Optional, Union
 from uuid import uuid4
 
 import socketio
@@ -18,7 +18,7 @@
 from fastapi.encoders import jsonable_encoder
 from tqdm import tqdm
 
-from ..data_classes import (BasicModel, Category, Context, Detections, Errors, Hyperparameter, ModelInformation,
+from ..data_classes import (BasicModel, Category, Context, Detections, Hyperparameter, ModelInformation,
                             PretrainedModel, TrainerState, Training, TrainingData, TrainingError)
 from ..helpers.misc import create_image_folder, create_project_folder, generate_training, is_valid_uuid4
 from . import training_syncronizer
@@ -31,12 +31,12 @@
 class TrainerLogic(TrainerLogicAbstraction):
 
     def __init__(self, model_format: str) -> None:
+        super().__init__()
         self.model_format: str = model_format
         self._executor: Optional[Executor] = None
         self.start_time: Optional[float] = None
         self.training_task: Optional[asyncio.Task] = None
         self.start_training_task: Optional[Coroutine] = None
-        self.errors = Errors()
         self.shutdown_event: asyncio.Event = asyncio.Event()
         self.detection_progress = 0.0
 

From c6eb38b127bb5d3dffb63d23ae5ca7c46ecb9174 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Thu, 7 Mar 2024 22:31:53 +0100
Subject: [PATCH 12/62] Remove lots of duplicate code in state processing -
 test locally green

---
 learning_loop_node/data_classes/training.py   |   8 +-
 learning_loop_node/data_exchanger.py          |   3 +
 .../tests/test_client_communication.py        |   3 +-
 learning_loop_node/trainer/io_helpers.py      |  64 ++-
 learning_loop_node/trainer/rest/controls.py   |   2 +-
 .../trainer/tests/state_helper.py             |   2 +-
 .../tests/states/test_state_detecting.py      |  28 +-
 .../states/test_state_download_train_model.py |  29 +-
 .../tests/states/test_state_prepare.py        |  18 +-
 .../test_state_sync_confusion_matrix.py       |  26 +-
 .../trainer/tests/states/test_state_train.py  |  25 +-
 .../states/test_state_upload_detections.py    |  37 +-
 .../tests/states/test_state_upload_model.py   |  22 +-
 .../trainer/tests/test_errors.py              |  17 +-
 .../trainer/tests/testing_trainer_logic.py    |   9 +-
 learning_loop_node/trainer/trainer_logic.py   | 412 +++++-------------
 .../trainer/trainer_logic_abstraction.py      | 136 ++++--
 learning_loop_node/trainer/trainer_node.py    |   8 +-
 .../trainer/training_syncronizer.py           |  45 +-
 mock_trainer/app_code/progress_simulator.py   |   4 +-
 .../app_code/tests/test_mock_trainer.py       |   5 +-
 21 files changed, 433 insertions(+), 470 deletions(-)

diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index 9a41928f..a0601c2d 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -1,8 +1,9 @@
 
 import sys
+import time
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional
 
 # pylint: disable=no-name-in-module
 from .general import Category, Context
@@ -65,7 +66,7 @@ class TrainerState(str, Enum):
 class TrainingStatus():
     id: str  # TODO this must not be changed, but tests wont detect it -> update tests!
     name: str
-    state: Optional[TrainerState]
+    state: Optional[str]
     errors: Optional[Dict]
     uptime: Optional[float]
     progress: Optional[float]
@@ -95,11 +96,12 @@ class Training():
     project_folder: str
     images_folder: str
     training_folder: str
+    start_time: float = field(default_factory=time.time)
 
     base_model_id: Optional[str] = None
     data: Optional[TrainingData] = None
     training_number: Optional[int] = None
-    training_state: Optional[TrainerState] = None
+    training_state: Optional[str] = None
     model_id_for_detecting: Optional[str] = None
     hyperparameters: Optional[Dict] = None
 
diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py
index 6bb30e6d..ab53b243 100644
--- a/learning_loop_node/data_exchanger.py
+++ b/learning_loop_node/data_exchanger.py
@@ -21,6 +21,9 @@ def __init__(self, cause: str, *args: object) -> None:
         super().__init__(*args)
         self.cause = cause
 
+    def __str__(self) -> str:
+        return f'DownloadError: {self.cause}'
+
 
 class DataExchanger():
 
diff --git a/learning_loop_node/detector/tests/test_client_communication.py b/learning_loop_node/detector/tests/test_client_communication.py
index 16f0fa6b..97daf93a 100644
--- a/learning_loop_node/detector/tests/test_client_communication.py
+++ b/learning_loop_node/detector/tests/test_client_communication.py
@@ -2,7 +2,7 @@
 import json
 
 import pytest
-import requests
+import requests  # type: ignore
 
 from learning_loop_node import DetectorNode
 from learning_loop_node.data_classes import ModelInformation
@@ -88,6 +88,7 @@ async def test_sio_upload(test_detector_node: DetectorNode, sio_client):
     assert len(get_outbox_files(test_detector_node.outbox)) == 2, 'There should be one image and one .json file.'
 
 
+# NOTE: This test seems to be flaky.
 async def test_about_endpoint(test_detector_node: DetectorNode):
     await asyncio.sleep(1)
     response = requests.get(f'http://localhost:{GLOBALS.detector_port}/about', timeout=30)
diff --git a/learning_loop_node/trainer/io_helpers.py b/learning_loop_node/trainer/io_helpers.py
index 3755f2f2..6ec7a5c3 100644
--- a/learning_loop_node/trainer/io_helpers.py
+++ b/learning_loop_node/trainer/io_helpers.py
@@ -1,5 +1,6 @@
 
 import json
+import logging
 import os
 from dataclasses import asdict
 from pathlib import Path
@@ -8,8 +9,9 @@
 from dacite import from_dict
 from fastapi.encoders import jsonable_encoder
 
-from ..data_classes import Detections, Training
+from ..data_classes import Context, Detections, Training
 from ..globals import GLOBALS
+from ..loop_communication import LoopCommunicator
 
 
 class LastTrainingIO:
@@ -35,13 +37,16 @@ def exists(self) -> bool:
 
 class ActiveTrainingIO:
 
-    @staticmethod
-    def create_mocked_training_io() -> 'ActiveTrainingIO':
-        training_folder = ''
-        return ActiveTrainingIO(training_folder)
+    # @staticmethod
+    # def create_mocked_training_io() -> 'ActiveTrainingIO':
+    #     training_folder = ''
+    #     return ActiveTrainingIO(training_folder)
 
-    def __init__(self, training_folder: str):
+    def __init__(self, training_folder: str, loop_communicator: LoopCommunicator, context: Context) -> None:
         self.training_folder = training_folder
+        self.loop_communicator = loop_communicator
+        self.context = context
+
         self.mup_path = f'{training_folder}/model_uploading_progress.txt'
         # string with placeholder gor index
         self.det_path = f'{training_folder}' + '/detections_{0}.json'
@@ -63,13 +68,16 @@ def load_model_upload_progress(self) -> List[str]:
 
     # detections
 
-    def get_detection_file_names(self) -> List[Path]:
+    def _get_detection_file_names(self) -> List[Path]:
         files = [f for f in Path(self.training_folder).iterdir()
                  if f.is_file() and f.name.startswith('detections_')]
         if not files:
             return []
         return files
 
+    def get_number_of_detection_files(self) -> int:
+        return len(self._get_detection_file_names())
+
     # TODO: saving and uploading multiple files is not tested!
     def save_detections(self, detections: List[Detections], index: int = 0) -> None:
         with open(self.det_path.format(index), 'w') as f:
@@ -81,11 +89,11 @@ def load_detections(self, index: int = 0) -> List[Detections]:
             return [from_dict(data_class=Detections, data=d) for d in dict_list]
 
     def delete_detections(self) -> None:
-        for file in self.get_detection_file_names():
+        for file in self._get_detection_file_names():
             os.remove(Path(self.training_folder) / file)
 
     def detections_exist(self) -> bool:
-        return bool(self.get_detection_file_names())
+        return bool(self._get_detection_file_names())
 
     # detections upload file index
 
@@ -124,3 +132,41 @@ def delete_detection_upload_progress(self) -> None:
 
     def detection_upload_progress_exist(self) -> bool:
         return os.path.exists(self.dup_path)
+
+    async def upload_detetions(self):
+        num_files = self.get_number_of_detection_files()
+        print(f'num_files: {num_files}', flush=True)
+        if not num_files:
+            raise Exception('no detection files found')
+        current_json_file_index = self.load_detections_upload_file_index()
+        for i in range(current_json_file_index, num_files):
+            detections = self.load_detections(i)
+            logging.info(f'uploading detections {i}/{num_files}')
+            await self._upload_detections_batched(self.context, detections)
+            self.save_detections_upload_file_index(i+1)
+
+    async def _upload_detections_batched(self, context: Context, detections: List[Detections]):
+        batch_size = 10
+        skip_detections = self.load_detection_upload_progress()
+        for i in range(skip_detections, len(detections), batch_size):
+            up_progress = i+batch_size
+            batch_detections = detections[i:up_progress]
+            dict_detections = [jsonable_encoder(asdict(detection)) for detection in batch_detections]
+            logging.info(f'uploading detections. File size : {len(json.dumps(dict_detections))}')
+            await self._upload_detections(context, batch_detections, up_progress)
+            skip_detections = up_progress
+
+    async def _upload_detections(self, context: Context, batch_detections: List[Detections], up_progress: int):
+        detections_json = [jsonable_encoder(asdict(detections)) for detections in batch_detections]
+        response = await self.loop_communicator.post(
+            f'/{context.organization}/projects/{context.project}/detections', json=detections_json)
+        if response.status_code != 200:
+            msg = f'could not upload detections. {str(response)}'
+            logging.error(msg)
+            raise Exception(msg)
+        else:
+            logging.info('successfully uploaded detections')
+            if up_progress > len(batch_detections):
+                self.save_detection_upload_progress(0)
+            else:
+                self.save_detection_upload_progress(up_progress)
diff --git a/learning_loop_node/trainer/rest/controls.py b/learning_loop_node/trainer/rest/controls.py
index 17434d64..b8fbbec8 100644
--- a/learning_loop_node/trainer/rest/controls.py
+++ b/learning_loop_node/trainer/rest/controls.py
@@ -22,5 +22,5 @@ async def operation_mode(organization: str, project: str, version: str, request:
     model_id = next(m for m in models if m['version'] == version)['id']
     logging.info(model_id)
     trainer: TrainerLogic = request.app.trainer
-    await trainer.do_detections()
+    await trainer._do_detections()
     return "OK"
diff --git a/learning_loop_node/trainer/tests/state_helper.py b/learning_loop_node/trainer/tests/state_helper.py
index 01c9001d..a5b982ec 100644
--- a/learning_loop_node/trainer/tests/state_helper.py
+++ b/learning_loop_node/trainer/tests/state_helper.py
@@ -7,7 +7,7 @@
 
 def create_active_training_file(trainer: TrainerLogic, **kwargs) -> None:
     update_attributes(trainer._training, **kwargs)  # pylint: disable=protected-access
-    trainer.node.last_training_io.save(training=trainer.training)
+    trainer.node.last_training_io.save(training=trainer.active_training)
 
 
 async def assert_training_state(training: Training, state: str, timeout: float, interval: float) -> None:
diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py
index 5d7583fe..d48279ee 100644
--- a/learning_loop_node/trainer/tests/states/test_state_detecting.py
+++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py
@@ -18,14 +18,17 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi
     create_active_training_file(trainer, training_state='train_model_uploaded',
                                 model_id_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a')
     # trainer.load_active_training()
-    _ = asyncio.get_running_loop().create_task(trainer.do_detections())
+    _ = asyncio.get_running_loop().create_task(
+        trainer.perform_state('do_detections', TrainerState.Detecting,
+                              TrainerState.Detected, trainer._do_detections)
+    )
 
-    await assert_training_state(trainer.training, 'detecting', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, 'detected', timeout=10, interval=0.001)
+    await assert_training_state(trainer.active_training, 'detecting', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, 'detected', timeout=10, interval=0.001)
 
     assert trainer_has_error(trainer) is False
-    assert trainer.training.training_state == TrainerState.Detected
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.Detected
+    assert trainer.node.last_training_io.load() == trainer.active_training
     assert trainer.active_training_io.detections_exist()
 
 
@@ -33,11 +36,11 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded)
     trainer.init_from_last_training()
-    trainer.training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242'
+    trainer.active_training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242'
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'detecting', timeout=5, interval=0.001)
+    await assert_training_state(trainer.active_training, 'detecting', timeout=5, interval=0.001)
     await trainer.stop()
     await asyncio.sleep(0.1)
 
@@ -54,13 +57,14 @@ async def test_model_not_downloadable_error(test_initialized_trainer: TestingTra
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'detecting', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, 'train_model_uploaded', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, 'detecting', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, 'train_model_uploaded', timeout=1, interval=0.001)
+    await asyncio.sleep(0.1)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == TrainerState.TrainModelUploaded
-    assert trainer.training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000'
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.TrainModelUploaded
+    assert trainer.active_training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000'
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 def test_save_load_detections(test_initialized_trainer: TestingTrainerLogic):
diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
index 5785e5fa..12e9b745 100644
--- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
@@ -14,17 +14,20 @@ async def test_downloading_is_successful(test_initialized_trainer: TestingTraine
     trainer.model_format = 'mocked'
     trainer.init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.download_model())
-    await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, 'train_model_downloaded', timeout=1, interval=0.001)
+    asyncio.get_running_loop().create_task(
+        trainer.perform_state('download_model',
+                              TrainerState.TrainModelDownloading,
+                              TrainerState.TrainModelDownloaded, trainer._download_model))
+    await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, 'train_model_downloaded', timeout=1, interval=0.001)
 
-    assert trainer.training.training_state == TrainerState.TrainModelDownloaded
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.TrainModelDownloaded
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
     # file on disk
-    assert os.path.exists(f'{trainer.training.training_folder}/base_model.json')
-    assert os.path.exists(f'{trainer.training.training_folder}/file_1.txt')
-    assert os.path.exists(f'{trainer.training.training_folder}/file_2.txt')
+    assert os.path.exists(f'{trainer.active_training.training_folder}/base_model.json')
+    assert os.path.exists(f'{trainer.active_training.training_folder}/file_1.txt')
+    assert os.path.exists(f'{trainer.active_training.training_folder}/file_2.txt')
 
 
 async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogic):
@@ -33,7 +36,7 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogi
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -49,10 +52,10 @@ async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic)
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, TrainerState.DataDownloaded, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.DataDownloaded, timeout=1, interval=0.001)
 
     assert trainer.errors.has_error_for('download_model')
     assert trainer._training is not None  # pylint: disable=protected-access
-    assert trainer.training.training_state == TrainerState.DataDownloaded
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.DataDownloaded
+    assert trainer.node.last_training_io.load() == trainer.active_training
diff --git a/learning_loop_node/trainer/tests/states/test_state_prepare.py b/learning_loop_node/trainer/tests/states/test_state_prepare.py
index 261fbb70..8c490c92 100644
--- a/learning_loop_node/trainer/tests/states/test_state_prepare.py
+++ b/learning_loop_node/trainer/tests/states/test_state_prepare.py
@@ -17,11 +17,11 @@ async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerL
     create_active_training_file(trainer)
     trainer.init_from_last_training()
 
-    await trainer.prepare()
+    await trainer.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare)
     assert trainer_has_error(trainer) is False
-    assert trainer.training.training_state == TrainerState.DataDownloaded
-    assert trainer.training.data is not None
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.DataDownloaded
+    assert trainer.active_training.data is not None
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_abort_preparing(test_initialized_trainer: TestingTrainerLogic):
@@ -30,7 +30,7 @@ async def test_abort_preparing(test_initialized_trainer: TestingTrainerLogic):
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -46,10 +46,10 @@ async def test_request_error(test_initialized_trainer: TestingTrainerLogic):
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=3, interval=0.001)
-    await assert_training_state(trainer.training, TrainerState.Initialized, timeout=3, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=3, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.Initialized, timeout=3, interval=0.001)
 
     assert trainer_has_error(trainer)
     assert trainer._training is not None  # pylint: disable=protected-access
-    assert trainer.training.training_state == TrainerState.Initialized
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.Initialized
+    assert trainer.node.last_training_io.load() == trainer.active_training
diff --git a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
index 51fec3ff..cc145233 100644
--- a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
+++ b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
@@ -27,10 +27,10 @@ async def test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic):
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
     assert trainer_has_error(trainer) is False
-    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_unsynced_model_available__sync_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture):
@@ -44,11 +44,11 @@ async def test_unsynced_model_available__sync_successful(test_initialized_traine
     trainer.has_new_model = True
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer) is False
 #    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_unsynced_model_available__sio_not_connected(test_initialized_trainer_node: TrainerNode):
@@ -62,12 +62,12 @@ async def test_unsynced_model_available__sio_not_connected(test_initialized_trai
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_unsynced_model_available__request_is_not_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture):
@@ -81,12 +81,12 @@ async def test_unsynced_model_available__request_is_not_successful(test_initiali
     trainer.has_new_model = True
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_basic_mock(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture):
diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py
index 9d6b31f4..46a7f953 100644
--- a/learning_loop_node/trainer/tests/states/test_state_train.py
+++ b/learning_loop_node/trainer/tests/states/test_state_train.py
@@ -14,17 +14,18 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'start_training'
 
     # pylint: disable=protected-access
     assert trainer._executor is not None
     trainer._executor.stop()  # NOTE normally a training terminates itself
-    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_stop_running_training(test_initialized_trainer: TestingTrainerLogic):
@@ -36,15 +37,15 @@ async def test_stop_running_training(test_initialized_trainer: TestingTrainerLog
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
     await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)  # pylint: disable=protected-access
-    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'start_training'
 
     await trainer.stop()
-    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrainerLogic):
@@ -58,14 +59,14 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
     await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)  # pylint: disable=protected-access
-    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'resume'
 
     # pylint: disable=protected-access
     assert trainer._executor is not None
     trainer._executor.stop()  # NOTE normally a training terminates itself e.g
-    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.active_training
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
index a6f69c56..757cf968 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
@@ -1,11 +1,10 @@
-from learning_loop_node.data_classes import TrainerState
 import asyncio
 
 import pytest
 from dacite import from_dict
 
 from learning_loop_node.conftest import get_dummy_detections
-from learning_loop_node.data_classes import BoxDetection, Context, Detections
+from learning_loop_node.data_classes import BoxDetection, Context, Detections, TrainerState
 from learning_loop_node.loop_communication import LoopCommunicator
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
@@ -48,10 +47,11 @@ async def test_upload_successful(test_initialized_trainer: TestingTrainerLogic):
     trainer.init_from_last_training()
 
     await create_valid_detection_file(trainer)
-    await trainer.upload_detections()
+    await asyncio.get_running_loop().create_task(
+        trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions))
 
-    assert trainer.training.training_state == TrainerState.ReadyForCleanup
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.ReadyForCleanup
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 @pytest.mark.asyncio
@@ -64,7 +64,10 @@ async def test_detection_upload_progress_is_stored(test_initialized_trainer: Tes
     await create_valid_detection_file(trainer)
 
     assert trainer.active_training_io.load_detections_upload_file_index() == 0
-    await trainer.upload_detections()
+    # await trainer.upload_detections()
+    await asyncio.get_running_loop().create_task(
+        trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions))
+
     assert trainer.active_training_io.load_detection_upload_progress() == 0  # Progress is reset for every file
     assert trainer.active_training_io.load_detections_upload_file_index() == 1
 
@@ -88,7 +91,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test
     for i in range(skip_detections, len(detections), batch_size):
         batch_detections = detections[i:i+batch_size]
         # pylint: disable=protected-access
-        await trainer._upload_detections(trainer.training.context, batch_detections, i + batch_size)
+        await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size)
 
         expected_value = i + batch_size if i + batch_size < len(detections) else 0  # Progress is reset for every file
         assert trainer.active_training_io.load_detection_upload_progress() == expected_value
@@ -104,7 +107,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test
     for i in range(skip_detections, len(detections), batch_size):
         batch_detections = detections[i:i+batch_size]
         # pylint: disable=protected-access
-        await trainer._upload_detections(trainer.training.context, batch_detections, i + batch_size)
+        await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size)
 
         expected_value = i + batch_size if i + batch_size < len(detections) else 0  # Progress is reset for every file
         assert trainer.active_training_io.load_detection_upload_progress() == expected_value
@@ -121,12 +124,12 @@ async def test_bad_status_from_LearningLoop(test_initialized_trainer: TestingTra
     trainer.active_training_io.save_detections([get_dummy_detections()])
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == TrainerState.Detected
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.Detected
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_other_errors(test_initialized_trainer: TestingTrainerLogic):
@@ -137,12 +140,12 @@ async def test_other_errors(test_initialized_trainer: TestingTrainerLogic):
     trainer.init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == TrainerState.Detected
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.Detected
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic):
@@ -154,7 +157,7 @@ async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic):
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
index efc40010..21727b27 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
@@ -23,13 +23,13 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer
 
     train_task = asyncio.get_running_loop().create_task(trainer.upload_model())
 
-    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
     await train_task
 
     assert trainer_has_error(trainer) is False
-    assert trainer.training.training_state == TrainerState.TrainModelUploaded
-    assert trainer.training.model_id_for_detecting is not None
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.TrainModelUploaded
+    assert trainer.active_training.model_id_for_detecting is not None
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_abort_upload_model(test_initialized_trainer: TestingTrainerLogic):
@@ -40,7 +40,7 @@ async def test_abort_upload_model(test_initialized_trainer: TestingTrainerLogic)
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -60,14 +60,14 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
     # TODO goes to finished because of the error
-    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
-    assert trainer.training.model_id_for_detecting is None
-    assert trainer.node.last_training_io.load() == trainer.training
+    assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced
+    assert trainer.active_training.model_id_for_detecting is None
+    assert trainer.node.last_training_io.load() == trainer.active_training
 
 
 async def test_mock_loop_response_example(mocker: MockerFixture, test_initialized_trainer: TestingTrainerLogic):
@@ -79,7 +79,7 @@ async def test_mock_loop_response_example(mocker: MockerFixture, test_initialize
     trainer.init_from_last_training()
 
     # pylint: disable=protected-access
-    result = await trainer._upload_model_return_new_id(Context(organization='zauberzeug', project='demo'))
+    result = await trainer._upload_model_return_new_model_uuid(Context(organization='zauberzeug', project='demo'))
     assert result is not None
 
 
diff --git a/learning_loop_node/trainer/tests/test_errors.py b/learning_loop_node/trainer/tests/test_errors.py
index bb6b3d8a..1ba85572 100644
--- a/learning_loop_node/trainer/tests/test_errors.py
+++ b/learning_loop_node/trainer/tests/test_errors.py
@@ -1,35 +1,38 @@
 import asyncio
 import re
 
+from learning_loop_node.data_classes import TrainerState
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 
 
 async def test_training_process_is_stopped_when_trainer_reports_error(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
-    create_active_training_file(trainer, training_state='train_model_downloaded')
+    create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
     trainer.init_from_last_training()
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     trainer.error_msg = 'some_error'
-    await assert_training_state(trainer.training, 'train_model_downloaded', timeout=6, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
 
 
 async def test_log_can_provide_only_data_for_current_run(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
-    create_active_training_file(trainer, training_state='train_model_downloaded')
+    create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
     trainer.init_from_last_training()
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
+
     assert trainer._executor is not None
     assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) == 1
 
     trainer.error_msg = 'some_error'
-    await assert_training_state(trainer.training, 'train_model_downloaded', timeout=6, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
     trainer.error_msg = None
-    await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     await asyncio.sleep(1)
 
     assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) > 1
diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index 08589657..d6e9b78a 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -54,19 +54,14 @@ async def _download_model(self) -> None:
         await super()._download_model()
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
 
-    async def ensure_confusion_matrix_synced(self):
-        await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
-        await super().ensure_confusion_matrix_synced()
-        await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
-
     async def upload_model(self) -> None:
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
         await super().upload_model()
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
 
-    async def _upload_model_return_new_id(self, context: Context) -> Optional[str]:
+    async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
-        result = await super()._upload_model_return_new_id(context)
+        result = await super()._upload_model_return_new_model_uuid(context)
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
         assert isinstance(result, str)
         return result
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index b08f1c87..4bfdb743 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -4,7 +4,6 @@
 import os
 import shutil
 import sys
-import time
 from abc import abstractmethod
 from dataclasses import asdict
 from datetime import datetime
@@ -18,10 +17,9 @@
 from fastapi.encoders import jsonable_encoder
 from tqdm import tqdm
 
-from ..data_classes import (BasicModel, Category, Context, Detections, Hyperparameter, ModelInformation,
-                            PretrainedModel, TrainerState, Training, TrainingData, TrainingError)
+from ..data_classes import (BasicModel, Category, Context, Detections, Hyperparameter, ModelInformation, TrainerState,
+                            Training, TrainingData, TrainingError, TrainingOut)
 from ..helpers.misc import create_image_folder, create_project_folder, generate_training, is_valid_uuid4
-from . import training_syncronizer
 from .downloader import TrainingsDownloader
 from .executor import Executor
 from .io_helpers import ActiveTrainingIO
@@ -31,98 +29,54 @@
 class TrainerLogic(TrainerLogicAbstraction):
 
     def __init__(self, model_format: str) -> None:
-        super().__init__()
+        super().__init__(model_format)
         self.model_format: str = model_format
+        # NOTE: String to be used in the file path for the model on the server:
+        # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
+
         self._executor: Optional[Executor] = None
-        self.start_time: Optional[float] = None
         self.training_task: Optional[asyncio.Task] = None
         self.start_training_task: Optional[Coroutine] = None
         self.shutdown_event: asyncio.Event = asyncio.Event()
         self.detection_progress = 0.0
 
-        self._training: Optional[Training] = None
-        self._active_training_io: Optional[ActiveTrainingIO] = None
-        self.restart_after_training = os.environ.get('RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1']
-        self.keep_old_trainings = os.environ.get('KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1']
-        self.inference_batch_size = int(os.environ.get('INFERENCE_BATCH_SIZE', '10'))
-        logging.info(f'INFERENCE_BATCH_SIZE: {self.inference_batch_size}')
-
-    @property
-    def training_uptime(self) -> Union[float, None]:
-        return time.time() - self.start_time if self.start_time else None
-
     @property
     def executor(self) -> Executor:
         assert self._executor is not None, 'executor must be set, call `run_training` first'
         return self._executor
 
-    @property
-    def training(self) -> Training:
-        assert self._training is not None, 'training must be set, call `init` first'
-        return self._training
-
-    @property
-    def active_training_io(self) -> ActiveTrainingIO:
-        assert self._active_training_io is not None, 'active_training_io must be set, call `init` first'
-        return self._active_training_io
-
-    @property
-    def training_active(self) -> bool:
-        """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'"""
-        return self._training is not None and self._active_training_io is not None and self._node is not None
-
-    @property
-    def state(self) -> TrainerState:
-        if (not self.training_active) or (self.training.training_state is None):
-            return TrainerState.Idle
-        else:
-            return self.training.training_state
-
-    @property
-    def training_data(self) -> TrainingData | None:
-        if self.training_active and self.training.data:
-            return self.training.data
-        return None
-
-    @property
-    def training_context(self) -> Context | None:
-        if self.training_active:
-            return self.training.context
-        return None
-
     def init_new_training(self, context: Context, details: Dict) -> None:
         """Called on `begin_training` event from the Learning Loop.
         Note that details needs the entries 'categories' and 'training_number'"""
 
-        try:
-            project_folder = create_project_folder(context)
-            if not self.keep_old_trainings:
-                # NOTE: We delete all existing training folders because they are not needed anymore.
-                TrainerLogic.delete_all_training_folders(project_folder)
-            self._training = generate_training(project_folder, context)
-            self._training.data = TrainingData(categories=Category.from_list(details['categories']))
-            self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details)
-            self._training.training_number = details['training_number']
-            self._training.base_model_id = details['id']
-            self._training.training_state = TrainerState.Initialized
-            self._active_training_io = ActiveTrainingIO(self._training.training_folder)
-            logging.info(f'init training: {self._training}')
-        except Exception:
-            logging.exception('Error in init')
-
-    def init_from_last_training(self) -> None:
-        self._training = self.node.last_training_io.load()
-        assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder'
-        self._active_training_io = ActiveTrainingIO(self._training.training_folder)
-
-    async def continue_run_if_incomplete(self) -> bool:
-        if not self.training_active and self.node.last_training_io.exists():
+        project_folder = create_project_folder(context)
+        if not self.keep_old_trainings:
+            # NOTE: We delete all existing training folders because they are not needed anymore.
+            TrainerLogic.delete_all_training_folders(project_folder)
+        self._training = generate_training(project_folder, context)
+        self._training.data = TrainingData(categories=Category.from_list(details['categories']))
+        self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details)
+        self._training.training_number = details['training_number']
+        self._training.base_model_id = details['id']
+        self._training.training_state = TrainerState.Initialized
+        self._active_training_io = ActiveTrainingIO(
+            self._training.training_folder, self.loop_communicator, context)
+        logging.info(f'training initialized: {self._training}')
+
+    async def try_continue_run_if_incomplete(self) -> bool:
+        if not self.training_active and self.last_training_io.exists():
             logging.info('found incomplete training, continuing now.')
             self.init_from_last_training()
             asyncio.get_event_loop().create_task(self.run())
             return True
         return False
 
+    def init_from_last_training(self) -> None:
+        self._training = self.last_training_io.load()
+        assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder'
+        self._active_training_io = ActiveTrainingIO(
+            self._training.training_folder, self.loop_communicator, self._training.context)
+
     async def begin_training(self, organization: str, project: str, details: Dict) -> None:
         self.init_new_training(Context(organization=organization, project=project), details)
         asyncio.get_event_loop().create_task(self.run())
@@ -130,7 +84,6 @@ async def begin_training(self, organization: str, project: str, details: Dict) -
     async def run(self) -> None:
         """Called on `begin_training` event from the Learning Loop."""
 
-        self.start_time = time.time()
         self.errors.reset_all()
         try:
             self.training_task = asyncio.get_running_loop().create_task(self._run_training_loop())
@@ -138,14 +91,12 @@ async def run(self) -> None:
         except asyncio.CancelledError:
             if not self.shutdown_event.is_set():
                 logging.info('training task was cancelled but not by shutdown event')
-                self.training.training_state = TrainerState.ReadyForCleanup
-                self.node.last_training_io.save(self.training)
+                self.active_training.training_state = TrainerState.ReadyForCleanup
+                self.last_training_io.save(self.active_training)
                 await self.clear_training()
 
         except Exception as e:
             logging.exception(f'Error in train: {e}')
-        finally:
-            self.start_time = None
 
     # ---------------------------------------- TRAINING STATES ----------------------------------------
 
@@ -157,94 +108,55 @@ async def _run_training_loop(self) -> None:
             return
 
         while self._training is not None:
-            tstate = self.training.training_state
-            logging.info(f'STATE LOOP: {tstate}')
+            tstate = self.active_training.training_state
+            logging.info(f'STATE LOOP: {tstate}, errors: {self.errors.errors}')
             await asyncio.sleep(0.6)  # Note: Required for pytests!
             if tstate == TrainerState.Initialized:  # -> DataDownloading -> DataDownloaded
-                await self.prepare()
+                await self.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare)
             elif tstate == TrainerState.DataDownloaded:  # -> TrainModelDownloading -> TrainModelDownloaded
-                await self.download_model()
+                await self.perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model)
             elif tstate == TrainerState.TrainModelDownloaded:  # -> TrainingRunning -> TrainingFinished
-                await self.train()
+                await self.perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train, reset_early=True)  # do not show a stale error during the whole training
             elif tstate == TrainerState.TrainingFinished:  # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced
-                await self.ensure_confusion_matrix_synced()
+                await self.perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self.sync_confusion_matrix)
+                # await self.ensure_confusion_matrix_synced()
             elif tstate == TrainerState.ConfusionMatrixSynced:  # -> TrainModelUploading -> TrainModelUploaded
                 await self.upload_model()
             elif tstate == TrainerState.TrainModelUploaded:  # -> Detecting -> Detected
-                await self.do_detections()
+                await self.perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections)
             elif tstate == TrainerState.Detected:  # -> DetectionUploading -> ReadyForCleanup
-                await self.upload_detections()
+                await self.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions)
             elif tstate == TrainerState.ReadyForCleanup:  # -> RESTART or TrainingFinished
                 await self.clear_training()
                 self.may_restart()
 
-    async def prepare(self) -> None:
-        previous_state = self.training.training_state
-        self.training.training_state = TrainerState.DataDownloading
-        error_key = 'prepare'
-        try:
-            await self._prepare()
-        except asyncio.CancelledError:
-            logging.warning('CancelledError in prepare')
-            raise
-        except Exception as e:
-            logging.exception("Unknown error in 'prepare'. Exception:")
-            self.training.training_state = previous_state
-            self.errors.set(error_key, str(e))
-        else:
-            self.errors.reset(error_key)
-            self.training.training_state = TrainerState.DataDownloaded
-            self.node.last_training_io.save(self.training)
-
     async def _prepare(self) -> None:
-        self.node.data_exchanger.set_context(self.training.context)
-        downloader = TrainingsDownloader(self.node.data_exchanger)
-        image_data, skipped_image_count = await downloader.download_training_data(self.training.images_folder)
-        assert self.training.data is not None, 'training.data must be set'
-        self.training.data.image_data = image_data
-        self.training.data.skipped_image_count = skipped_image_count
-
-    async def download_model(self) -> None:
-        logging.info('Downloading model')
-        previous_state = self.training.training_state
-        self.training.training_state = TrainerState.TrainModelDownloading
-        error_key = 'download_model'
-        try:
-            await self._download_model()
-        except asyncio.CancelledError:
-            logging.warning('CancelledError in download_model')
-            raise
-        except Exception as e:
-            logging.exception('download_model failed')
-            self.training.training_state = previous_state
-            self.errors.set(error_key, str(e))
-        else:
-            self.errors.reset(error_key)
-            logging.info('download_model_task finished')
-            self.training.training_state = TrainerState.TrainModelDownloaded
-            self.node.last_training_io.save(self.training)
+        self.data_exchanger.set_context(self.active_training.context)
+        downloader = TrainingsDownloader(self.data_exchanger)
+        image_data, skipped_image_count = await downloader.download_training_data(self.active_training.images_folder)
+        assert self.active_training.data is not None, 'training.data must be set'
+        self.active_training.data.image_data = image_data
+        self.active_training.data.skipped_image_count = skipped_image_count
 
     async def _download_model(self) -> None:
-        model_id = self.training.base_model_id
+        model_id = self.active_training.base_model_id
         assert model_id is not None, 'model_id must be set'
         if is_valid_uuid4(
-                self.training.base_model_id):  # TODO this checks if we continue a training -> make more explicit
+                self.active_training.base_model_id):  # TODO this checks if we continue a training -> make more explicit
             logging.info('loading model from Learning Loop')
             logging.info(f'downloading model {model_id} as {self.model_format}')
-            await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, model_id, self.model_format)
-            shutil.move(f'{self.training.training_folder}/model.json',
-                        f'{self.training.training_folder}/base_model.json')
+            await self.data_exchanger.download_model(self.active_training.training_folder, self.active_training.context, model_id, self.model_format)
+            shutil.move(f'{self.active_training.training_folder}/model.json',
+                        f'{self.active_training.training_folder}/base_model.json')
         else:
             logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download')
 
-    async def train(self) -> None:
-        logging.info('Running training')
+    async def _train(self) -> None:
+        previous_state = TrainerState.TrainModelDownloaded
         error_key = 'run_training'
-        # NOTE normally we reset errors after the step was successful. We do not want to display an old error during the whole training.
-        self.errors.reset(error_key)
-        previous_state = self.training.training_state
-        self._executor = Executor(self.training.training_folder)
-        self.training.training_state = TrainerState.TrainingRunning
+        self._executor = Executor(self.active_training.training_folder)
+        self.active_training.training_state = TrainerState.TrainingRunning
+
         try:
             await self._start_training()
 
@@ -276,28 +188,19 @@ async def train(self) -> None:
             #     self.errors.set(error_key, f'Executor return code was {self.executor.return_code}')
             #     raise TrainingError(cause=f'Executor return code was {self.executor.return_code}')
 
-        except asyncio.CancelledError:
-            logging.warning('CancelledError in run_training')
-            raise
         except TrainingError:
             logging.exception('Error in TrainingProcess')
             if self.executor.is_process_running():
                 self.executor.stop()
-            self.training.training_state = previous_state
-        except Exception as e:
-            self.errors.set(error_key, f'Could not start training {str(e)}')
-            self.training.training_state = previous_state
-            logging.exception('Error in run_training')
-        else:
-            self.training.training_state = TrainerState.TrainingFinished
-            self.node.last_training_io.save(self.training)
+            self.active_training.training_state = previous_state
+            raise
 
     async def _start_training(self):
         self.start_training_task = None  # NOTE: this is used i.e. by tests
         if self.can_resume():
             self.start_training_task = self.resume()
         else:
-            base_model_id = self.training.base_model_id
+            base_model_id = self.active_training.base_model_id
             if not is_valid_uuid4(base_model_id):  # TODO this check was done earlier!
                 assert isinstance(base_model_id, str)
                 # TODO this could be removed here and accessed via self.training.base_model_id
@@ -306,27 +209,32 @@ async def _start_training(self):
                 self.start_training_task = self.start_training()
         await self.start_training_task
 
-    async def ensure_confusion_matrix_synced(self):
-        logging.info('Ensure syncing confusion matrix')
-        previous_state = self.training.training_state
-        self.training.training_state = TrainerState.ConfusionMatrixSyncing
-        try:
-            await self.sync_confusion_matrix()
-        except asyncio.CancelledError:
-            logging.warning('CancelledError in run_training')
-            raise
-        except Exception:
-            logging.exception('Error in ensure_confusion_matrix_synced')
-            self.training.training_state = previous_state
-        else:
-            self.training.training_state = TrainerState.ConfusionMatrixSynced
-            self.node.last_training_io.save(self.training)
-
     async def sync_confusion_matrix(self):
         logging.info('Syncing confusion matrix')
         error_key = 'sync_confusion_matrix'
         try:
-            await training_syncronizer.try_sync_model(self, self.node.uuid, self.node.sio_client)
+            try:
+                model = self.get_new_model()
+            except Exception as exc:
+                logging.exception('error while getting new model')
+                raise Exception(f'Could not get new model: {str(exc)}') from exc
+            if model and self.active_training.data:
+                new_training = TrainingOut(
+                    trainer_id=self.node_uuid,
+                    confusion_matrix=model.confusion_matrix,
+                    train_image_count=self.active_training.data.train_image_count(),
+                    test_image_count=self.active_training.data.test_image_count(),
+                    hyperparameters=self.hyperparameters)
+
+                await asyncio.sleep(0.1)  # NOTE needed for tests.
+                result = await self.sio_client.call('update_training', (self.active_training.context.organization, self.active_training.context.project, jsonable_encoder(new_training)))
+                if isinstance(result,  dict) and result['success']:
+                    logging.info(f'successfully updated training {asdict(new_training)}')
+                    self.on_model_published(model)
+                else:
+                    error_msg = f'Error for update_training: Response from loop was : {result}'
+                    logging.error(error_msg)
+                    raise Exception(error_msg)
         except socketio.exceptions.BadNamespaceError as e:  # type: ignore
             logging.error('Error during confusion matrix syncronization. BadNamespaceError')
             self.errors.set(error_key, str(e))
@@ -340,86 +248,72 @@ async def sync_confusion_matrix(self):
 
     async def upload_model(self) -> None:
         error_key = 'upload_model'
-        previous_state = self.training.training_state
-        self.training.training_state = TrainerState.TrainModelUploading
+        previous_state = self.active_training.training_state
+        self.active_training.training_state = TrainerState.TrainModelUploading
         try:
-            new_model_id = await self._upload_model_return_new_id(self.training.context)
+            new_model_id = await self._upload_model_return_new_model_uuid(self.active_training.context)
             if new_model_id is None:
-                self.training.training_state = TrainerState.ReadyForCleanup
+                self.active_training.training_state = TrainerState.ReadyForCleanup
                 logging.error('could not upload model - maybe training failed.. cleaning up')
                 return
             assert new_model_id is not None, 'uploaded_model must be set'
             logging.info(f'successfully uploaded model and received new model id: {new_model_id}')
-            self.training.model_id_for_detecting = new_model_id
+            self.active_training.model_id_for_detecting = new_model_id
         except asyncio.CancelledError:
             logging.warning('CancelledError in upload_model')
             raise
         except Exception as e:
             logging.exception('Error in upload_model. Exception:')
             self.errors.set(error_key, str(e))
-            self.training.training_state = previous_state  # TODO... going back is pointless here as it ends in a deadlock ?!
+            self.active_training.training_state = previous_state  # TODO... going back is pointless here as it ends in a deadlock ?!
             # self.training.training_state = TrainingState.ReadyForCleanup
         else:
             self.errors.reset(error_key)
-            self.training.training_state = TrainerState.TrainModelUploaded
-            self.node.last_training_io.save(self.training)
+            self.active_training.training_state = TrainerState.TrainModelUploaded
+            self.last_training_io.save(self.active_training)
 
-    async def _upload_model_return_new_id(self, context: Context) -> Optional[str]:
+    async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
         """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file.
         Note that with the latest trainers the conversion to (.wts) is done by the trainer.
         The conversion from .wts to .engine is done by the detector (needs to be done on target hardware).
         Note that trainer may train with different classes, which is why we send an initial model.json file.
         """
         files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files)
-
         if files is None:
             return None
 
         if isinstance(files, List):
             files = {self.model_format: files}
-        assert isinstance(files, Dict), f'can only save model as list or dict, but was {files}'
+        assert isinstance(files, Dict), f'can only upload model as list or dict, but was {files}'
 
-        model_json_path = self.create_model_json_with_categories()
         already_uploaded_formats = self.active_training_io.load_model_upload_progress()
 
-        new_id = None
+        new_model_uuid = None
         for file_format in files:
             if file_format in already_uploaded_formats:
                 continue
             _files = files[file_format]
-            # model.json was mandatory in previous versions. Now its forbidden to provide an own model.json file.
             assert not any(f for f in _files if 'model.json' in f), "Upload 'model.json' not allowed (added automatically)."
-            _files.append(model_json_path)
-            new_id = await self.node.data_exchanger.upload_model_get_uuid(context, _files, self.training.training_number, file_format)
-            if new_id is None:
+            _files.append(self.dump_categories_to_json())
+            new_model_uuid = await self.data_exchanger.upload_model_get_uuid(context, _files, self.active_training.training_number, file_format)
+            if new_model_uuid is None:
                 return None
 
             already_uploaded_formats.append(file_format)
             self.active_training_io.save_model_upload_progress(already_uploaded_formats)
 
-        return new_id
+        return new_model_uuid
 
-    async def do_detections(self):
-        error_key = 'detecting'
-        previous_state = self.training.training_state
-        try:
-            self.training.training_state = TrainerState.Detecting
-            await self._do_detections()
-        except asyncio.CancelledError:
-            logging.warning('CancelledError in do_detections')
-            raise
-        except Exception as e:
-            self.errors.set(error_key, str(e))
-            logging.exception('Error in do_detections - Exception:')
-            self.training.training_state = previous_state
-        else:
-            self.errors.reset(error_key)
-            self.training.training_state = TrainerState.Detected
-            self.node.last_training_io.save(self.training)
+    def dump_categories_to_json(self) -> str:  # writes the training categories to a model.json file and returns its path
+        content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None
+        json_path = '/tmp/model.json'  # NOTE(review): fixed path; concurrent trainers on one host would presumably overwrite each other - confirm
+        with open(json_path, 'w') as f:
+            json.dump(content, f)  # `content` is None (JSON null) when no training data is available
+        return json_path
 
     async def _do_detections(self) -> None:
-        context = self.training.context
-        model_id = self.training.model_id_for_detecting
+        context = self.active_training.context
+        model_id = self.active_training.model_id_for_detecting
         assert model_id, 'model_id must be set'
         tmp_folder = f'/tmp/model_for_auto_detections_{model_id}_{self.model_format}'
 
@@ -427,22 +321,22 @@ async def _do_detections(self) -> None:
         os.makedirs(tmp_folder)
         logging.info(f'downloading detection model to {tmp_folder}')
 
-        await self.node.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format)
+        await self.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format)
         with open(f'{tmp_folder}/model.json', 'r') as f:
             content = json.load(f)
             model_information = from_dict(data_class=ModelInformation, data=content)
 
         project_folder = create_project_folder(context)
         image_folder = create_image_folder(project_folder)
-        self.node.data_exchanger.set_context(context)
+        self.data_exchanger.set_context(context)
         image_ids = []
         for state, p in zip(['inbox', 'annotate', 'review', 'complete'], [0.1, 0.2, 0.3, 0.4]):
             self.detection_progress = p
             logging.info(f'fetching image ids of {state}')
-            new_ids = await self.node.data_exchanger.fetch_image_uuids(query_params=f'state={state}')
+            new_ids = await self.data_exchanger.fetch_image_uuids(query_params=f'state={state}')
             image_ids += new_ids
             logging.info(f'downloading {len(new_ids)} images')
-            await self.node.data_exchanger.download_images(new_ids, image_folder)
+            await self.data_exchanger.download_images(new_ids, image_folder)
         self.detection_progress = 0.42
         # await delete_corrupt_images(image_folder)
 
@@ -462,70 +356,15 @@ async def _do_detections(self) -> None:
 
         return None
 
-    async def upload_detections(self):
-        error_key = 'upload_detections'
-        previous_state = self.training.training_state
-        self.training.training_state = TrainerState.DetectionUploading
-        await asyncio.sleep(0.1)  # NOTE needed for tests
-        try:
-            json_files = self.active_training_io.get_detection_file_names()
-            if not json_files:
-                raise Exception()
-            current_json_file_index = self.active_training_io.load_detections_upload_file_index()
-            for i in range(current_json_file_index, len(json_files)):
-                detections = self.active_training_io.load_detections(i)
-                logging.info(f'uploading detections {i}/{len(json_files)}')
-                await self._upload_detections_batched(self.training.context, detections)
-                self.active_training_io.save_detections_upload_file_index(i+1)
-        except asyncio.CancelledError:
-            logging.warning('CancelledError in upload_detections')
-            raise
-        except Exception as e:
-            self.errors.set(error_key, str(e))
-            logging.exception('Error in upload_detections')
-            self.training.training_state = previous_state
-        else:
-            self.errors.reset(error_key)
-            self.training.training_state = TrainerState.ReadyForCleanup
-            self.node.last_training_io.save(self.training)
-
-    async def _upload_detections_batched(self, context: Context, detections: List[Detections]):
-        batch_size = 10
-        skip_detections = self.active_training_io.load_detection_upload_progress()
-        for i in tqdm(range(skip_detections, len(detections), batch_size), position=0, leave=True):
-            up_progress = i+batch_size
-            batch_detections = detections[i:up_progress]
-            dict_detections = [jsonable_encoder(asdict(detection)) for detection in batch_detections]
-            logging.info(f'uploading detections. File size : {len(json.dumps(dict_detections))}')
-            await self._upload_detections(context, batch_detections, up_progress)
-            skip_detections = up_progress
-
-    async def _upload_detections(self, context: Context, batch_detections: List[Detections], up_progress: int):
-        assert self._active_training_io is not None, 'active_training must be set'
-
-        detections_json = [jsonable_encoder(asdict(detections)) for detections in batch_detections]
-        response = await self.node.loop_communicator.post(
-            f'/{context.organization}/projects/{context.project}/detections', json=detections_json)
-        if response.status_code != 200:
-            msg = f'could not upload detections. {str(response)}'
-            logging.error(msg)
-            raise Exception(msg)
-        else:
-            logging.info('successfully uploaded detections')
-            if up_progress > len(batch_detections):
-                self._active_training_io.save_detection_upload_progress(0)
-            else:
-                self._active_training_io.save_detection_upload_progress(up_progress)
-
     async def clear_training(self):
         self.active_training_io.delete_detections()
         self.active_training_io.delete_detection_upload_progress()
         self.active_training_io.delete_detections_upload_file_index()
-        await self.clear_training_data(self.training.training_folder)
-        self.node.last_training_io.delete()
+        await self.clear_training_data(self.active_training.training_folder)
+        self.last_training_io.delete()
         # self.training.training_state = TrainingState.TrainingFinished
-        assert self._node is not None
-        await self._node.send_status()  # make sure the status is updated before we stop the training
+
+        await self.node.send_status()
         self._training = None
 
     async def stop(self) -> None:
@@ -565,9 +404,9 @@ def general_progress(self) -> Optional[float]:
         if not self.training_active:
             return None
 
-        t_state = self.training.training_state
+        t_state = self.active_training.training_state
         if t_state == TrainerState.DataDownloading:
-            return self.node.data_exchanger.progress
+            return self.data_exchanger.progress
         if t_state == TrainerState.TrainingRunning:
             return self.training_progress
         if t_state == TrainerState.Detecting:
@@ -582,16 +421,6 @@ def training_progress(self) -> Optional[float]:
         """Represents the training progress."""
         raise NotImplementedError
 
-    @property
-    @abstractmethod
-    def provided_pretrained_models(self) -> List[PretrainedModel]:
-        raise NotImplementedError
-
-    @property
-    @abstractmethod
-    def model_architecture(self) -> Optional[str]:
-        raise NotImplementedError
-
     @abstractmethod
     async def start_training(self) -> None:
         '''Should be used to start a training.'''
@@ -698,18 +527,3 @@ def hyperparameters(self) -> Optional[Dict]:
             information['flipUd'] = self._training.data.hyperparameter.flip_ud
             return information
         return None
-
-    def create_model_json_with_categories(self) -> str:
-        """Remaining fields are filled by the Learning Loop"""
-        if self._training and self._training.data:
-            content = {
-                'categories': [asdict(c) for c in self._training.data.categories],
-            }
-        else:
-            content = None
-
-        model_json_path = '/tmp/model.json'
-        with open(model_json_path, 'w') as f:
-            json.dump(content, f)
-
-        return model_json_path
diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py
index a4d1f39b..b7f6b006 100644
--- a/learning_loop_node/trainer/trainer_logic_abstraction.py
+++ b/learning_loop_node/trainer/trainer_logic_abstraction.py
@@ -1,7 +1,16 @@
+import asyncio
+import logging
+import os
+import time
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Callable, Coroutine, List, Optional
 
-from ..data_classes import Context, Errors, PretrainedModel, TrainerState, TrainingData
+from socketio import AsyncClient
+
+from ..data_classes import Context, Errors, PretrainedModel, TrainerState, Training, TrainingData
+from ..data_exchanger import DataExchanger
+from ..loop_communication import LoopCommunicator
+from .io_helpers import ActiveTrainingIO, LastTrainingIO
 
 if TYPE_CHECKING:
     from .trainer_node import TrainerNode
@@ -9,24 +18,88 @@
 
 class TrainerLogicAbstraction(ABC):
 
-    def __init__(self):
+    def __init__(self, model_format: str):
+
+        # NOTE: String to be used in the file path for the model on the server:
+        # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
+        self.model_format: str = model_format
+
         self._node: Optional['TrainerNode'] = None  # type: ignore
+        self._last_training_io: Optional[LastTrainingIO] = None  # type: ignore
         self.errors = Errors()
 
+        self._training: Optional[Training] = None
+        self._active_training_io: Optional[ActiveTrainingIO] = None
+
+        self.restart_after_training = os.environ.get('RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1']
+        self.keep_old_trainings = os.environ.get('KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1']
+        self.inference_batch_size = int(os.environ.get('INFERENCE_BATCH_SIZE', '10'))
+
     @property
     def node(self) -> 'TrainerNode':
-        assert self._node is not None, 'node should be set by TrainerNodes before initialization'
+        assert self._node is not None, 'node should be set by TrainerNode before initialization'
         return self._node
 
     @property
-    @abstractmethod
-    def state(self) -> TrainerState:
-        """Returns the current state of the training logic"""
+    def last_training_io(self) -> LastTrainingIO:
+        assert self._last_training_io is not None, 'last_training_io should be set by TrainerNode before initialization'
+        return self._last_training_io
 
     @property
-    @abstractmethod
-    def training_uptime(self) -> float | None:
-        """Returns the time in seconds since the training started or None if idle"""
+    def data_exchanger(self) -> DataExchanger:
+        return self.node.data_exchanger
+
+    @property
+    def loop_communicator(self) -> LoopCommunicator:
+        return self.node.loop_communicator
+
+    @property
+    def node_uuid(self) -> str:
+        return self.node.uuid
+
+    @property
+    def sio_client(self) -> AsyncClient:
+        return self.node.sio_client
+
+    @property
+    def active_training_io(self) -> ActiveTrainingIO:
+        assert self._active_training_io is not None, 'active_training_io must be set, call `init` first'
+        return self._active_training_io
+
+    @property
+    def training_active(self) -> bool:
+        """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'"""
+        return self._training is not None and self._active_training_io is not None
+
+    @property
+    def state(self) -> str:
+        """Current trainer state as a string; `TrainerState.Idle.value` when no training is active or its state is unset."""
+        if (not self.training_active) or (self.active_training.training_state is None):
+            return TrainerState.Idle.value
+        return self.active_training.training_state
+
+    @property
+    def active_training(self) -> Training:
+        assert self._training is not None, 'training must be initialized, call `init` first'
+        return self._training
+
+    @property
+    def training_uptime(self) -> Optional[float]:
+        """Seconds since the active training started, or None when no training is active."""
+        # NOTE: test `training_active`; the `active_training` property asserts and would raise while idle.
+        return time.time() - self.active_training.start_time if self.training_active else None
+
+    @property
+    def training_data(self) -> TrainingData | None:
+        if self.training_active and self.active_training.data:
+            return self.active_training.data
+        return None
+
+    @property
+    def training_context(self) -> Context | None:
+        if self.training_active:
+            return self.active_training.context
+        return None
 
     @property
     @abstractmethod
@@ -40,23 +113,13 @@ def provided_pretrained_models(self) -> List[PretrainedModel]:
 
     @property
     @abstractmethod
-    def model_architecture(self) -> str:
-        """Returns the architecture name of the model"""
+    def model_architecture(self) -> Optional[str]:
+        """Returns the architecture name of the model if available"""
 
     @property
     @abstractmethod
     def hyperparameters(self) -> dict | None:
-        """Returns the hyperparameters if available"""
-
-    @property
-    @abstractmethod
-    def training_data(self) -> TrainingData | None:
-        """Returns the training data if available"""
-
-    @property
-    @abstractmethod
-    def training_context(self) -> Context | None:
-        """Returns the training context if available"""
+        """Returns the currently used hyperparameters if available"""
 
     @abstractmethod
     async def begin_training(self, organization: str, project: str, details: dict):
@@ -71,5 +134,28 @@ async def shutdown(self):
         """Stops the training process and releases resources"""
 
     @abstractmethod
-    async def continue_run_if_incomplete(self) -> bool:
-        """Continues the training if it is incomplete"""
+    async def try_continue_run_if_incomplete(self) -> bool:
+        """Start training continuation if possible, returns True if continuation started"""
+
+    async def perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False):
+        """Run `action` while in `state_during`; on success switch to `state_after` and save the training, on failure record the error under `error_key` and restore the previous state."""
+        await asyncio.sleep(0.1)
+        logging.info(f'Performing state: {state_during}')
+        previous_state = self.active_training.training_state
+        self.active_training.training_state = state_during
+        await asyncio.sleep(0.1)
+        if reset_early:
+            self.errors.reset(error_key)  # clear a stale error before the action instead of only after success
+        try:
+            await action()
+        except asyncio.CancelledError:
+            logging.warning(f'CancelledError in {state_during}')
+            raise  # cancellation is propagated; the state is left at `state_during`
+        except Exception as e:
+            self.errors.set(error_key, str(e))
+            logging.exception(f'Error in {state_during} - Exception:')
+            self.active_training.training_state = previous_state  # roll back; the error is not re-raised
+        else:
+            self.errors.reset(error_key)
+            self.active_training.training_state = state_after
+            self.last_training_io.save(self.active_training)  # persist only after a successful step
diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py
index fb191a9d..f2e011d5 100644
--- a/learning_loop_node/trainer/trainer_node.py
+++ b/learning_loop_node/trainer/trainer_node.py
@@ -16,9 +16,11 @@ class TrainerNode(Node):
 
     def __init__(self, name: str, trainer_logic: TrainerLogicAbstraction, uuid: Optional[str] = None, use_backdoor_controls: bool = False):
         super().__init__(name, uuid, 'trainer')
-        trainer_logic._node = self  # pylint: disable=protected-access
+        trainer_logic._node = self
         self.trainer_logic = trainer_logic
         self.last_training_io = LastTrainingIO(self.uuid)
+        self.trainer_logic._last_training_io = self.last_training_io
+
         self.include_router(controls.router, tags=["controls"])
         if use_backdoor_controls:
             self.include_router(backdoor_controls.router, tags=["controls"])
@@ -34,7 +36,7 @@ async def on_shutdown(self):
 
     async def on_repeat(self):
         try:
-            if await self.trainer_logic.continue_run_if_incomplete():
+            if await self.trainer_logic.try_continue_run_if_incomplete():
                 return  # NOTE: we prevent sending idle status after starting a continuation
             await self.send_status()
         except Exception as e:
@@ -70,7 +72,7 @@ async def send_status(self):
 
         status = TrainingStatus(id=self.uuid,
                                 name=self.name,
-                                state=self.trainer_logic.state,
+                                state=self.trainer_logic.state.value,
                                 errors={},
                                 uptime=self.trainer_logic.training_uptime,
                                 progress=self.trainer_logic.general_progress)
diff --git a/learning_loop_node/trainer/training_syncronizer.py b/learning_loop_node/trainer/training_syncronizer.py
index 1707d407..97041bb9 100644
--- a/learning_loop_node/trainer/training_syncronizer.py
+++ b/learning_loop_node/trainer/training_syncronizer.py
@@ -2,7 +2,7 @@
 import asyncio
 import logging
 from dataclasses import asdict
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
 
 import socketio
 from dacite import from_dict
@@ -15,7 +15,28 @@
     from .trainer_logic import TrainerLogic
 
 
-async def try_sync_model(trainer: 'TrainerLogic', trainer_node_uuid: str, sio_client: socketio.AsyncClient):
+class TrainingSyncronizer:
+    def __init__(self, trainer_node_uuid: str, sio_client: socketio.AsyncClient):
+        self.trainer_node_uuid = trainer_node_uuid
+        self.sio_client = sio_client
+
+    async def sync_model(model, current_training):
+        new_training = TrainingOut(
+            trainer_id=self.trainer_node_uuid,
+            confusion_matrix=model.confusion_matrix,
+            train_image_count=current_training.data.train_image_count(),
+            test_image_count=current_training.data.test_image_count(),
+            hyperparameters=trainer.hyperparameters)
+
+        await asyncio.sleep(0.1)  # NOTE needed for tests.
+
+        result = await self.sio_client.call('update_training', (current_training.context.organization, current_training.context.project, jsonable_encoder(new_training)))
+        response = from_dict(data_class=SocketResponse, data=result)
+
+        return response
+
+
+async def try_sync_model(mo):
     try:
         model = trainer.get_new_model()
     except Exception as exc:
@@ -30,23 +51,3 @@ async def try_sync_model(trainer: 'TrainerLogic', trainer_node_uuid: str, sio_cl
             error_msg = f'Error for update_training: Response from loop was : {asdict(response)}'
             logging.error(error_msg)
             raise Exception(error_msg)
-
-
-async def sync_model(trainer, trainer_node_uuid, sio_client, model):
-    current_training = trainer.training
-    new_training = TrainingOut(
-        trainer_id=trainer_node_uuid,
-        confusion_matrix=model.confusion_matrix,
-        train_image_count=current_training.data.train_image_count(),
-        test_image_count=current_training.data.test_image_count(),
-        hyperparameters=trainer.hyperparameters)
-
-    await asyncio.sleep(0.1)  # NOTE needed for tests.
-
-    result = await sio_client.call('update_training', (current_training.context.organization, current_training.context.project, jsonable_encoder(new_training)))
-    response = from_dict(data_class=SocketResponse, data=result)
-
-    if response.success:
-        logging.info(f'successfully updated training {asdict(new_training)}')
-        trainer.on_model_published(model)
-    return response
diff --git a/mock_trainer/app_code/progress_simulator.py b/mock_trainer/app_code/progress_simulator.py
index 042a0b29..6eaf5ced 100644
--- a/mock_trainer/app_code/progress_simulator.py
+++ b/mock_trainer/app_code/progress_simulator.py
@@ -10,8 +10,8 @@ def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -
         return None
 
     confusion_matrix = {}
-    assert trainer.training.data is not None
-    for category in trainer.training.data.categories:
+    assert trainer.active_training.data is not None
+    for category in trainer.active_training.data.categories:
         try:
             minimum = latest_known_confusion_matrix[category.id]['tp']
         except Exception:
diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py
index 20e43931..72929505 100644
--- a/mock_trainer/app_code/tests/test_mock_trainer.py
+++ b/mock_trainer/app_code/tests/test_mock_trainer.py
@@ -1,8 +1,7 @@
 from typing import Dict
 from uuid import uuid4
 
-from learning_loop_node.data_classes import (Context, Model, Training,
-                                             TrainingData)
+from learning_loop_node.data_classes import Context, Model, Training, TrainingData
 from learning_loop_node.globals import GLOBALS
 from learning_loop_node.trainer.executor import Executor
 
@@ -38,6 +37,6 @@ async def test_get_new_model(setup_test_project2):
         project_folder="",
         images_folder="",
         training_folder="",)
-    mock_trainer.training.data = TrainingData(image_data=[], categories=[])
+    mock_trainer.active_training.data = TrainingData(image_data=[], categories=[])
     model = mock_trainer.get_new_model()
     assert model is not None

From 02016291e2cc44494dd663d75d537625b9c2ead3 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 8 Mar 2024 00:02:37 +0100
Subject: [PATCH 13/62] finalize intermediate abstraction layer to trainer logic

---
 learning_loop_node/tests/test_helper.py       |   5 +-
 .../tests/states/test_state_upload_model.py   |   3 +-
 .../trainer/tests/testing_trainer_logic.py    |   2 +-
 learning_loop_node/trainer/trainer_logic.py   | 386 +-----------------
 .../trainer/trainer_logic_abstraction.py      |  63 ++-
 .../trainer/trainer_logic_generic.py          | 325 +++++++++++++++
 learning_loop_node/trainer/trainer_node.py    |   4 +-
 mock_trainer/app_code/mock_trainer_logic.py   |   2 +-
 .../app_code/tests/test_mock_trainer.py       |   2 +-
 9 files changed, 380 insertions(+), 412 deletions(-)
 create mode 100644 learning_loop_node/trainer/trainer_logic_generic.py

diff --git a/learning_loop_node/tests/test_helper.py b/learning_loop_node/tests/test_helper.py
index 1f485506..e802c7a0 100644
--- a/learning_loop_node/tests/test_helper.py
+++ b/learning_loop_node/tests/test_helper.py
@@ -7,9 +7,8 @@
 from typing import Callable
 
 from learning_loop_node.data_classes import Context
-from learning_loop_node.helpers.misc import create_image_folder, create_project_folder
+from learning_loop_node.helpers.misc import create_image_folder, create_project_folder, create_training_folder
 from learning_loop_node.loop_communication import LoopCommunicator
-from learning_loop_node.node import Node
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
 
@@ -68,5 +67,5 @@ def create_needed_folders(training_uuid: str = 'some_uuid'):  # pylint: disable=
     project_folder = create_project_folder(
         Context(organization='zauberzeug', project='pytest'))
     image_folder = create_image_folder(project_folder)
-    training_folder = TrainerLogic.create_training_folder(project_folder, training_uuid)
+    training_folder = create_training_folder(project_folder, training_uuid)
     return project_folder, image_folder, training_folder
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
index 21727b27..9faa656f 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
@@ -21,7 +21,8 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer
     create_active_training_file(trainer)
     trainer.init_from_last_training()
 
-    train_task = asyncio.get_running_loop().create_task(trainer.upload_model())
+    train_task = asyncio.get_running_loop().create_task(
+        trainer.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, trainer._upload_model))
 
     await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
     await train_task
diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index d6e9b78a..c7faeca8 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -38,7 +38,7 @@ async def start_training(self, model: str = 'model.model') -> None:
     async def start_training_from_scratch(self, base_model_id: str) -> None:
         await self.start_training(model=f'model_{base_model_id}.pt')
 
-    def get_new_model(self) -> Optional[BasicModel]:
+    def get_new_best_model(self) -> Optional[BasicModel]:
         if self.has_new_model:
             return BasicModel(confusion_matrix={})
         return None
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 4bfdb743..40b706fd 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -3,30 +3,19 @@
 import logging
 import os
 import shutil
-import sys
 from abc import abstractmethod
-from dataclasses import asdict
 from datetime import datetime
-from glob import glob
-from time import perf_counter
-from typing import Coroutine, Dict, List, Optional, Union
-from uuid import uuid4
+from typing import Coroutine, Dict, List, Optional
 
-import socketio
 from dacite import from_dict
-from fastapi.encoders import jsonable_encoder
-from tqdm import tqdm
 
-from ..data_classes import (BasicModel, Category, Context, Detections, Hyperparameter, ModelInformation, TrainerState,
-                            Training, TrainingData, TrainingError, TrainingOut)
-from ..helpers.misc import create_image_folder, create_project_folder, generate_training, is_valid_uuid4
-from .downloader import TrainingsDownloader
+from ..data_classes import Detections, ModelInformation, TrainerState, TrainingError
+from ..helpers.misc import create_image_folder, create_project_folder, images_for_ids, is_valid_uuid4
 from .executor import Executor
-from .io_helpers import ActiveTrainingIO
-from .trainer_logic_abstraction import TrainerLogicAbstraction
+from .trainer_logic_generic import TrainerLogicGeneric
 
 
-class TrainerLogic(TrainerLogicAbstraction):
+class TrainerLogic(TrainerLogicGeneric):
 
     def __init__(self, model_format: str) -> None:
         super().__init__(model_format)
@@ -35,121 +24,22 @@ def __init__(self, model_format: str) -> None:
         # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
 
         self._executor: Optional[Executor] = None
-        self.training_task: Optional[asyncio.Task] = None
         self.start_training_task: Optional[Coroutine] = None
-        self.shutdown_event: asyncio.Event = asyncio.Event()
-        self.detection_progress = 0.0
 
     @property
     def executor(self) -> Executor:
         assert self._executor is not None, 'executor must be set, call `run_training` first'
         return self._executor
 
-    def init_new_training(self, context: Context, details: Dict) -> None:
-        """Called on `begin_training` event from the Learning Loop.
-        Note that details needs the entries 'categories' and 'training_number'"""
-
-        project_folder = create_project_folder(context)
-        if not self.keep_old_trainings:
-            # NOTE: We delete all existing training folders because they are not needed anymore.
-            TrainerLogic.delete_all_training_folders(project_folder)
-        self._training = generate_training(project_folder, context)
-        self._training.data = TrainingData(categories=Category.from_list(details['categories']))
-        self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details)
-        self._training.training_number = details['training_number']
-        self._training.base_model_id = details['id']
-        self._training.training_state = TrainerState.Initialized
-        self._active_training_io = ActiveTrainingIO(
-            self._training.training_folder, self.loop_communicator, context)
-        logging.info(f'training initialized: {self._training}')
-
-    async def try_continue_run_if_incomplete(self) -> bool:
-        if not self.training_active and self.last_training_io.exists():
-            logging.info('found incomplete training, continuing now.')
-            self.init_from_last_training()
-            asyncio.get_event_loop().create_task(self.run())
-            return True
-        return False
-
-    def init_from_last_training(self) -> None:
-        self._training = self.last_training_io.load()
-        assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder'
-        self._active_training_io = ActiveTrainingIO(
-            self._training.training_folder, self.loop_communicator, self._training.context)
-
-    async def begin_training(self, organization: str, project: str, details: Dict) -> None:
-        self.init_new_training(Context(organization=organization, project=project), details)
-        asyncio.get_event_loop().create_task(self.run())
-
-    async def run(self) -> None:
-        """Called on `begin_training` event from the Learning Loop."""
-
-        self.errors.reset_all()
-        try:
-            self.training_task = asyncio.get_running_loop().create_task(self._run_training_loop())
-            await self.training_task  # Object is used to potentially cancel the task
-        except asyncio.CancelledError:
-            if not self.shutdown_event.is_set():
-                logging.info('training task was cancelled but not by shutdown event')
-                self.active_training.training_state = TrainerState.ReadyForCleanup
-                self.last_training_io.save(self.active_training)
-                await self.clear_training()
-
-        except Exception as e:
-            logging.exception(f'Error in train: {e}')
-
-    # ---------------------------------------- TRAINING STATES ----------------------------------------
-
-    async def _run_training_loop(self) -> None:
-        """asyncio.CancelledError is catched in train"""
-
-        if not self.training_active:
-            logging.error('could not start training - trainer is not initialized')
-            return
-
-        while self._training is not None:
-            tstate = self.active_training.training_state
-            logging.info(f'STATE LOOP: {tstate}, eerrors: {self.errors.errors}')
-            await asyncio.sleep(0.6)  # Note: Required for pytests!
-            if tstate == TrainerState.Initialized:  # -> DataDownloading -> DataDownloaded
-                await self.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare)
-            elif tstate == TrainerState.DataDownloaded:  # -> TrainModelDownloading -> TrainModelDownloaded
-                await self.perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model)
-            elif tstate == TrainerState.TrainModelDownloaded:  # -> TrainingRunning -> TrainingFinished
-                await self.perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train)
-            elif tstate == TrainerState.TrainingFinished:  # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced
-                await self.perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self.sync_confusion_matrix)
-                # await self.ensure_confusion_matrix_synced()
-            elif tstate == TrainerState.ConfusionMatrixSynced:  # -> TrainModelUploading -> TrainModelUploaded
-                await self.upload_model()
-            elif tstate == TrainerState.TrainModelUploaded:  # -> Detecting -> Detected
-                await self.perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections)
-            elif tstate == TrainerState.Detected:  # -> DetectionUploading -> ReadyForCleanup
-                await self.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions)
-            elif tstate == TrainerState.ReadyForCleanup:  # -> RESTART or TrainingFinished
-                await self.clear_training()
-                self.may_restart()
-
-    async def _prepare(self) -> None:
-        self.data_exchanger.set_context(self.active_training.context)
-        downloader = TrainingsDownloader(self.data_exchanger)
-        image_data, skipped_image_count = await downloader.download_training_data(self.active_training.images_folder)
-        assert self.active_training.data is not None, 'training.data must be set'
-        self.active_training.data.image_data = image_data
-        self.active_training.data.skipped_image_count = skipped_image_count
-
-    async def _download_model(self) -> None:
-        model_id = self.active_training.base_model_id
-        assert model_id is not None, 'model_id must be set'
-        if is_valid_uuid4(
-                self.active_training.base_model_id):  # TODO this checks if we continue a training -> make more explicit
-            logging.info('loading model from Learning Loop')
-            logging.info(f'downloading model {model_id} as {self.model_format}')
-            await self.data_exchanger.download_model(self.active_training.training_folder, self.active_training.context, model_id, self.model_format)
-            shutil.move(f'{self.active_training.training_folder}/model.json',
-                        f'{self.active_training.training_folder}/base_model.json')
-        else:
-            logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download')
+    @property
+    def hyperparameters(self) -> Optional[Dict]:
+        if self._training and self._training.data and self._training.data.hyperparameter:
+            information = {}
+            information['resolution'] = self._training.data.hyperparameter.resolution
+            information['flipRl'] = self._training.data.hyperparameter.flip_rl
+            information['flipUd'] = self._training.data.hyperparameter.flip_ud
+            return information
+        return None
 
     async def _train(self) -> None:
         previous_state = TrainerState.TrainModelDownloaded
@@ -170,7 +60,7 @@ async def _train(self) -> None:
                         break
                     self.errors.reset(error_key)
                     try:
-                        await self.sync_confusion_matrix()
+                        await self._sync_confusion_matrix()
                     except asyncio.CancelledError:
                         logging.warning('CancelledError in run_training')
                         raise
@@ -209,108 +99,6 @@ async def _start_training(self):
                 self.start_training_task = self.start_training()
         await self.start_training_task
 
-    async def sync_confusion_matrix(self):
-        logging.info('Syncing confusion matrix')
-        error_key = 'sync_confusion_matrix'
-        try:
-            try:
-                model = self.get_new_model()
-            except Exception as exc:
-                logging.exception('error while getting new model')
-                raise Exception(f'Could not get new model: {str(exc)}') from exc
-            if model and self.active_training.data:
-                new_training = TrainingOut(
-                    trainer_id=self.node_uuid,
-                    confusion_matrix=model.confusion_matrix,
-                    train_image_count=self.active_training.data.train_image_count(),
-                    test_image_count=self.active_training.data.test_image_count(),
-                    hyperparameters=self.hyperparameters)
-
-                await asyncio.sleep(0.1)  # NOTE needed for tests.
-                result = await self.sio_client.call('update_training', (self.active_training.context.organization, self.active_training.context.project, jsonable_encoder(new_training)))
-                if isinstance(result,  dict) and result['success']:
-                    logging.info(f'successfully updated training {asdict(new_training)}')
-                    self.on_model_published(model)
-                else:
-                    error_msg = f'Error for update_training: Response from loop was : {result}'
-                    logging.error(error_msg)
-                    raise Exception(error_msg)
-        except socketio.exceptions.BadNamespaceError as e:  # type: ignore
-            logging.error('Error during confusion matrix syncronization. BadNamespaceError')
-            self.errors.set(error_key, str(e))
-            raise
-        except Exception as e:
-            logging.exception('Error during confusion matrix syncronization')
-            self.errors.set(error_key, str(e))
-            raise
-
-        self.errors.reset(error_key)
-
-    async def upload_model(self) -> None:
-        error_key = 'upload_model'
-        previous_state = self.active_training.training_state
-        self.active_training.training_state = TrainerState.TrainModelUploading
-        try:
-            new_model_id = await self._upload_model_return_new_model_uuid(self.active_training.context)
-            if new_model_id is None:
-                self.active_training.training_state = TrainerState.ReadyForCleanup
-                logging.error('could not upload model - maybe training failed.. cleaning up')
-                return
-            assert new_model_id is not None, 'uploaded_model must be set'
-            logging.info(f'successfully uploaded model and received new model id: {new_model_id}')
-            self.active_training.model_id_for_detecting = new_model_id
-        except asyncio.CancelledError:
-            logging.warning('CancelledError in upload_model')
-            raise
-        except Exception as e:
-            logging.exception('Error in upload_model. Exception:')
-            self.errors.set(error_key, str(e))
-            self.active_training.training_state = previous_state  # TODO... going back is pointless here as it ends in a deadlock ?!
-            # self.training.training_state = TrainingState.ReadyForCleanup
-        else:
-            self.errors.reset(error_key)
-            self.active_training.training_state = TrainerState.TrainModelUploaded
-            self.last_training_io.save(self.active_training)
-
-    async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
-        """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file.
-        Note that with the latest trainers the conversion to (.wts) is done by the trainer.
-        The conversion from .wts to .engine is done by the detector (needs to be done on target hardware).
-        Note that trainer may train with different classes, which is why we send an initial model.json file.
-        """
-        files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files)
-        if files is None:
-            return None
-
-        if isinstance(files, List):
-            files = {self.model_format: files}
-        assert isinstance(files, Dict), f'can only upload model as list or dict, but was {files}'
-
-        already_uploaded_formats = self.active_training_io.load_model_upload_progress()
-
-        new_model_uuid = None
-        for file_format in files:
-            if file_format in already_uploaded_formats:
-                continue
-            _files = files[file_format]
-            assert not any(f for f in _files if 'model.json' in f), "Upload 'model.json' not allowed (added automatically)."
-            _files.append(self.dump_categories_to_json())
-            new_model_uuid = await self.data_exchanger.upload_model_get_uuid(context, _files, self.active_training.training_number, file_format)
-            if new_model_uuid is None:
-                return None
-
-            already_uploaded_formats.append(file_format)
-            self.active_training_io.save_model_upload_progress(already_uploaded_formats)
-
-        return new_model_uuid
-
-    def dump_categories_to_json(self) -> str:
-        content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None
-        json_path = '/tmp/model.json'
-        with open(json_path, 'w') as f:
-            json.dump(content, f)
-        return json_path
-
     async def _do_detections(self) -> None:
         context = self.active_training.context
         model_id = self.active_training.model_id_for_detecting
@@ -323,8 +111,7 @@ async def _do_detections(self) -> None:
 
         await self.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format)
         with open(f'{tmp_folder}/model.json', 'r') as f:
-            content = json.load(f)
-            model_information = from_dict(data_class=ModelInformation, data=content)
+            model_information = from_dict(data_class=ModelInformation, data=json.load(f))
 
         project_folder = create_project_folder(context)
         image_folder = create_image_folder(project_folder)
@@ -340,32 +127,17 @@ async def _do_detections(self) -> None:
         self.detection_progress = 0.42
         # await delete_corrupt_images(image_folder)
 
-        images = await asyncio.get_event_loop().run_in_executor(None, TrainerLogic.images_for_ids, image_ids, image_folder)
+        images = await asyncio.get_event_loop().run_in_executor(None, images_for_ids, image_ids, image_folder)
+        if not images:
+            self.active_training_io.save_detections([], 0)
         num_images = len(images)
-        logging.info(f'running detections on {num_images} images')
+
         batch_size = 200
-        idx = 0
-        if not images:
-            self.active_training_io.save_detections([], idx)
-        for i in tqdm(range(0, num_images, batch_size), position=0, leave=True):
+        for idx, i in enumerate(range(0, num_images, batch_size)):
             self.detection_progress = 0.5 + (i/num_images)*0.5
             batch_images = images[i:i+batch_size]
             batch_detections = await self._detect(model_information, batch_images, tmp_folder)
             self.active_training_io.save_detections(batch_detections, idx)
-            idx += 1
-
-        return None
-
-    async def clear_training(self):
-        self.active_training_io.delete_detections()
-        self.active_training_io.delete_detection_upload_progress()
-        self.active_training_io.delete_detections_upload_file_index()
-        await self.clear_training_data(self.active_training.training_folder)
-        self.last_training_io.delete()
-        # self.training.training_state = TrainingState.TrainingFinished
-
-        await self.node.send_status()
-        self._training = None
 
     async def stop(self) -> None:
         """If executor is running, stop it. Else cancel training task."""
@@ -383,44 +155,11 @@ async def stop(self) -> None:
                 logging.info('cancelled training task')
                 self.may_restart()
 
-    async def shutdown(self) -> None:
-        self.shutdown_event.set()
-        await self.stop()
-        await self.stop()  # NOTE first stop may only stop training.
-
     def get_log(self) -> str:
         return self.executor.get_log()
 
-    def may_restart(self) -> None:
-        if self.restart_after_training:
-            logging.info('restarting')
-            sys.exit(0)
-        else:
-            logging.info('not restarting')
-
-    @property
-    def general_progress(self) -> Optional[float]:
-        """Represents the progress for different states."""
-        if not self.training_active:
-            return None
-
-        t_state = self.active_training.training_state
-        if t_state == TrainerState.DataDownloading:
-            return self.data_exchanger.progress
-        if t_state == TrainerState.TrainingRunning:
-            return self.training_progress
-        if t_state == TrainerState.Detecting:
-            return self.detection_progress
-
-        return None
     # ---------------------------------------- ABSTRACT METHODS ----------------------------------------
 
-    @property
-    @abstractmethod
-    def training_progress(self) -> Optional[float]:
-        """Represents the training progress."""
-        raise NotImplementedError
-
     @abstractmethod
     async def start_training(self) -> None:
         '''Should be used to start a training.'''
@@ -440,90 +179,9 @@ async def resume(self) -> None:
         One may resume the training on a previously trained model stored by self.on_model_published(basic_model).'''
 
     @abstractmethod
-    def get_executor_error_from_log(self) -> Optional[str]:  # TODO we should allow other options to get the error
+    def get_executor_error_from_log(self) -> Optional[str]:
         '''Should be used to provide error informations to the Learning Loop by extracting data from self.executor.get_log().'''
 
-    @abstractmethod
-    def get_new_model(self) -> Optional[BasicModel]:
-        '''Is called frequently in `try_sync_model` to check if a new "best" model is availabe.
-        Returns None if no new model could be found. Otherwise BasicModel(confusion_matrix, meta_information).
-        `confusion_matrix` contains a dict of all classes:
-            - The classes must be identified by their id, not their name.
-            - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives).
-        `meta_information` can hold any data which is helpful for self.on_model_published to store weight file etc for later upload via self.get_model_files
-        '''
-
-    @abstractmethod
-    def on_model_published(self, basic_model: BasicModel) -> None:
-        '''Called after a BasicModel has been successfully send to the Learning Loop.
-        The files for this model should be stored.
-        self.get_latest_model_files is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop.
-        In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model
-        '''
-
-    @abstractmethod
-    def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]:
-        '''Called when the Learning Loop requests to backup the latest model for the training.
-        Should return a list of file paths which describe the model.
-        These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.)
-        and will be stored in the Learning Loop unter the format of this trainer.
-        Note: by convention the weightfile should be named "model.<extension>" where extension is the file format of the weightfile.
-        For example "model.pt" for pytorch or "model.weights" for darknet/yolo.
-
-        If a trainer can also generate other formats (for example for an detector),
-        a dictionary mapping format -> list of files can be returned.'''
-
     @abstractmethod
     async def _detect(self, model_information: ModelInformation, images: List[str], model_folder: str) -> List[Detections]:
         '''Called to run detections on a list of images.'''
-
-    @abstractmethod
-    async def clear_training_data(self, training_folder: str) -> None:
-        '''Called after a training has finished. Deletes all data that is not needed anymore after a training run. 
-        This can be old weightfiles or any additional files.'''
-
-    # ---------------------------------------- HELPER METHODS ----------------------------------------
-
-    @staticmethod
-    def images_for_ids(image_ids, image_folder) -> List[str]:
-        logging.info(f'### Going to get images for {len(image_ids)} images ids')
-        start = perf_counter()
-        images = [img for img in glob(f'{image_folder}/**/*.*', recursive=True)
-                  if os.path.splitext(os.path.basename(img))[0] in image_ids]
-        end = perf_counter()
-        logging.info(f'found {len(images)} images for {len(image_ids)} image ids, which took {end-start:0.2f} seconds')
-        return images
-
-    @staticmethod
-    def generate_training(project_folder: str, context: Context) -> Training:
-        training_uuid = str(uuid4())
-        return Training(
-            id=training_uuid,
-            context=context,
-            project_folder=project_folder,
-            images_folder=create_image_folder(project_folder),
-            training_folder=TrainerLogic.create_training_folder(project_folder, training_uuid)
-        )
-
-    @staticmethod
-    def delete_all_training_folders(project_folder: str):
-        if not os.path.exists(f'{project_folder}/trainings'):
-            return
-        for uuid in os.listdir(f'{project_folder}/trainings'):
-            shutil.rmtree(f'{project_folder}/trainings/{uuid}', ignore_errors=True)
-
-    @staticmethod
-    def create_training_folder(project_folder: str, trainings_id: str) -> str:
-        training_folder = f'{project_folder}/trainings/{trainings_id}'
-        os.makedirs(training_folder, exist_ok=True)
-        return training_folder
-
-    @property
-    def hyperparameters(self) -> Optional[Dict]:
-        if self._training and self._training.data and self._training.data.hyperparameter:
-            information = {}
-            information['resolution'] = self._training.data.hyperparameter.resolution
-            information['flipRl'] = self._training.data.hyperparameter.flip_rl
-            information['flipUd'] = self._training.data.hyperparameter.flip_ud
-            return information
-        return None
diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py
index b7f6b006..2b432998 100644
--- a/learning_loop_node/trainer/trainer_logic_abstraction.py
+++ b/learning_loop_node/trainer/trainer_logic_abstraction.py
@@ -1,9 +1,7 @@
-import asyncio
-import logging
 import os
 import time
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Callable, Coroutine, List, Optional
+from typing import TYPE_CHECKING, Dict, List, Optional
 
 from socketio import AsyncClient
 
@@ -90,27 +88,32 @@ def training_uptime(self) -> Optional[float]:
         return None
 
     @property
-    def training_data(self) -> TrainingData | None:
+    def training_data(self) -> Optional[TrainingData]:
         if self.training_active and self.active_training.data:
             return self.active_training.data
         return None
 
     @property
-    def training_context(self) -> Context | None:
+    def training_context(self) -> Optional[Context]:
         if self.training_active:
             return self.active_training.context
         return None
 
+    # --- ABSTRACT PROPERTIES
+    # --------- implemented in TrainerLogicGeneric
+
     @property
     @abstractmethod
-    def general_progress(self) -> float | None:
+    def general_progress(self) -> Optional[float]:
         """Returns the general progress of the training per state or None if idle"""
 
+    # --------- implemented in TrainerLogic(with Executor)
     @property
     @abstractmethod
-    def provided_pretrained_models(self) -> List[PretrainedModel]:
-        """Returns the list of provided pretrained models"""
+    def hyperparameters(self) -> Optional[Dict]:
+        """Returns the currently used hyperparameters if available"""
 
+    # --------- not implemented in any abstract class
     @property
     @abstractmethod
     def model_architecture(self) -> Optional[str]:
@@ -118,44 +121,26 @@ def model_architecture(self) -> Optional[str]:
 
     @property
     @abstractmethod
-    def hyperparameters(self) -> dict | None:
-        """Returns the currently used hyperparameters if available"""
+    def provided_pretrained_models(self) -> List[PretrainedModel]:
+        """Returns the list of provided pretrained models"""
 
-    @abstractmethod
-    async def begin_training(self, organization: str, project: str, details: dict):
-        """Starts the training process"""
+    # --- ABSTRACT METHODS -----
+    # --------- implemented in TrainerLogicGeneric ---
 
     @abstractmethod
-    async def stop(self):
-        """Stops the training process"""
+    async def on_shutdown(self):
+        """Called when the trainer is shut down"""
 
     @abstractmethod
-    async def shutdown(self):
-        """Stops the training process and releases resources"""
+    async def begin_training(self, organization: str, project: str, details: dict):
+        """Starts the training process"""
 
     @abstractmethod
     async def try_continue_run_if_incomplete(self) -> bool:
         """Start training continuation if possible, returns True if continuation started"""
 
-    async def perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False):
-        await asyncio.sleep(0.1)
-        logging.info(f'Performing state: {state_during}')
-        previous_state = self.active_training.training_state
-        self.active_training.training_state = state_during
-        await asyncio.sleep(0.1)
-        if reset_early:
-            self.errors.reset(error_key)
-
-        try:
-            await action()
-        except asyncio.CancelledError:
-            logging.warning(f'CancelledError in {state_during}')
-            raise
-        except Exception as e:
-            self.errors.set(error_key, str(e))
-            logging.exception(f'Error in {state_during} - Exception:')
-            self.active_training.training_state = previous_state
-        else:
-            self.errors.reset(error_key)
-            self.active_training.training_state = state_after
-            self.last_training_io.save(self.active_training)
+    # --- implemented in TrainerLogic(with Executor) ---
+
+    @abstractmethod
+    async def stop(self):
+        """Stops the training process"""
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
new file mode 100644
index 00000000..ac0479c1
--- /dev/null
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -0,0 +1,325 @@
+import asyncio
+import json
+import logging
+import shutil
+import sys
+from abc import abstractmethod
+from dataclasses import asdict
+from typing import Callable, Coroutine, Dict, List, Optional, Union
+
+from dacite import from_dict
+from fastapi.encoders import jsonable_encoder
+
+from ..data_classes import BasicModel, Category, Context, Hyperparameter, TrainerState, TrainingData, TrainingOut
+from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4
+from .downloader import TrainingsDownloader
+from .io_helpers import ActiveTrainingIO
+from .trainer_logic_abstraction import TrainerLogicAbstraction
+
+
+class TrainerLogicGeneric(TrainerLogicAbstraction):
+
+    def __init__(self, model_format: str) -> None:
+        super().__init__(model_format)
+        self.training_task: Optional[asyncio.Task] = None
+        self.detection_progress = 0.0
+        self.shutdown_event: asyncio.Event = asyncio.Event()
+
+    @property
+    def general_progress(self) -> Optional[float]:
+        """Represents the progress for different states."""
+        if not self.training_active:
+            return None
+
+        t_state = self.active_training.training_state
+        if t_state == TrainerState.DataDownloading:
+            return self.data_exchanger.progress
+        if t_state == TrainerState.TrainingRunning:
+            return self.training_progress
+        if t_state == TrainerState.Detecting:
+            return self.detection_progress
+
+        return None
+
+    def init_new_training(self, context: Context, details: Dict) -> None:
+        """Called on `begin_training` event from the Learning Loop.
+        Note that details needs the entries 'categories' and 'training_number'"""
+
+        project_folder = create_project_folder(context)
+        if not self.keep_old_trainings:
+            # NOTE: We delete all existing training folders because they are not needed anymore.
+            delete_all_training_folders(project_folder)
+        self._training = generate_training(project_folder, context)
+        self._training.data = TrainingData(categories=Category.from_list(details['categories']))
+        self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details)
+        self._training.training_number = details['training_number']
+        self._training.base_model_id = details['id']
+        self._training.training_state = TrainerState.Initialized
+        self._active_training_io = ActiveTrainingIO(
+            self._training.training_folder, self.loop_communicator, context)
+        logging.info(f'training initialized: {self._training}')
+
+    async def try_continue_run_if_incomplete(self) -> bool:
+        if not self.training_active and self.last_training_io.exists():
+            logging.info('found incomplete training, continuing now.')
+            self.init_from_last_training()
+            asyncio.get_event_loop().create_task(self.run())
+            return True
+        return False
+
+    def init_from_last_training(self) -> None:
+        self._training = self.last_training_io.load()
+        assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder'
+        self._active_training_io = ActiveTrainingIO(
+            self._training.training_folder, self.loop_communicator, self._training.context)
+
+    async def begin_training(self, organization: str, project: str, details: Dict) -> None:
+        """Called on `begin_training` event from the Learning Loop."""
+
+        self.init_new_training(Context(organization=organization, project=project), details)
+        asyncio.get_event_loop().create_task(self.run())
+
+    async def run(self) -> None:
+        self.errors.reset_all()
+        try:
+            self.training_task = asyncio.get_running_loop().create_task(self._training_loop())
+            await self.training_task  # NOTE: Task object is used to potentially cancel the task
+        except asyncio.CancelledError:
+            if not self.shutdown_event.is_set():
+                logging.info('training task was cancelled but not by shutdown event')
+                self.active_training.training_state = TrainerState.ReadyForCleanup
+                self.last_training_io.save(self.active_training)
+                await self.clear_training()
+        except Exception as e:
+            logging.exception(f'Error in train: {e}')
+
+    # ---------------------------------------- TRAINING STATES ----------------------------------------
+
+    async def _training_loop(self) -> None:
+        """asyncio.CancelledError is catched in run"""
+
+        assert self.training_active
+
+        while self._training is not None:
+            tstate = self.active_training.training_state
+            logging.info(f'STATE LOOP: {tstate}, eerrors: {self.errors.errors}')
+            await asyncio.sleep(0.6)  # Note: Required for pytests!
+            if tstate == TrainerState.Initialized:  # -> DataDownloading -> DataDownloaded
+                await self.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare)
+            elif tstate == TrainerState.DataDownloaded:  # -> TrainModelDownloading -> TrainModelDownloaded
+                await self.perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model)
+            elif tstate == TrainerState.TrainModelDownloaded:  # -> TrainingRunning -> TrainingFinished
+                await self.perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train)
+            elif tstate == TrainerState.TrainingFinished:  # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced
+                await self.perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix)
+            elif tstate == TrainerState.ConfusionMatrixSynced:  # -> TrainModelUploading -> TrainModelUploaded
+                await self.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model)
+            elif tstate == TrainerState.TrainModelUploaded:  # -> Detecting -> Detected
+                await self.perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections)
+            elif tstate == TrainerState.Detected:  # -> DetectionUploading -> ReadyForCleanup
+                await self.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions)
+            elif tstate == TrainerState.ReadyForCleanup:  # -> RESTART or TrainingFinished
+                await self.clear_training()
+                self.may_restart()
+
+    async def perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False):
+        await asyncio.sleep(0.1)
+        logging.info(f'Performing state: {state_during}')
+        previous_state = self.active_training.training_state
+        self.active_training.training_state = state_during
+        await asyncio.sleep(0.1)
+        if reset_early:
+            self.errors.reset(error_key)
+
+        try:
+            if await action():
+                logging.error('Something went really bad.. cleaning up')
+                state_after = TrainerState.ReadyForCleanup
+        except asyncio.CancelledError:
+            logging.warning(f'CancelledError in {state_during}')
+            raise
+        except Exception as e:
+            self.errors.set(error_key, str(e))
+            logging.exception(f'Error in {state_during} - Exception:')
+            self.active_training.training_state = previous_state
+        else:
+            self.errors.reset(error_key)
+            self.active_training.training_state = state_after
+            self.last_training_io.save(self.active_training)
+
+    async def _prepare(self) -> None:
+        self.data_exchanger.set_context(self.active_training.context)
+        downloader = TrainingsDownloader(self.data_exchanger)
+        image_data, skipped_image_count = await downloader.download_training_data(self.active_training.images_folder)
+        assert self.active_training.data is not None, 'training.data must be set'
+        self.active_training.data.image_data = image_data
+        self.active_training.data.skipped_image_count = skipped_image_count
+
+    async def _download_model(self) -> None:
+        model_id = self.active_training.base_model_id
+        assert model_id is not None, 'model_id must be set'
+        if is_valid_uuid4(
+                self.active_training.base_model_id):  # TODO this checks if we continue a training -> make more explicit
+            logging.info('loading model from Learning Loop')
+            logging.info(f'downloading model {model_id} as {self.model_format}')
+            await self.data_exchanger.download_model(self.active_training.training_folder, self.active_training.context, model_id, self.model_format)
+            shutil.move(f'{self.active_training.training_folder}/model.json',
+                        f'{self.active_training.training_folder}/base_model.json')
+        else:
+            logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download')
+
+    async def _sync_confusion_matrix(self):
+        error_key = 'sync_confusion_matrix'
+        try:
+            new_best_model = self.get_new_best_model()
+            if new_best_model and self.active_training.data:
+                new_training = TrainingOut(trainer_id=self.node_uuid,
+                                           confusion_matrix=new_best_model.confusion_matrix,
+                                           train_image_count=self.active_training.data.train_image_count(),
+                                           test_image_count=self.active_training.data.test_image_count(),
+                                           hyperparameters=self.hyperparameters)
+                await asyncio.sleep(0.1)  # NOTE needed for tests.
+
+                result = await self.sio_client.call('update_training', (
+                    self.active_training.context.organization, self.active_training.context.project, jsonable_encoder(new_training)))
+                if isinstance(result,  dict) and result['success']:
+                    logging.info(f'successfully updated training {asdict(new_training)}')
+                    self.on_model_published(new_best_model)
+                else:
+                    raise Exception(f'Error for update_training: Response from loop was : {result}')
+        except Exception as e:
+            logging.exception('Error during confusion matrix syncronization')
+            self.errors.set(error_key, str(e))
+            raise
+        self.errors.reset(error_key)
+
+    async def _upload_model(self) -> None | bool:
+        """Returns True if the training should be cleaned up."""
+
+        new_model_id = await self._upload_model_return_new_model_uuid(self.active_training.context)
+        if new_model_id is None:
+            self.active_training.training_state = TrainerState.ReadyForCleanup
+            logging.error('could not upload model - maybe training failed.. cleaning up')
+            return True
+        logging.info(f'Successfully uploaded model and received new model id: {new_model_id}')
+        self.active_training.model_id_for_detecting = new_model_id
+        return None
+
+    async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
+        """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file.
+        Note that with the latest trainers the conversion to (.wts) is done by the trainer.
+        The conversion from .wts to .engine is done by the detector (needs to be done on target hardware).
+        Note that trainer may train with different classes, which is why we send an initial model.json file.
+        """
+        files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files)
+        if files is None:
+            return None
+
+        if isinstance(files, List):
+            files = {self.model_format: files}
+        assert isinstance(files, Dict), f'can only upload model as list or dict, but was {files}'
+
+        already_uploaded_formats = self.active_training_io.load_model_upload_progress()
+
+        model_uuid = None
+        for file_format in [f for f in files if f not in already_uploaded_formats]:
+            _files = files[file_format] + [self.dump_categories_to_json()]
+            assert len([f for f in _files if 'model.json' in f]) == 1, "model.json must be included exactly once"
+
+            model_uuid = await self.data_exchanger.upload_model_get_uuid(context, _files, self.active_training.training_number, file_format)
+            if model_uuid is None:
+                return None
+
+            already_uploaded_formats.append(file_format)
+            self.active_training_io.save_model_upload_progress(already_uploaded_formats)
+
+        return model_uuid
+
+    def dump_categories_to_json(self) -> str:
+        content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None
+        json_path = '/tmp/model.json'
+        with open(json_path, 'w') as f:
+            json.dump(content, f)
+        return json_path
+
+    async def clear_training(self):
+        self.active_training_io.delete_detections()
+        self.active_training_io.delete_detection_upload_progress()
+        self.active_training_io.delete_detections_upload_file_index()
+        await self.clear_training_data(self.active_training.training_folder)
+        self.last_training_io.delete()
+        # self.training.training_state = TrainingState.TrainingFinished
+
+        await self.node.send_status()
+        self._training = None
+
+    # ---------------------------------------- OTHER METHODS ----------------------------------------
+
+    def may_restart(self) -> None:
+        if self.restart_after_training:
+            logging.info('restarting')
+            sys.exit(0)
+        else:
+            logging.info('not restarting')
+
+    async def on_shutdown(self) -> None:
+        self.shutdown_event.set()
+        await self.stop()
+        await self.stop()
+
+    # ---------------------------------------- ABSTRACT PROPERTIES ----------------------------------------
+
+    @property
+    @abstractmethod
+    def training_progress(self) -> Optional[float]:
+        """Represents the training progress."""
+        raise NotImplementedError
+
+    # ---------------------------------------- ABSTRACT METHODS ----------------------------------------
+
+    @abstractmethod
+    async def _train(self) -> None:
+        '''Should be used to execute a training.
+        The model should be synchronized with the Learning Loop via self._sync_confusion_matrix() every now and then.
+        asyncio.CancelledError should be catched and re-raised.'''
+
+    @abstractmethod
+    async def _do_detections(self) -> None:
+        '''Should be used to execute detections.
+        active_training_io.save_detections(...) should be used to store the detections.
+        asyncio.CancelledError should be catched and re-raised.'''
+
+    @abstractmethod
+    def get_new_best_model(self) -> Optional[BasicModel]:
+        '''Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe.
+        Returns None if no new model could be found. Otherwise BasicModel(confusion_matrix, meta_information).
+        `confusion_matrix` contains a dict of all classes:
+            - The classes must be identified by their id, not their name.
+            - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives).
+        `meta_information` can hold any data which is helpful for self.on_model_published to store weight file etc for later upload via self.get_model_files
+        '''
+
+    @abstractmethod
+    def on_model_published(self, basic_model: BasicModel) -> None:
+        '''Called after a BasicModel has been successfully send to the Learning Loop.
+        The files for this model should be stored.
+        self.get_latest_model_files is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop.
+        In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model
+        '''
+
+    @abstractmethod
+    def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]:
+        '''Called when the Learning Loop requests to backup the latest model for the training.
+        Should return a list of file paths which describe the model.
+        These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.)
+        and will be stored in the Learning Loop unter the format of this trainer.
+        Note: by convention the weightfile should be named "model.<extension>" where extension is the file format of the weightfile.
+        For example "model.pt" for pytorch or "model.weights" for darknet/yolo.
+
+        If a trainer can also generate other formats (for example for an detector),
+        a dictionary mapping format -> list of files can be returned.'''
+
+    @abstractmethod
+    async def clear_training_data(self, training_folder: str) -> None:
+        '''Called after a training has finished. Deletes all data that is not needed anymore after a training run. 
+        This can be old weightfiles or any additional files.'''
diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py
index f2e011d5..c87124c1 100644
--- a/learning_loop_node/trainer/trainer_node.py
+++ b/learning_loop_node/trainer/trainer_node.py
@@ -32,7 +32,7 @@ async def on_startup(self):
 
     async def on_shutdown(self):
         self.log.info('shutdown detected, stopping training')
-        await self.trainer_logic.shutdown()
+        await self.trainer_logic.on_shutdown()
 
     async def on_repeat(self):
         try:
@@ -72,7 +72,7 @@ async def send_status(self):
 
         status = TrainingStatus(id=self.uuid,
                                 name=self.name,
-                                state=self.trainer_logic.state.value,
+                                state=self.trainer_logic.state,
                                 errors={},
                                 uptime=self.trainer_logic.training_uptime,
                                 progress=self.trainer_logic.general_progress)
diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py
index b3f1adb5..b24dc531 100644
--- a/mock_trainer/app_code/mock_trainer_logic.py
+++ b/mock_trainer/app_code/mock_trainer_logic.py
@@ -111,7 +111,7 @@ def training_progress(self) -> float:
         print(f'prog. is {self.current_iteration} / {self.max_iterations} = {self.current_iteration / self.max_iterations}')
         return self.current_iteration / self.max_iterations
 
-    def get_new_model(self) -> Optional[BasicModel]:
+    def get_new_best_model(self) -> Optional[BasicModel]:
         logging.warning('get_new_model called')
         if self.error_configuration.get_new_model:
             raise Exception()
diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py
index 72929505..f20797b0 100644
--- a/mock_trainer/app_code/tests/test_mock_trainer.py
+++ b/mock_trainer/app_code/tests/test_mock_trainer.py
@@ -38,5 +38,5 @@ async def test_get_new_model(setup_test_project2):
         images_folder="",
         training_folder="",)
     mock_trainer.active_training.data = TrainingData(image_data=[], categories=[])
-    model = mock_trainer.get_new_model()
+    model = mock_trainer.get_new_best_model()
     assert model is not None

From eb9b5a962492d7f9d1acee2c490ac1498a01ef7a Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 8 Mar 2024 09:56:36 +0100
Subject: [PATCH 14/62] Fix minor bug in abstraction layer

---
 learning_loop_node/trainer/trainer_logic_abstraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py
index 2b432998..64349e3d 100644
--- a/learning_loop_node/trainer/trainer_logic_abstraction.py
+++ b/learning_loop_node/trainer/trainer_logic_abstraction.py
@@ -83,7 +83,7 @@ def active_training(self) -> Training:
 
     @property
     def training_uptime(self) -> Optional[float]:
-        if self.active_training:
+        if self.training_active:
             return time.time() - self.active_training.start_time
         return None
 

From e2272e547061f8f9a5aa7709ce46bb7d12e857ae Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 8 Mar 2024 11:23:47 +0100
Subject: [PATCH 15/62] Fix error states that are required by the backend tests

---
 learning_loop_node/data_classes/training.py          |  3 +++
 learning_loop_node/trainer/rest/backdoor_controls.py | 12 +++++++++---
 learning_loop_node/trainer/trainer_logic.py          |  1 -
 learning_loop_node/trainer/trainer_logic_generic.py  |  4 +++-
 mock_trainer/app_code/mock_trainer_logic.py          |  4 ++--
 5 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index a0601c2d..449cc85b 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -165,3 +165,6 @@ class TrainingError(Exception):
     def __init__(self, cause: str, *args: object) -> None:
         super().__init__(*args)
         self.cause = cause
+
+    def __str__(self) -> str:
+        return f'TrainingError: {self.cause}'
diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py
index 8349e737..a796fc4d 100644
--- a/learning_loop_node/trainer/rest/backdoor_controls.py
+++ b/learning_loop_node/trainer/rest/backdoor_controls.py
@@ -9,6 +9,7 @@
 from fastapi import APIRouter, HTTPException, Request
 
 from ...data_classes import ErrorConfiguration, NodeState
+from ..trainer_logic import TrainerLogic
 
 if TYPE_CHECKING:
     from ..trainer_node import TrainerNode
@@ -95,6 +96,8 @@ async def add_steps(request: Request):
     trainer_node = trainer_node_from_request(request)
     trainer_logic = trainer_node.trainer_logic  # NOTE: is MockTrainerLogic which has 'provide_new_model' and 'current_iteration'
 
+    assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic'
+
     if not trainer_logic._executor or not trainer_logic._executor.is_process_running():  # pylint: disable=protected-access
         training = trainer_logic._training  # pylint: disable=protected-access
         logging.error(f'cannot add steps when not running, state: {training.training_state if training else "None"}')
@@ -109,7 +112,7 @@ async def add_steps(request: Request):
     for _ in range(steps):
         try:
             logging.warning('calling sync_confusion_matrix')
-            await trainer_logic.sync_confusion_matrix()
+            await trainer_logic._sync_confusion_matrix()  # pylint: disable=protected-access
         except Exception:
             pass  # Tests can force synchroniation to fail, error state is reported to backend
     trainer_logic.provide_new_model = previous_state  # type: ignore
@@ -119,11 +122,14 @@ async def add_steps(request: Request):
 
 @router.post("/kill_training_process")
 async def kill_process(request: Request):
+
     # pylint: disable=protected-access
     trainer_node = trainer_node_from_request(request)
-    if not trainer_node.trainer_logic._executor or not trainer_node.trainer_logic._executor.is_process_running():
+    trainer_logic = trainer_node.trainer_logic
+    assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic'
+    if not trainer_logic._executor or not trainer_logic._executor.is_process_running():
         raise HTTPException(status_code=409, detail="trainer is not running")
-    trainer_node.trainer_logic._executor.stop()
+    trainer_logic._executor.stop()
 
 
 @router.post("/force_status_update")
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 40b706fd..82fd8aad 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -71,7 +71,6 @@ async def _train(self) -> None:
 
             error = self.get_executor_error_from_log()
             if error:
-                self.errors.set(error_key, error)
                 raise TrainingError(cause=error)
             # TODO check if this works:
             # if self.executor.return_code != 0:
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index ac0479c1..7221e6ec 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -143,7 +143,8 @@ async def perform_state(self, error_key: str, state_during: TrainerState, state_
             logging.exception(f'Error in {state_during} - Exception:')
             self.active_training.training_state = previous_state
         else:
-            self.errors.reset(error_key)
+            if not reset_early:
+                self.errors.reset(error_key)
             self.active_training.training_state = state_after
             self.last_training_io.save(self.active_training)
 
@@ -169,6 +170,7 @@ async def _download_model(self) -> None:
             logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download')
 
     async def _sync_confusion_matrix(self):
+        '''NOTE: This stage sets the errors explicitly because it may be used inside the training stage.'''
         error_key = 'sync_confusion_matrix'
         try:
             new_best_model = self.get_new_best_model()
diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py
index b24dc531..e88a2de3 100644
--- a/mock_trainer/app_code/mock_trainer_logic.py
+++ b/mock_trainer/app_code/mock_trainer_logic.py
@@ -32,7 +32,7 @@ async def resume(self) -> None:
     async def start_training(self) -> None:
         self.current_iteration = 0
         if self.error_configuration.begin_training:
-            raise Exception()
+            raise Exception('Could not start training')
         self.executor.start('while true; do sleep 1; done')
 
     async def start_training_from_scratch(self, base_model_id: str) -> None:
@@ -114,7 +114,7 @@ def training_progress(self) -> float:
     def get_new_best_model(self) -> Optional[BasicModel]:
         logging.warning('get_new_model called')
         if self.error_configuration.get_new_model:
-            raise Exception()
+            raise Exception('Could not get new model')
         if not self.provide_new_model:
             return None
         self.current_iteration += 1

From ee1d113c4d4777dc964c2e0f5fa28f443526ae98 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 8 Mar 2024 11:30:06 +0100
Subject: [PATCH 16/62] fix more tests (use enums)

---
 .../trainer/tests/states/test_state_detecting.py            | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py
index d48279ee..fbb8e9c0 100644
--- a/learning_loop_node/trainer/tests/states/test_state_detecting.py
+++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py
@@ -23,8 +23,8 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi
                               TrainerState.Detected, trainer._do_detections)
     )
 
-    await assert_training_state(trainer.active_training, 'detecting', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, 'detected', timeout=10, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=1, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=10, interval=0.001)
 
     assert trainer_has_error(trainer) is False
     assert trainer.active_training.training_state == TrainerState.Detected
@@ -40,7 +40,7 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer
 
     _ = asyncio.get_running_loop().create_task(trainer.run())
 
-    await assert_training_state(trainer.active_training, 'detecting', timeout=5, interval=0.001)
+    await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=5, interval=0.001)
     await trainer.stop()
     await asyncio.sleep(0.1)
 

From 6a83928e372c3a5f30f033bd9ea1c8928e50c8d8 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 8 Mar 2024 11:35:51 +0100
Subject: [PATCH 17/62] try to reduce flakiness of test_about_endpoint

---
 learning_loop_node/detector/tests/test_client_communication.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/learning_loop_node/detector/tests/test_client_communication.py b/learning_loop_node/detector/tests/test_client_communication.py
index 97daf93a..97e3f074 100644
--- a/learning_loop_node/detector/tests/test_client_communication.py
+++ b/learning_loop_node/detector/tests/test_client_communication.py
@@ -90,7 +90,7 @@ async def test_sio_upload(test_detector_node: DetectorNode, sio_client):
 
 # NOTE: This test seems to be flaky.
 async def test_about_endpoint(test_detector_node: DetectorNode):
-    await asyncio.sleep(1)
+    await asyncio.sleep(3)
     response = requests.get(f'http://localhost:{GLOBALS.detector_port}/about', timeout=30)
 
     assert response.status_code == 200

From 704cdf4d367940502ada454df2d99fa8dd89fcd2 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 8 Mar 2024 12:42:06 +0100
Subject: [PATCH 18/62] fix mock_trainer tests

---
 mock_trainer/app_code/tests/test_detections.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py
index df6e1292..42fbfe8b 100644
--- a/mock_trainer/app_code/tests/test_detections.py
+++ b/mock_trainer/app_code/tests/test_detections.py
@@ -5,10 +5,9 @@
 
 from learning_loop_node.data_classes import Category, Context
 from learning_loop_node.globals import GLOBALS
-from learning_loop_node.helpers.misc import create_project_folder
+from learning_loop_node.helpers.misc import create_project_folder, generate_training
 from learning_loop_node.loop_communication import LoopCommunicator
 from learning_loop_node.tests import test_helper
-from learning_loop_node.trainer.trainer_logic import TrainerLogic
 from learning_loop_node.trainer.trainer_node import TrainerNode
 
 from ..mock_trainer_logic import MockTrainerLogic
@@ -33,7 +32,7 @@ async def test_all(setup_test_project1, glc: LoopCommunicator):  # pylint: disab
     trainer.init_new_training(context=context, details=details)
 
     project_folder = create_project_folder(context)
-    training = TrainerLogic.generate_training(project_folder, context)
+    training = generate_training(project_folder, context)
     training.model_id_for_detecting = latest_model_id
     trainer._training = training  # pylint: disable=protected-access
     await trainer._do_detections()  # pylint: disable=protected-access

From 811514425120302bd18ac277b9e1ddee51d82161 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 22 Mar 2024 19:19:41 +0100
Subject: [PATCH 19/62] improve code documentation, abstraction layers and api

---
 learning_loop_node/data_classes/training.py   |  23 +-
 learning_loop_node/data_exchanger.py          |   8 +-
 learning_loop_node/helpers/misc.py            |   5 -
 learning_loop_node/py.typed                   |   0
 learning_loop_node/trainer/io_helpers.py      |  10 +
 learning_loop_node/trainer/tests/conftest.py  |  28 +-
 .../trainer/tests/state_helper.py             |   2 +-
 .../tests/states/test_state_cleanup.py        |   4 +-
 .../tests/states/test_state_detecting.py      |  36 +-
 .../states/test_state_download_train_model.py |  40 +-
 .../tests/states/test_state_prepare.py        |  28 +-
 .../test_state_sync_confusion_matrix.py       |  38 +-
 .../trainer/tests/states/test_state_train.py  |  36 +-
 .../states/test_state_upload_detections.py    |  48 +--
 .../tests/states/test_state_upload_model.py   |  34 +-
 .../trainer/tests/test_errors.py              |  18 +-
 learning_loop_node/trainer/trainer_logic.py   |  44 +-
 .../trainer/trainer_logic_abstraction.py      | 146 -------
 .../trainer/trainer_logic_generic.py          | 395 ++++++++++++------
 learning_loop_node/trainer/trainer_node.py    |   4 +-
 mock_trainer/app_code/progress_simulator.py   |   4 +-
 .../app_code/tests/test_detections.py         |   2 +-
 .../app_code/tests/test_mock_trainer.py       |   2 +-
 23 files changed, 490 insertions(+), 465 deletions(-)
 create mode 100644 learning_loop_node/py.typed
 delete mode 100644 learning_loop_node/trainer/trainer_logic_abstraction.py

diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index 449cc85b..2ce1c95b 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -17,6 +17,14 @@ class Hyperparameter():
     flip_rl: bool
     flip_ud: bool
 
+    @staticmethod
+    def from_data(data: Dict):
+        return Hyperparameter(
+            resolution=data['resolution'],
+            flip_rl=data.get('flip_rl', False),
+            flip_ud=data.get('flip_ud', False)
+        )
+
 
 @dataclass(**KWONLY_SLOTS)
 class TrainingData():
@@ -93,18 +101,25 @@ class Training():
     id: str
     context: Context
 
-    project_folder: str
-    images_folder: str
-    training_folder: str
+    project_folder: str  # f'{GLOBALS.data_folder}/{context.organization}/{context.project}'
+    images_folder: str  # f'{project_folder}/images'
+    training_folder: str  # f'{project_folder}/trainings/{trainings_id}'
     start_time: float = field(default_factory=time.time)
 
-    base_model_id: Optional[str] = None
+    base_model_id: Optional[str] = None  # model uuid to download into base_model.json
     data: Optional[TrainingData] = None
     training_number: Optional[int] = None
     training_state: Optional[str] = None
     model_id_for_detecting: Optional[str] = None
     hyperparameters: Optional[Dict] = None
 
+    def set_values_from_data(self, data: Dict):
+        self.data = TrainingData(categories=Category.from_list(data['categories']))
+        self.data.hyperparameter = Hyperparameter.from_data(data=data)
+        self.training_number = data['training_number']
+        self.base_model_id = data['id']
+        self.training_state = TrainerState.Initialized
+
 
 @dataclass(**KWONLY_SLOTS)
 class TrainingOut():
diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py
index ab53b243..840a0fe9 100644
--- a/learning_loop_node/data_exchanger.py
+++ b/learning_loop_node/data_exchanger.py
@@ -122,11 +122,11 @@ async def _download_one_image(self, path: str, image_id: str, image_folder: str)
         if not await is_valid_image(filename, self.check_jpeg):
             os.remove(filename)
 
-    async def download_model(self, target_folder: str, context: Context, model_id: str, model_format: str) -> List[str]:
+    async def download_model(self, target_folder: str, context: Context, model_uuid: str, model_format: str) -> List[str]:
         """Downloads a model and returns the paths of the downloaded files."""
-        logging.info(f'Downloading model {model_id} to {target_folder}..')
+        logging.info(f'Downloading model {model_uuid} to {target_folder}..')
 
-        path = f'/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
+        path = f'/{context.organization}/projects/{context.project}/models/{model_uuid}/{model_format}/file'
         response = await self.loop_communicator.get(path, requires_login=False)
         if response.status_code != 200:
             content = response.json()
@@ -150,7 +150,7 @@ async def download_model(self, target_folder: str, context: Context, model_id: s
             new_file = shutil.move(file, target_folder)
             created_files.append(new_file)
 
-        logging.info(f'---- downloaded model {model_id}/{model_format} to {tmp_path}. Moved to {target_folder}.')
+        logging.info(f'---- downloaded model {model_uuid}/{model_format} to {tmp_path}. Moved to {target_folder}.')
         return created_files
 
     async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]:
diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py
index 1f2e297d..5b996092 100644
--- a/learning_loop_node/helpers/misc.py
+++ b/learning_loop_node/helpers/misc.py
@@ -76,7 +76,6 @@ async def is_valid_image(filename: str, check_jpeg: bool) -> bool:
     return "OK" in out.decode()
 
 
-@staticmethod
 async def delete_corrupt_images(image_folder: str, check_jpeg: bool = False) -> None:
     logging.info('deleting corrupt images')
     n_deleted = 0
@@ -189,7 +188,6 @@ def activate_asyncio_warnings() -> None:
         logging.exception('could not activate asyncio warnings. Exception:')
 
 
-@staticmethod
 def images_for_ids(image_ids, image_folder) -> List[str]:
     logging.info(f'### Going to get images for {len(image_ids)} images ids')
     start = perf_counter()
@@ -200,7 +198,6 @@ def images_for_ids(image_ids, image_folder) -> List[str]:
     return images
 
 
-@staticmethod
 def generate_training(project_folder: str, context: Context) -> Training:
     training_uuid = str(uuid4())
     return Training(
@@ -212,7 +209,6 @@ def generate_training(project_folder: str, context: Context) -> Training:
     )
 
 
-@staticmethod
 def delete_all_training_folders(project_folder: str):
     if not os.path.exists(f'{project_folder}/trainings'):
         return
@@ -220,7 +216,6 @@ def delete_all_training_folders(project_folder: str):
         shutil.rmtree(f'{project_folder}/trainings/{uuid}', ignore_errors=True)
 
 
-@staticmethod
 def create_training_folder(project_folder: str, trainings_id: str) -> str:
     training_folder = f'{project_folder}/trainings/{trainings_id}'
     os.makedirs(training_folder, exist_ok=True)
diff --git a/learning_loop_node/py.typed b/learning_loop_node/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/learning_loop_node/trainer/io_helpers.py b/learning_loop_node/trainer/io_helpers.py
index 6ec7a5c3..453add80 100644
--- a/learning_loop_node/trainer/io_helpers.py
+++ b/learning_loop_node/trainer/io_helpers.py
@@ -14,6 +14,16 @@
 from ..loop_communication import LoopCommunicator
 
 
+class EnvironmentVars:
+    def __init__(self) -> None:
+        self.restart_after_training = os.environ.get(
+            'RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1']
+        self.keep_old_trainings = os.environ.get(
+            'KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1']
+        self.inference_batch_size = int(
+            os.environ.get('INFERENCE_BATCH_SIZE', '10'))
+
+
 class LastTrainingIO:
 
     def __init__(self, node_uuid: str) -> None:
diff --git a/learning_loop_node/trainer/tests/conftest.py b/learning_loop_node/trainer/tests/conftest.py
index 75937920..f07af98f 100644
--- a/learning_loop_node/trainer/tests/conftest.py
+++ b/learning_loop_node/trainer/tests/conftest.py
@@ -25,13 +25,13 @@ async def test_initialized_trainer_node():
     trainer = TestingTrainerLogic()
     node = TrainerNode(name='test', trainer_logic=trainer, uuid='NOD30000-0000-0000-0000-000000000000')
     trainer._node = node  # pylint: disable=protected-access
-    trainer.init_new_training(context=Context(organization='zauberzeug', project='demo'),
-                              details={'categories': [],
-                                       'id': '917d5c7f-403d-7e92-f95f-577f79c2273a',  # version 1.2 of demo project
-                                       'training_number': 0,
-                                       'resolution': 800,
-                                       'flip_rl': False,
-                                       'flip_ud': False})
+    trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
+                               details={'categories': [],
+                                        'id': '917d5c7f-403d-7e92-f95f-577f79c2273a',  # version 1.2 of demo project
+                                        'training_number': 0,
+                                        'resolution': 800,
+                                        'flip_rl': False,
+                                        'flip_ud': False})
 
     # pylint: disable=protected-access
     await node._on_startup()
@@ -47,13 +47,13 @@ async def test_initialized_trainer():
     # pylint: disable=protected-access
     await node._on_startup()
     trainer._node = node  # pylint: disable=protected-access
-    trainer.init_new_training(context=Context(organization='zauberzeug', project='demo'),
-                              details={'categories': [],
-                                       'id': '917d5c7f-403d-7e92-f95f-577f79c2273a',  # version 1.2 of demo project
-                                       'training_number': 0,
-                                       'resolution': 800,
-                                       'flip_rl': False,
-                                       'flip_ud': False})
+    trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
+                               details={'categories': [],
+                                        'id': '917d5c7f-403d-7e92-f95f-577f79c2273a',  # version 1.2 of demo project
+                                        'training_number': 0,
+                                        'resolution': 800,
+                                        'flip_rl': False,
+                                        'flip_ud': False})
 
     yield trainer
     # await node._on_shutdown()
diff --git a/learning_loop_node/trainer/tests/state_helper.py b/learning_loop_node/trainer/tests/state_helper.py
index a5b982ec..01c9001d 100644
--- a/learning_loop_node/trainer/tests/state_helper.py
+++ b/learning_loop_node/trainer/tests/state_helper.py
@@ -7,7 +7,7 @@
 
 def create_active_training_file(trainer: TrainerLogic, **kwargs) -> None:
     update_attributes(trainer._training, **kwargs)  # pylint: disable=protected-access
-    trainer.node.last_training_io.save(training=trainer.active_training)
+    trainer.node.last_training_io.save(training=trainer.training)
 
 
 async def assert_training_state(training: Training, state: str, timeout: float, interval: float) -> None:
diff --git a/learning_loop_node/trainer/tests/states/test_state_cleanup.py b/learning_loop_node/trainer/tests/states/test_state_cleanup.py
index 3326d156..9fbf076d 100644
--- a/learning_loop_node/trainer/tests/states/test_state_cleanup.py
+++ b/learning_loop_node/trainer/tests/states/test_state_cleanup.py
@@ -5,7 +5,7 @@
 async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state='ready_for_cleanup')
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
     trainer.active_training_io.save_detections(detections=[])
 
     trainer.active_training_io.save_detection_upload_progress(count=42)
@@ -16,7 +16,7 @@ async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic
     assert trainer.active_training_io.detection_upload_progress_exist() is True
     assert trainer.active_training_io.detections_upload_file_index_exists() is True
 
-    await trainer.clear_training()
+    await trainer._clear_training()
 
     assert trainer._training is None  # pylint: disable=protected-access
     assert trainer.node.last_training_io.exists() is False
diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py
index fbb8e9c0..efd9b966 100644
--- a/learning_loop_node/trainer/tests/states/test_state_detecting.py
+++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py
@@ -19,28 +19,28 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi
                                 model_id_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a')
     # trainer.load_active_training()
     _ = asyncio.get_running_loop().create_task(
-        trainer.perform_state('do_detections', TrainerState.Detecting,
-                              TrainerState.Detected, trainer._do_detections)
+        trainer._perform_state('do_detections', TrainerState.Detecting,
+                               TrainerState.Detected, trainer._do_detections)
     )
 
-    await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=10, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detecting, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detected, timeout=10, interval=0.001)
 
     assert trainer_has_error(trainer) is False
-    assert trainer.active_training.training_state == TrainerState.Detected
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.Detected
+    assert trainer.node.last_training_io.load() == trainer.training
     assert trainer.active_training_io.detections_exist()
 
 
 async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded)
-    trainer.init_from_last_training()
-    trainer.active_training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242'
+    trainer._init_from_last_training()
+    trainer.training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242'
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=5, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detecting, timeout=5, interval=0.001)
     await trainer.stop()
     await asyncio.sleep(0.1)
 
@@ -53,24 +53,24 @@ async def test_model_not_downloadable_error(test_initialized_trainer: TestingTra
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded,
                                 model_id_for_detecting='00000000-0000-0000-0000-000000000000')  # bad model id
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, 'detecting', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, 'train_model_uploaded', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, 'detecting', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, 'train_model_uploaded', timeout=1, interval=0.001)
     await asyncio.sleep(0.1)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.TrainModelUploaded
-    assert trainer.active_training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000'
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainModelUploaded
+    assert trainer.training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000'
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 def test_save_load_detections(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     detections = [get_dummy_detections(), get_dummy_detections()]
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
index 12e9b745..f5ef302b 100644
--- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
@@ -12,31 +12,31 @@ async def test_downloading_is_successful(test_initialized_trainer: TestingTraine
     create_active_training_file(trainer, training_state=TrainerState.DataDownloaded)
 
     trainer.model_format = 'mocked'
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     asyncio.get_running_loop().create_task(
-        trainer.perform_state('download_model',
-                              TrainerState.TrainModelDownloading,
-                              TrainerState.TrainModelDownloaded, trainer._download_model))
-    await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, 'train_model_downloaded', timeout=1, interval=0.001)
+        trainer._perform_state('download_model',
+                               TrainerState.TrainModelDownloading,
+                               TrainerState.TrainModelDownloaded, trainer._download_model))
+    await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, 'train_model_downloaded', timeout=1, interval=0.001)
 
-    assert trainer.active_training.training_state == TrainerState.TrainModelDownloaded
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainModelDownloaded
+    assert trainer.node.last_training_io.load() == trainer.training
 
     # file on disk
-    assert os.path.exists(f'{trainer.active_training.training_folder}/base_model.json')
-    assert os.path.exists(f'{trainer.active_training.training_folder}/file_1.txt')
-    assert os.path.exists(f'{trainer.active_training.training_folder}/file_2.txt')
+    assert os.path.exists(f'{trainer.training.training_folder}/base_model.json')
+    assert os.path.exists(f'{trainer.training.training_folder}/file_1.txt')
+    assert os.path.exists(f'{trainer.training.training_folder}/file_2.txt')
 
 
 async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state='data_downloaded')
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -49,13 +49,13 @@ async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic)
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.DataDownloaded,
                                 base_model_id='00000000-0000-0000-0000-000000000000')  # bad model id)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.DataDownloaded, timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.DataDownloaded, timeout=1, interval=0.001)
 
     assert trainer.errors.has_error_for('download_model')
     assert trainer._training is not None  # pylint: disable=protected-access
-    assert trainer.active_training.training_state == TrainerState.DataDownloaded
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.DataDownloaded
+    assert trainer.node.last_training_io.load() == trainer.training
diff --git a/learning_loop_node/trainer/tests/states/test_state_prepare.py b/learning_loop_node/trainer/tests/states/test_state_prepare.py
index 8c490c92..c6648ea4 100644
--- a/learning_loop_node/trainer/tests/states/test_state_prepare.py
+++ b/learning_loop_node/trainer/tests/states/test_state_prepare.py
@@ -15,22 +15,22 @@ def trainer_has_error(trainer: TrainerLogic):
 async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    await trainer.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare)
+    await trainer._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare)
     assert trainer_has_error(trainer) is False
-    assert trainer.active_training.training_state == TrainerState.DataDownloaded
-    assert trainer.active_training.data is not None
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.DataDownloaded
+    assert trainer.training.data is not None
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_abort_preparing(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -43,13 +43,13 @@ async def test_request_error(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, context=Context(
         organization='zauberzeug', project='some_bad_project'))
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=3, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.Initialized, timeout=3, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=3, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Initialized, timeout=3, interval=0.001)
 
     assert trainer_has_error(trainer)
     assert trainer._training is not None  # pylint: disable=protected-access
-    assert trainer.active_training.training_state == TrainerState.Initialized
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.Initialized
+    assert trainer.node.last_training_io.load() == trainer.training
diff --git a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
index cc145233..2fe586aa 100644
--- a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
+++ b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
@@ -23,14 +23,14 @@ async def test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic):
     # TODO this requires trainer to have _training
     # trainer.load_active_training()
     create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
     assert trainer_has_error(trainer) is False
-    assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_unsynced_model_available__sync_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture):
@@ -40,15 +40,15 @@ async def test_unsynced_model_available__sync_successful(test_initialized_traine
     await mock_socket_io_call(mocker, test_initialized_trainer_node, {'success': True})
     create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
 
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
     trainer.has_new_model = True
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer) is False
 #    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_unsynced_model_available__sio_not_connected(test_initialized_trainer_node: TrainerNode):
@@ -60,14 +60,14 @@ async def test_unsynced_model_available__sio_not_connected(test_initialized_trai
     assert test_initialized_trainer_node.sio_client.connected is False
     trainer.has_new_model = True
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_unsynced_model_available__request_is_not_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture):
@@ -79,14 +79,14 @@ async def test_unsynced_model_available__request_is_not_successful(test_initiali
     create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
 
     trainer.has_new_model = True
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_basic_mock(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture):
diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py
index 46a7f953..168a81d4 100644
--- a/learning_loop_node/trainer/tests/states/test_state_train.py
+++ b/learning_loop_node/trainer/tests/states/test_state_train.py
@@ -10,11 +10,11 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'start_training'
@@ -22,30 +22,30 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic
     # pylint: disable=protected-access
     assert trainer._executor is not None
     trainer._executor.stop()  # NOTE normally a training terminates itself
-    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.active_training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_stop_running_training(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
     await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)  # pylint: disable=protected-access
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'start_training'
 
     await trainer.stop()
-    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.active_training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrainerLogic):
@@ -53,20 +53,20 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain
 
     # NOTE e.g. when a node-computer is restarted
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
     trainer._can_resume = True  # pylint: disable=protected-access
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
     await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)  # pylint: disable=protected-access
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'resume'
 
     # pylint: disable=protected-access
     assert trainer._executor is not None
     trainer._executor.stop()  # NOTE normally a training terminates itself e.g
-    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.active_training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.training
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
index 757cf968..8567e69d 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
@@ -44,14 +44,14 @@ async def create_valid_detection_file(trainer: TrainerLogic, number_of_entries:
 async def test_upload_successful(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.Detected)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     await create_valid_detection_file(trainer)
     await asyncio.get_running_loop().create_task(
-        trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions))
+        trainer._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions))
 
-    assert trainer.active_training.training_state == TrainerState.ReadyForCleanup
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.ReadyForCleanup
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 @pytest.mark.asyncio
@@ -59,14 +59,14 @@ async def test_detection_upload_progress_is_stored(test_initialized_trainer: Tes
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.Detected)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     await create_valid_detection_file(trainer)
 
     assert trainer.active_training_io.load_detections_upload_file_index() == 0
     # await trainer.upload_detections()
     await asyncio.get_running_loop().create_task(
-        trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions))
+        trainer._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions))
 
     assert trainer.active_training_io.load_detection_upload_progress() == 0  # Progress is reset for every file
     assert trainer.active_training_io.load_detections_upload_file_index() == 1
@@ -77,7 +77,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.Detected)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     await create_valid_detection_file(trainer, 2, 0)
     await create_valid_detection_file(trainer, 2, 1)
@@ -91,7 +91,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test
     for i in range(skip_detections, len(detections), batch_size):
         batch_detections = detections[i:i+batch_size]
         # pylint: disable=protected-access
-        await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size)
+        await trainer.active_training_io._upload_detections(trainer.training.context, batch_detections, i + batch_size)
 
         expected_value = i + batch_size if i + batch_size < len(detections) else 0  # Progress is reset for every file
         assert trainer.active_training_io.load_detection_upload_progress() == expected_value
@@ -107,7 +107,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test
     for i in range(skip_detections, len(detections), batch_size):
         batch_detections = detections[i:i+batch_size]
         # pylint: disable=protected-access
-        await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size)
+        await trainer.active_training_io._upload_detections(trainer.training.context, batch_detections, i + batch_size)
 
         expected_value = i + batch_size if i + batch_size < len(detections) else 0  # Progress is reset for every file
         assert trainer.active_training_io.load_detection_upload_progress() == expected_value
@@ -120,16 +120,16 @@ async def test_bad_status_from_LearningLoop(test_initialized_trainer: TestingTra
 
     create_active_training_file(trainer, training_state=TrainerState.Detected, context=Context(
         organization='zauberzeug', project='some_bad_project'))
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
     trainer.active_training_io.save_detections([get_dummy_detections()])
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.Detected
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.Detected
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_other_errors(test_initialized_trainer: TestingTrainerLogic):
@@ -137,27 +137,27 @@ async def test_other_errors(test_initialized_trainer: TestingTrainerLogic):
 
     # e.g. missing detection file
     create_active_training_file(trainer, training_state=TrainerState.Detected)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.Detected
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.Detected
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.Detected)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
     await create_valid_detection_file(trainer)
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
index 9faa656f..ac147065 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
@@ -19,29 +19,29 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer
     mock_upload_model_for_training(mocker, 'new_model_id')
 
     create_active_training_file(trainer)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     train_task = asyncio.get_running_loop().create_task(
-        trainer.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, trainer._upload_model))
+        trainer._perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, trainer._upload_model))
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
     await train_task
 
     assert trainer_has_error(trainer) is False
-    assert trainer.active_training.training_state == TrainerState.TrainModelUploaded
-    assert trainer.active_training.model_id_for_detecting is not None
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainModelUploaded
+    assert trainer.training.model_id_for_detecting is not None
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_abort_upload_model(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -57,18 +57,18 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
     # TODO goes to finished because of the error
-    await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced
-    assert trainer.active_training.model_id_for_detecting is None
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
+    assert trainer.training.model_id_for_detecting is None
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_mock_loop_response_example(mocker: MockerFixture, test_initialized_trainer: TestingTrainerLogic):
@@ -77,7 +77,7 @@ async def test_mock_loop_response_example(mocker: MockerFixture, test_initialize
     mock_upload_model_for_training(mocker, 'new_model_id')
 
     create_active_training_file(trainer)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     # pylint: disable=protected-access
     result = await trainer._upload_model_return_new_model_uuid(Context(organization='zauberzeug', project='demo'))
diff --git a/learning_loop_node/trainer/tests/test_errors.py b/learning_loop_node/trainer/tests/test_errors.py
index 1ba85572..bdb40c95 100644
--- a/learning_loop_node/trainer/tests/test_errors.py
+++ b/learning_loop_node/trainer/tests/test_errors.py
@@ -9,30 +9,30 @@
 async def test_training_process_is_stopped_when_trainer_reports_error(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
-    trainer.init_from_last_training()
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    trainer._init_from_last_training()
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     trainer.error_msg = 'some_error'
-    await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
 
 
 async def test_log_can_provide_only_data_for_current_run(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
-    trainer.init_from_last_training()
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    trainer._init_from_last_training()
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
 
     assert trainer._executor is not None
     assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) == 1
 
     trainer.error_msg = 'some_error'
-    await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
     trainer.error_msg = None
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     await asyncio.sleep(1)
 
     assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) > 1
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 82fd8aad..c5b47df9 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -5,7 +5,7 @@
 import shutil
 from abc import abstractmethod
 from datetime import datetime
-from typing import Coroutine, Dict, List, Optional
+from typing import Coroutine, List, Optional
 
 from dacite import from_dict
 
@@ -22,30 +22,24 @@ def __init__(self, model_format: str) -> None:
         self.model_format: str = model_format
         # NOTE: String to be used in the file path for the model on the server:
         # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
-
+        self._detection_progress: Optional[float] = None
         self._executor: Optional[Executor] = None
         self.start_training_task: Optional[Coroutine] = None
 
+    @property
+    def detection_progress(self) -> Optional[float]:
+        return self._detection_progress
+
     @property
     def executor(self) -> Executor:
         assert self._executor is not None, 'executor must be set, call `run_training` first'
         return self._executor
 
-    @property
-    def hyperparameters(self) -> Optional[Dict]:
-        if self._training and self._training.data and self._training.data.hyperparameter:
-            information = {}
-            information['resolution'] = self._training.data.hyperparameter.resolution
-            information['flipRl'] = self._training.data.hyperparameter.flip_rl
-            information['flipUd'] = self._training.data.hyperparameter.flip_ud
-            return information
-        return None
-
     async def _train(self) -> None:
         previous_state = TrainerState.TrainModelDownloaded
         error_key = 'run_training'
-        self._executor = Executor(self.active_training.training_folder)
-        self.active_training.training_state = TrainerState.TrainingRunning
+        self._executor = Executor(self.training.training_folder)
+        self.training.training_state = TrainerState.TrainingRunning
 
         try:
             await self._start_training()
@@ -81,7 +75,7 @@ async def _train(self) -> None:
             logging.exception('Error in TrainingProcess')
             if self.executor.is_process_running():
                 self.executor.stop()
-            self.active_training.training_state = previous_state
+            self.training.training_state = previous_state
             raise
 
     async def _start_training(self):
@@ -89,7 +83,7 @@ async def _start_training(self):
         if self.can_resume():
             self.start_training_task = self.resume()
         else:
-            base_model_id = self.active_training.base_model_id
+            base_model_id = self.training.base_model_id
             if not is_valid_uuid4(base_model_id):  # TODO this check was done earlier!
                 assert isinstance(base_model_id, str)
                 # TODO this could be removed here and accessed via self.training.base_model_id
@@ -99,8 +93,8 @@ async def _start_training(self):
         await self.start_training_task
 
     async def _do_detections(self) -> None:
-        context = self.active_training.context
-        model_id = self.active_training.model_id_for_detecting
+        context = self.training.context
+        model_id = self.training.model_id_for_detecting
         assert model_id, 'model_id must be set'
         tmp_folder = f'/tmp/model_for_auto_detections_{model_id}_{self.model_format}'
 
@@ -108,22 +102,22 @@ async def _do_detections(self) -> None:
         os.makedirs(tmp_folder)
         logging.info(f'downloading detection model to {tmp_folder}')
 
-        await self.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format)
+        await self.node.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format)
         with open(f'{tmp_folder}/model.json', 'r') as f:
             model_information = from_dict(data_class=ModelInformation, data=json.load(f))
 
         project_folder = create_project_folder(context)
         image_folder = create_image_folder(project_folder)
-        self.data_exchanger.set_context(context)
+        self.node.data_exchanger.set_context(context)
         image_ids = []
         for state, p in zip(['inbox', 'annotate', 'review', 'complete'], [0.1, 0.2, 0.3, 0.4]):
-            self.detection_progress = p
+            self._detection_progress = p
             logging.info(f'fetching image ids of {state}')
-            new_ids = await self.data_exchanger.fetch_image_uuids(query_params=f'state={state}')
+            new_ids = await self.node.data_exchanger.fetch_image_uuids(query_params=f'state={state}')
             image_ids += new_ids
             logging.info(f'downloading {len(new_ids)} images')
-            await self.data_exchanger.download_images(new_ids, image_folder)
-        self.detection_progress = 0.42
+            await self.node.data_exchanger.download_images(new_ids, image_folder)
+        self._detection_progress = 0.42
         # await delete_corrupt_images(image_folder)
 
         images = await asyncio.get_event_loop().run_in_executor(None, images_for_ids, image_ids, image_folder)
@@ -133,7 +127,7 @@ async def _do_detections(self) -> None:
 
         batch_size = 200
         for idx, i in enumerate(range(0, num_images, batch_size)):
-            self.detection_progress = 0.5 + (i/num_images)*0.5
+            self._detection_progress = 0.5 + (i/num_images)*0.5
             batch_images = images[i:i+batch_size]
             batch_detections = await self._detect(model_information, batch_images, tmp_folder)
             self.active_training_io.save_detections(batch_detections, idx)
diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py
deleted file mode 100644
index 64349e3d..00000000
--- a/learning_loop_node/trainer/trainer_logic_abstraction.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import os
-import time
-from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Dict, List, Optional
-
-from socketio import AsyncClient
-
-from ..data_classes import Context, Errors, PretrainedModel, TrainerState, Training, TrainingData
-from ..data_exchanger import DataExchanger
-from ..loop_communication import LoopCommunicator
-from .io_helpers import ActiveTrainingIO, LastTrainingIO
-
-if TYPE_CHECKING:
-    from .trainer_node import TrainerNode
-
-
-class TrainerLogicAbstraction(ABC):
-
-    def __init__(self, model_format: str):
-
-        # NOTE: String to be used in the file path for the model on the server:
-        # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
-        self.model_format: str = model_format
-
-        self._node: Optional['TrainerNode'] = None  # type: ignore
-        self._last_training_io: Optional[LastTrainingIO] = None  # type: ignore
-        self.errors = Errors()
-
-        self._training: Optional[Training] = None
-        self._active_training_io: Optional[ActiveTrainingIO] = None
-
-        self.restart_after_training = os.environ.get('RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1']
-        self.keep_old_trainings = os.environ.get('KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1']
-        self.inference_batch_size = int(os.environ.get('INFERENCE_BATCH_SIZE', '10'))
-
-    @property
-    def node(self) -> 'TrainerNode':
-        assert self._node is not None, 'node should be set by TrainerNode before initialization'
-        return self._node
-
-    @property
-    def last_training_io(self) -> LastTrainingIO:
-        assert self._last_training_io is not None, 'last_training_io should be set by TrainerNode before initialization'
-        return self._last_training_io
-
-    @property
-    def data_exchanger(self) -> DataExchanger:
-        return self.node.data_exchanger
-
-    @property
-    def loop_communicator(self) -> LoopCommunicator:
-        return self.node.loop_communicator
-
-    @property
-    def node_uuid(self) -> str:
-        return self.node.uuid
-
-    @property
-    def sio_client(self) -> AsyncClient:
-        return self.node.sio_client
-
-    @property
-    def active_training_io(self) -> ActiveTrainingIO:
-        assert self._active_training_io is not None, 'active_training_io must be set, call `init` first'
-        return self._active_training_io
-
-    @property
-    def training_active(self) -> bool:
-        """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'"""
-        return self._training is not None and self._active_training_io is not None
-
-    @property
-    def state(self) -> str:
-        if (not self.training_active) or (self.active_training.training_state is None):
-            return TrainerState.Idle.value
-        else:
-            return self.active_training.training_state
-
-    @property
-    def active_training(self) -> Training:
-        assert self._training is not None, 'training must be initialized, call `init` first'
-        return self._training
-
-    @property
-    def training_uptime(self) -> Optional[float]:
-        if self.training_active:
-            return time.time() - self.active_training.start_time
-        return None
-
-    @property
-    def training_data(self) -> Optional[TrainingData]:
-        if self.training_active and self.active_training.data:
-            return self.active_training.data
-        return None
-
-    @property
-    def training_context(self) -> Optional[Context]:
-        if self.training_active:
-            return self.active_training.context
-        return None
-
-    # --- ABSTRACT PROPERTIES
-    # --------- implemented in TrainerLogicGeneric
-
-    @property
-    @abstractmethod
-    def general_progress(self) -> Optional[float]:
-        """Returns the general progress of the training per state or None if idle"""
-
-    # --------- implemented in TrainerLogic(with Executor)
-    @property
-    @abstractmethod
-    def hyperparameters(self) -> Optional[Dict]:
-        """Returns the currently used hyperparameters if available"""
-
-    # --------- not implemented in any abstract class
-    @property
-    @abstractmethod
-    def model_architecture(self) -> Optional[str]:
-        """Returns the architecture name of the model if available"""
-
-    @property
-    @abstractmethod
-    def provided_pretrained_models(self) -> List[PretrainedModel]:
-        """Returns the list of provided pretrained models"""
-
-    # --- ABSTRACT METHODS -----
-    # --------- implemented in TrainerLogicGeneric ---
-
-    @abstractmethod
-    async def on_shutdown(self):
-        """Called when the trainer is shut down"""
-
-    @abstractmethod
-    async def begin_training(self, organization: str, project: str, details: dict):
-        """Starts the training process"""
-
-    @abstractmethod
-    async def try_continue_run_if_incomplete(self) -> bool:
-        """Start training continuation if possible, returns True if continuation started"""
-
-    # --- implemented in TrainerLogic(with Executor) ---
-
-    @abstractmethod
-    async def stop(self):
-        """Stops the training process"""
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index 7221e6ec..d9abff34 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -3,37 +3,123 @@
 import logging
 import shutil
 import sys
-from abc import abstractmethod
+import time
+from abc import ABC, abstractmethod
 from dataclasses import asdict
-from typing import Callable, Coroutine, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Callable, Coroutine, Dict, List, Optional, Union
 
-from dacite import from_dict
 from fastapi.encoders import jsonable_encoder
 
-from ..data_classes import BasicModel, Category, Context, Hyperparameter, TrainerState, TrainingData, TrainingOut
+from ..data_classes import (BasicModel, Context, Errors, PretrainedModel, TrainerState, Training, TrainingData,
+                            TrainingOut)
 from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4
 from .downloader import TrainingsDownloader
-from .io_helpers import ActiveTrainingIO
-from .trainer_logic_abstraction import TrainerLogicAbstraction
+from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO
 
+if TYPE_CHECKING:
+    from .trainer_node import TrainerNode
 
-class TrainerLogicGeneric(TrainerLogicAbstraction):
 
-    def __init__(self, model_format: str) -> None:
-        super().__init__(model_format)
+class TrainerLogicGeneric(ABC):
+
+    def __init__(self, model_format: str):
+
+        # NOTE: model_format is used in the file path for the model on the server:
+        # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
+        self.model_format: str = model_format
+        self.errors = Errors()
+
         self.training_task: Optional[asyncio.Task] = None
-        self.detection_progress = 0.0
         self.shutdown_event: asyncio.Event = asyncio.Event()
 
+        self._node: Optional['TrainerNode'] = None  # type: ignore
+        self._last_training_io: Optional[LastTrainingIO] = None  # type: ignore
+
+        self._training: Optional[Training] = None
+        self._active_training_io: Optional[ActiveTrainingIO] = None
+        self._environment_vars = EnvironmentVars()
+
+    # ---------------------------------------- PROPERTIES TO AVOID CHECKING FOR NONE ----------------------------------------
+
+    @property
+    def node(self) -> 'TrainerNode':
+        assert self._node is not None, 'node should be set by TrainerNode before initialization'
+        return self._node
+
+    @property
+    def last_training_io(self) -> LastTrainingIO:
+        assert self._last_training_io is not None, 'last_training_io should be set by TrainerNode before initialization'
+        return self._last_training_io
+
+    @property
+    def active_training_io(self) -> ActiveTrainingIO:
+        assert self._active_training_io is not None, 'active_training_io must be set, call `init` first'
+        return self._active_training_io
+
+    @property
+    def training(self) -> Training:
+        assert self._training is not None, 'training must be initialized, call `init` first'
+        return self._training
+
+    @property
+    def training_data(self) -> Optional[TrainingData]:
+        if self.training_active and self.training.data:
+            return self.training.data
+        return None
+
+    @property
+    def training_context(self) -> Optional[Context]:
+        if self.training_active:
+            return self.training.context
+        return None
+    # ---------------------------------------- PROPERTIES ----------------------------------------
+
+    @property
+    def training_active(self) -> bool:
+        """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'.
+        """
+        return self._training is not None and self._active_training_io is not None
+
+    @property
+    def state(self) -> str:
+        """Returns the current state of the training. Used solely by the node in send_status().
+        """
+        if (not self.training_active) or (self.training.training_state is None):
+            return TrainerState.Idle.value
+        else:
+            return self.training.training_state
+
+    @property
+    def training_uptime(self) -> Optional[float]:
+        """Lifetime of the current Training object. Start time is set during initialization of the Training object.
+        """
+        if self.training_active:
+            return time.time() - self.training.start_time
+        return None
+
+    @property
+    def hyperparameters(self) -> Optional[Dict]:
+        """Used in sync_confusion_matrix and send_status to provide information about the training configuration.
+        """
+        if self._training and self._training.data and self._training.data.hyperparameter:
+            information = {}
+            information['resolution'] = self._training.data.hyperparameter.resolution
+            information['flipRl'] = self._training.data.hyperparameter.flip_rl
+            information['flipUd'] = self._training.data.hyperparameter.flip_ud
+            return information
+        return None
+
     @property
     def general_progress(self) -> Optional[float]:
-        """Represents the progress for different states."""
+        """Represents the progress for different states, should run from 0 to 100 for each state.
+        Note that training_progress and detection_progress need to be implemented in the specific trainer.
+        """
         if not self.training_active:
             return None
 
-        t_state = self.active_training.training_state
+        t_state = self.training.training_state
         if t_state == TrainerState.DataDownloading:
-            return self.data_exchanger.progress
+            return self.node.data_exchanger.progress
         if t_state == TrainerState.TrainingRunning:
             return self.training_progress
         if t_state == TrainerState.Detecting:
@@ -41,45 +127,83 @@ def general_progress(self) -> Optional[float]:
 
         return None
 
-    def init_new_training(self, context: Context, details: Dict) -> None:
-        """Called on `begin_training` event from the Learning Loop.
-        Note that details needs the entries 'categories' and 'training_number'"""
+    # ---------------------------------------- ABSTRACT PROPERTIES ----------------------------------------
 
-        project_folder = create_project_folder(context)
-        if not self.keep_old_trainings:
-            # NOTE: We delete all existing training folders because they are not needed anymore.
-            delete_all_training_folders(project_folder)
-        self._training = generate_training(project_folder, context)
-        self._training.data = TrainingData(categories=Category.from_list(details['categories']))
-        self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details)
-        self._training.training_number = details['training_number']
-        self._training.base_model_id = details['id']
-        self._training.training_state = TrainerState.Initialized
-        self._active_training_io = ActiveTrainingIO(
-            self._training.training_folder, self.loop_communicator, context)
-        logging.info(f'training initialized: {self._training}')
+    @property
+    @abstractmethod
+    def training_progress(self) -> Optional[float]:
+        """Represents the training progress."""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def detection_progress(self) -> Optional[float]:
+        """Represents the detection progress."""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def model_architecture(self) -> Optional[str]:
+        """Returns the architecture name of the model if available"""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def provided_pretrained_models(self) -> List[PretrainedModel]:
+        """Returns the list of provided pretrained models"""
+        raise NotImplementedError
+
+    # ---------------------------------------- METHODS ----------------------------------------
+
+    # NOTE: Trainings are started by the Learning Loop via the begin_training event
+    # or by the trainer itself via try_continue_run_if_incomplete.
+    # The trainer will then initialize a new training object and start the training loop.
+    # Initializing a new training object will create the folder structure for the training.
+    # The training loop will then run through the states of the training.
 
     async def try_continue_run_if_incomplete(self) -> bool:
+        """Tries to continue a training if the last training was not finished.
+        """
         if not self.training_active and self.last_training_io.exists():
+            self._init_from_last_training()
             logging.info('found incomplete training, continuing now.')
-            self.init_from_last_training()
-            asyncio.get_event_loop().create_task(self.run())
+            asyncio.get_event_loop().create_task(self._run())
             return True
         return False
 
-    def init_from_last_training(self) -> None:
+    def _init_from_last_training(self) -> None:
+        """Initializes a new training object from the last training saved on disc via last_training_io.
+        """
         self._training = self.last_training_io.load()
         assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder'
         self._active_training_io = ActiveTrainingIO(
-            self._training.training_folder, self.loop_communicator, self._training.context)
+            self._training.training_folder, self.node.loop_communicator, self._training.context)
 
     async def begin_training(self, organization: str, project: str, details: Dict) -> None:
-        """Called on `begin_training` event from the Learning Loop."""
+        """Called on `begin_training` event from the Learning Loop.
+        """
+        self._init_new_training(Context(organization=organization, project=project), details)
+        asyncio.get_event_loop().create_task(self._run())
+
+    def _init_new_training(self, context: Context, details: Dict) -> None:
+        """Called on `begin_training` event from the Learning Loop.
+        Note that details needs the entries 'categories' and 'training_number',
+        but also the hyperparameter entries.
+        """
+        project_folder = create_project_folder(context)
+        if not self._environment_vars.keep_old_trainings:
+            delete_all_training_folders(project_folder)
+        self._training = generate_training(project_folder, context)
+        self._training.set_values_from_data(details)
 
-        self.init_new_training(Context(organization=organization, project=project), details)
-        asyncio.get_event_loop().create_task(self.run())
+        self._active_training_io = ActiveTrainingIO(
+            self._training.training_folder, self.node.loop_communicator, context)
+        logging.info(f'new training initialized: {self._training}')
 
-    async def run(self) -> None:
+    async def _run(self) -> None:
+        """Runs the training loop as a task and handles its cancellation.
+        Started either via `begin_training` or `try_continue_run_if_incomplete`.
+        """
         self.errors.reset_all()
         try:
             self.training_task = asyncio.get_running_loop().create_task(self._training_loop())
@@ -87,46 +211,47 @@ async def run(self) -> None:
         except asyncio.CancelledError:
             if not self.shutdown_event.is_set():
                 logging.info('training task was cancelled but not by shutdown event')
-                self.active_training.training_state = TrainerState.ReadyForCleanup
-                self.last_training_io.save(self.active_training)
-                await self.clear_training()
+                self.training.training_state = TrainerState.ReadyForCleanup
+                self.last_training_io.save(self.training)
+                await self._clear_training()
         except Exception as e:
             logging.exception(f'Error in train: {e}')
 
     # ---------------------------------------- TRAINING STATES ----------------------------------------
 
     async def _training_loop(self) -> None:
-        """asyncio.CancelledError is catched in run"""
-
+        """Cycle through the training states until the training is finished or 
+        an asyncio.CancelledError is raised.
+        """
         assert self.training_active
 
         while self._training is not None:
-            tstate = self.active_training.training_state
-            logging.info(f'STATE LOOP: {tstate}, eerrors: {self.errors.errors}')
+            tstate = self.training.training_state
             await asyncio.sleep(0.6)  # Note: Required for pytests!
+
             if tstate == TrainerState.Initialized:  # -> DataDownloading -> DataDownloaded
-                await self.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare)
+                await self._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare)
             elif tstate == TrainerState.DataDownloaded:  # -> TrainModelDownloading -> TrainModelDownloaded
-                await self.perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model)
+                await self._perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model)
             elif tstate == TrainerState.TrainModelDownloaded:  # -> TrainingRunning -> TrainingFinished
-                await self.perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train)
+                await self._perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train)
             elif tstate == TrainerState.TrainingFinished:  # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced
-                await self.perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix)
+                await self._perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix)
             elif tstate == TrainerState.ConfusionMatrixSynced:  # -> TrainModelUploading -> TrainModelUploaded
-                await self.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model)
+                await self._perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model)
             elif tstate == TrainerState.TrainModelUploaded:  # -> Detecting -> Detected
-                await self.perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections)
+                await self._perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections)
             elif tstate == TrainerState.Detected:  # -> DetectionUploading -> ReadyForCleanup
-                await self.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions)
+                await self._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions)
             elif tstate == TrainerState.ReadyForCleanup:  # -> RESTART or TrainingFinished
-                await self.clear_training()
+                await self._clear_training()
                 self.may_restart()
 
-    async def perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False):
+    async def _perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False):
         await asyncio.sleep(0.1)
         logging.info(f'Performing state: {state_during}')
-        previous_state = self.active_training.training_state
-        self.active_training.training_state = state_during
+        previous_state = self.training.training_state
+        self.training.training_state = state_during
         await asyncio.sleep(0.1)
         if reset_early:
             self.errors.reset(error_key)
@@ -141,71 +266,78 @@ async def perform_state(self, error_key: str, state_during: TrainerState, state_
         except Exception as e:
             self.errors.set(error_key, str(e))
             logging.exception(f'Error in {state_during} - Exception:')
-            self.active_training.training_state = previous_state
+            self.training.training_state = previous_state
         else:
             if not reset_early:
                 self.errors.reset(error_key)
-            self.active_training.training_state = state_after
-            self.last_training_io.save(self.active_training)
+            self.training.training_state = state_after
+            self.last_training_io.save(self.training)
 
     async def _prepare(self) -> None:
-        self.data_exchanger.set_context(self.active_training.context)
-        downloader = TrainingsDownloader(self.data_exchanger)
-        image_data, skipped_image_count = await downloader.download_training_data(self.active_training.images_folder)
-        assert self.active_training.data is not None, 'training.data must be set'
-        self.active_training.data.image_data = image_data
-        self.active_training.data.skipped_image_count = skipped_image_count
+        """Downloads images to the images_folder and saves annotations to training.data.image_data.
+        """
+        self.node.data_exchanger.set_context(self.training.context)
+        downloader = TrainingsDownloader(self.node.data_exchanger)
+        image_data, skipped_image_count = await downloader.download_training_data(self.training.images_folder)
+        assert self.training.data is not None, 'training.data must be set'
+        self.training.data.image_data = image_data
+        self.training.data.skipped_image_count = skipped_image_count
 
     async def _download_model(self) -> None:
-        model_id = self.active_training.base_model_id
-        assert model_id is not None, 'model_id must be set'
-        if is_valid_uuid4(
-                self.active_training.base_model_id):  # TODO this checks if we continue a training -> make more explicit
+        """If training is continued, the model is downloaded from the Learning Loop to the training_folder.
+        The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training.
+        """
+        model_id = self.training.base_model_id
+        # TODO this checks if we continue a training -> make more explicit
+        if model_id and is_valid_uuid4(self.training.base_model_id):
             logging.info('loading model from Learning Loop')
             logging.info(f'downloading model {model_id} as {self.model_format}')
-            await self.data_exchanger.download_model(self.active_training.training_folder, self.active_training.context, model_id, self.model_format)
-            shutil.move(f'{self.active_training.training_folder}/model.json',
-                        f'{self.active_training.training_folder}/base_model.json')
+            await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, model_id, self.model_format)
+            shutil.move(f'{self.training.training_folder}/model.json',
+                        f'{self.training.training_folder}/base_model.json')
         else:
-            logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download')
+            logging.info(
+                f'base_model_id {model_id} is not a valid uuid4 (or no base model was not provided), skipping download')
 
-    async def _sync_confusion_matrix(self):
-        '''NOTE: This stage sets the errors explicitly because it may be used inside the training stage.'''
+    async def _sync_confusion_matrix(self) -> None:
+        """Synchronizes the confusion matrix with the Learning Loop via the update_training endpoint.
+        NOTE: This stage sets the errors explicitly because it may be used inside the training stage.
+        """
         error_key = 'sync_confusion_matrix'
         try:
             new_best_model = self.get_new_best_model()
-            if new_best_model and self.active_training.data:
-                new_training = TrainingOut(trainer_id=self.node_uuid,
+            if new_best_model and self.training.data:
+                new_training = TrainingOut(trainer_id=self.node.uuid,
                                            confusion_matrix=new_best_model.confusion_matrix,
-                                           train_image_count=self.active_training.data.train_image_count(),
-                                           test_image_count=self.active_training.data.test_image_count(),
+                                           train_image_count=self.training.data.train_image_count(),
+                                           test_image_count=self.training.data.test_image_count(),
                                            hyperparameters=self.hyperparameters)
                 await asyncio.sleep(0.1)  # NOTE needed for tests.
 
-                result = await self.sio_client.call('update_training', (
-                    self.active_training.context.organization, self.active_training.context.project, jsonable_encoder(new_training)))
+                result = await self.node.sio_client.call('update_training', (
+                    self.training.context.organization, self.training.context.project, jsonable_encoder(new_training)))
                 if isinstance(result,  dict) and result['success']:
-                    logging.info(f'successfully updated training {asdict(new_training)}')
+                    logging.info(
+                        f'successfully updated training {asdict(new_training)}')
                     self.on_model_published(new_best_model)
                 else:
-                    raise Exception(f'Error for update_training: Response from loop was : {result}')
+                    raise Exception(
+                        f'Error for update_training: Response from loop was : {result}')
         except Exception as e:
             logging.exception('Error during confusion matrix syncronization')
             self.errors.set(error_key, str(e))
             raise
         self.errors.reset(error_key)
 
-    async def _upload_model(self) -> None | bool:
-        """Returns True if the training should be cleaned up."""
-
-        new_model_id = await self._upload_model_return_new_model_uuid(self.active_training.context)
+    async def _upload_model(self) -> None:
+        """Uploads the latest model to the Learning Loop.
+        """
+        new_model_id = await self._upload_model_return_new_model_uuid(self.training.context)
         if new_model_id is None:
-            self.active_training.training_state = TrainerState.ReadyForCleanup
+            self.training.training_state = TrainerState.ReadyForCleanup
             logging.error('could not upload model - maybe training failed.. cleaning up')
-            return True
         logging.info(f'Successfully uploaded model and received new model id: {new_model_id}')
-        self.active_training.model_id_for_detecting = new_model_id
-        return None
+        self.training.model_id_for_detecting = new_model_id
 
     async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
         """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file.
@@ -213,6 +345,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
         The conversion from .wts to .engine is done by the detector (needs to be done on target hardware).
         Note that trainer may train with different classes, which is why we send an initial model.json file.
         """
+        # NOTE: I guess this is in executor because originally the conversion happened here..
         files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files)
         if files is None:
             return None
@@ -225,10 +358,10 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
 
         model_uuid = None
         for file_format in [f for f in files if f not in already_uploaded_formats]:
-            _files = files[file_format] + [self.dump_categories_to_json()]
+            _files = files[file_format] + [self._dump_categories_to_json()]
             assert len([f for f in _files if 'model.json' in f]) == 1, "model.json must be included exactly once"
 
-            model_uuid = await self.data_exchanger.upload_model_get_uuid(context, _files, self.active_training.training_number, file_format)
+            model_uuid = await self.node.data_exchanger.upload_model_get_uuid(context, _files, self.training.training_number, file_format)
             if model_uuid is None:
                 return None
 
@@ -237,20 +370,23 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
 
         return model_uuid
 
-    def dump_categories_to_json(self) -> str:
+    def _dump_categories_to_json(self) -> str:
+        """Dumps the categories to a json file and returns the path to the file.
+        """
         content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None
         json_path = '/tmp/model.json'
         with open(json_path, 'w') as f:
             json.dump(content, f)
         return json_path
 
-    async def clear_training(self):
+    async def _clear_training(self):
+        """Clears the training data after a training has finished.
+        """
         self.active_training_io.delete_detections()
         self.active_training_io.delete_detection_upload_progress()
         self.active_training_io.delete_detections_upload_file_index()
-        await self.clear_training_data(self.active_training.training_folder)
+        await self.clear_training_data(self.training.training_folder)
         self.last_training_io.delete()
-        # self.training.training_state = TrainingState.TrainingFinished
 
         await self.node.send_status()
         self._training = None
@@ -258,7 +394,9 @@ async def clear_training(self):
     # ---------------------------------------- OTHER METHODS ----------------------------------------
 
     def may_restart(self) -> None:
-        if self.restart_after_training:
+        """If the environment variable RESTART_AFTER_TRAINING is set, the trainer will restart after a training.
+        """
+        if self._environment_vars.restart_after_training:
             logging.info('restarting')
             sys.exit(0)
         else:
@@ -269,49 +407,64 @@ async def on_shutdown(self) -> None:
         await self.stop()
         await self.stop()
 
-    # ---------------------------------------- ABSTRACT PROPERTIES ----------------------------------------
-
-    @property
-    @abstractmethod
-    def training_progress(self) -> Optional[float]:
-        """Represents the training progress."""
-        raise NotImplementedError
+    async def stop(self):
+        """Stops the training process by canceling training task.
+        """
+        if not self.training_active:
+            return
+        if self.training_task:
+            logging.info('cancelling training task')
+            if self.training_task.cancel():
+                try:
+                    await self.training_task
+                except asyncio.CancelledError:
+                    pass
+                logging.info('cancelled training task')
+                self.may_restart()
 
     # ---------------------------------------- ABSTRACT METHODS ----------------------------------------
 
     @abstractmethod
     async def _train(self) -> None:
-        '''Should be used to execute a training.
+        """Should be used to execute a training.
+        At this point, images are already downloaded to the images_folder and annotations are saved in training.data.image_data.
+        If a training is continued, the model is already downloaded.
         The model should be synchronized with the Learning Loop via self._sync_confusion_matrix() every now and then.
-        asyncio.CancelledError should be catched and re-raised.'''
+        asyncio.CancelledError should be caught and re-raised.
+        """
+        raise NotImplementedError
 
     @abstractmethod
     async def _do_detections(self) -> None:
-        '''Should be used to execute detections.
+        """Should be used to execute detections.
         active_training_io.save_detections(...) should be used to store the detections.
-        asyncio.CancelledError should be catched and re-raised.'''
+        asyncio.CancelledError should be caught and re-raised.
+        """
+        raise NotImplementedError
 
     @abstractmethod
     def get_new_best_model(self) -> Optional[BasicModel]:
-        '''Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe.
+        """Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is available.
         Returns None if no new model could be found. Otherwise BasicModel(confusion_matrix, meta_information).
         `confusion_matrix` contains a dict of all classes:
             - The classes must be identified by their id, not their name.
             - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives).
         `meta_information` can hold any data which is helpful for self.on_model_published to store weight file etc for later upload via self.get_model_files
-        '''
+        """
+        raise NotImplementedError
 
     @abstractmethod
     def on_model_published(self, basic_model: BasicModel) -> None:
-        '''Called after a BasicModel has been successfully send to the Learning Loop.
-        The files for this model should be stored.
-        self.get_latest_model_files is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop.
-        In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model
-        '''
+        """Called after the confusion matrix corresponding to BasicModel has been successfully sent to the Learning Loop.
+        The respective files for this model should be stored so they can be later uploaded in get_latest_model_files.
+        """
+        raise NotImplementedError
 
     @abstractmethod
     def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]:
-        '''Called when the Learning Loop requests to backup the latest model for the training.
+        """Called when the Learning Loop requests to backup the latest model for the training.
+        This function is used to gather all files needed for transferring the actual data from the trainer node to the Learning Loop.
+        In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model
         Should return a list of file paths which describe the model.
         These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.)
         and will be stored in the Learning Loop unter the format of this trainer.
@@ -319,9 +472,13 @@ def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str
         For example "model.pt" for pytorch or "model.weights" for darknet/yolo.
 
         If a trainer can also generate other formats (for example for an detector),
-        a dictionary mapping format -> list of files can be returned.'''
+        a dictionary mapping format -> list of files can be returned.
+        """
+        raise NotImplementedError
 
     @abstractmethod
     async def clear_training_data(self, training_folder: str) -> None:
-        '''Called after a training has finished. Deletes all data that is not needed anymore after a training run. 
-        This can be old weightfiles or any additional files.'''
+        """Called after a training has finished. Deletes all data that is not needed anymore after a training run. 
+        This can be old weightfiles or any additional files.
+        """
+        raise NotImplementedError
diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py
index c87124c1..6112d449 100644
--- a/learning_loop_node/trainer/trainer_node.py
+++ b/learning_loop_node/trainer/trainer_node.py
@@ -9,12 +9,12 @@
 from ..node import Node
 from .io_helpers import LastTrainingIO
 from .rest import backdoor_controls, controls
-from .trainer_logic_abstraction import TrainerLogicAbstraction
+from .trainer_logic_generic import TrainerLogicGeneric
 
 
 class TrainerNode(Node):
 
-    def __init__(self, name: str, trainer_logic: TrainerLogicAbstraction, uuid: Optional[str] = None, use_backdoor_controls: bool = False):
+    def __init__(self, name: str, trainer_logic: TrainerLogicGeneric, uuid: Optional[str] = None, use_backdoor_controls: bool = False):
         super().__init__(name, uuid, 'trainer')
         trainer_logic._node = self
         self.trainer_logic = trainer_logic
diff --git a/mock_trainer/app_code/progress_simulator.py b/mock_trainer/app_code/progress_simulator.py
index 6eaf5ced..042a0b29 100644
--- a/mock_trainer/app_code/progress_simulator.py
+++ b/mock_trainer/app_code/progress_simulator.py
@@ -10,8 +10,8 @@ def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -
         return None
 
     confusion_matrix = {}
-    assert trainer.active_training.data is not None
-    for category in trainer.active_training.data.categories:
+    assert trainer.training.data is not None
+    for category in trainer.training.data.categories:
         try:
             minimum = latest_known_confusion_matrix[category.id]['tp']
         except Exception:
diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py
index 42fbfe8b..5b5aa461 100644
--- a/mock_trainer/app_code/tests/test_detections.py
+++ b/mock_trainer/app_code/tests/test_detections.py
@@ -29,7 +29,7 @@ async def test_all(setup_test_project1, glc: LoopCommunicator):  # pylint: disab
                'flip_rl': False,
                'flip_ud': False}
     trainer._node = node  # pylint: disable=protected-access
-    trainer.init_new_training(context=context, details=details)
+    trainer._init_new_training(context=context, details=details)
 
     project_folder = create_project_folder(context)
     training = generate_training(project_folder, context)
diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py
index f20797b0..fecbe868 100644
--- a/mock_trainer/app_code/tests/test_mock_trainer.py
+++ b/mock_trainer/app_code/tests/test_mock_trainer.py
@@ -37,6 +37,6 @@ async def test_get_new_model(setup_test_project2):
         project_folder="",
         images_folder="",
         training_folder="",)
-    mock_trainer.active_training.data = TrainingData(image_data=[], categories=[])
+    mock_trainer.training.data = TrainingData(image_data=[], categories=[])
     model = mock_trainer.get_new_best_model()
     assert model is not None

From 151393c63bb9e4f88f773e59ff5ae7ec85b647af Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 22 Mar 2024 20:45:15 +0100
Subject: [PATCH 20/62] fix all mypi and linting issues

---
 .vscode/settings.json                         | 11 +++-
 .../annotation/annotator_logic.py             |  4 +-
 learning_loop_node/data_classes/__init__.py   |  4 +-
 learning_loop_node/data_classes/detections.py |  9 +++-
 learning_loop_node/data_classes/general.py    |  4 --
 learning_loop_node/data_classes/training.py   | 22 ++++----
 learning_loop_node/data_exchanger.py          | 10 ++--
 learning_loop_node/detector/__init__.py       |  1 -
 learning_loop_node/detector/detector_node.py  |  4 +-
 .../inbox_filter/cam_observation_history.py   | 11 ++--
 learning_loop_node/detector/outbox.py         |  1 -
 learning_loop_node/detector/tests/conftest.py |  1 -
 .../tests/test_client_communication.py        |  4 +-
 .../detector/tests/test_outbox.py             |  2 +
 learning_loop_node/globals.py                 |  4 +-
 .../helpers/gdrive_downloader.py              |  2 +-
 learning_loop_node/helpers/misc.py            | 24 +++------
 learning_loop_node/loop_communication.py      | 22 ++++----
 learning_loop_node/node.py                    |  2 +-
 learning_loop_node/tests/test_helper.py       |  1 -
 learning_loop_node/trainer/executor.py        |  2 +-
 learning_loop_node/trainer/io_helpers.py      | 10 ++--
 .../trainer/rest/backdoor_controls.py         |  1 -
 learning_loop_node/trainer/rest/controls.py   |  2 +
 learning_loop_node/trainer/tests/conftest.py  | 19 ++-----
 .../tests/states/test_state_cleanup.py        |  4 +-
 .../tests/states/test_state_detecting.py      |  3 +-
 .../states/test_state_download_train_model.py |  2 +
 .../tests/states/test_state_prepare.py        |  1 +
 .../test_state_sync_confusion_matrix.py       |  2 +
 .../trainer/tests/states/test_state_train.py  | 10 ++--
 .../states/test_state_upload_detections.py    |  1 +
 .../tests/states/test_state_upload_model.py   |  1 +
 .../trainer/tests/test_errors.py              |  2 +
 .../trainer/tests/testing_trainer_logic.py    | 20 +++----
 learning_loop_node/trainer/trainer_logic.py   | 14 ++---
 .../trainer/trainer_logic_generic.py          | 52 +++++++++---------
 .../trainer/training_syncronizer.py           | 53 -------------------
 mock_trainer/app_code/mock_trainer_logic.py   | 51 +++++++++---------
 mock_trainer/app_code/progress_simulator.py   |  6 +--
 .../app_code/tests/test_mock_trainer.py       |  4 +-
 41 files changed, 173 insertions(+), 230 deletions(-)
 delete mode 100644 learning_loop_node/trainer/training_syncronizer.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 45eb6e46..ff950a35 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -9,10 +9,19 @@
     "--disable=C0111", // Missing docstring (in function/class/method)
     "--disable=C0114", // Missing module docstring
     "--disable=C0301", // Line too long (exceeds character limit)
+    "--disable=W0511", // TODO/FIXME not being used
     "--disable=W0718", // Catching too general exception
     "--disable=W0719", // Raising too general exception
     "--disable=W1203", // Use % formatting in logging functions and pass the % parameters as arguments
-    "--disable=W1514" // Using open without explicitly specifying an encoding
+    "--disable=W1514", // Using open without explicitly specifying an encoding
+    "--disable=R0902", // Too many instance attributes
+    "--disable=R0903", // Too few public methods
+    "--disable=R0912", // Too many branches
+    "--disable=R0913", // Too many arguments
+    "--disable=R0914", // Too many local variables
+    "--disable=R0915", // Too many statements
+    "--disable=R1732", // Consider using with for resource-allocating operations
+    "--disable=R0801" // Similar lines in 2 files
   ],
   "[python]": {
     "editor.defaultFormatter": "ms-python.autopep8",
diff --git a/learning_loop_node/annotation/annotator_logic.py b/learning_loop_node/annotation/annotator_logic.py
index 932abce9..a80cc13b 100644
--- a/learning_loop_node/annotation/annotator_logic.py
+++ b/learning_loop_node/annotation/annotator_logic.py
@@ -7,10 +7,10 @@
 
 class AnnotatorLogic():
 
-    def __init__(self):
+    def __init__(self) -> None:
         self._node: Optional[Node] = None
 
-    def init(self, node: Node):
+    def init(self, node: Node) -> None:
         self._node = node
 
     @abstractmethod
diff --git a/learning_loop_node/data_classes/__init__.py b/learning_loop_node/data_classes/__init__.py
index 0e0a10e9..32188896 100644
--- a/learning_loop_node/data_classes/__init__.py
+++ b/learning_loop_node/data_classes/__init__.py
@@ -4,5 +4,5 @@
 from .general import (AnnotationNodeStatus, Category, CategoryType, Context, DetectionStatus, ErrorConfiguration,
                       ModelInformation, NodeState, NodeStatus)
 from .socket_response import SocketResponse
-from .training import (BasicModel, Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData,
-                       TrainingError, TrainingOut, TrainingStatus)
+from .training import (Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData,
+                       TrainingError, TrainingOut, TrainingStateData, TrainingStatus)
diff --git a/learning_loop_node/data_classes/detections.py b/learning_loop_node/data_classes/detections.py
index 21924720..0872b256 100644
--- a/learning_loop_node/data_classes/detections.py
+++ b/learning_loop_node/data_classes/detections.py
@@ -13,8 +13,11 @@
 
 @dataclass(**KWONLY_SLOTS)
 class BoxDetection():
+    """Coordinates according to COCO format. x,y is the top left corner of the box.
+    x increases to the right, y increases downwards.
+    """
     category_name: str
-    x: int  # TODO add definition of x,y,w,h
+    x: int
     y: int
     width: int
     height: int
@@ -47,6 +50,8 @@ def __str__(self):
 
 @dataclass(**KWONLY_SLOTS)
 class PointDetection():
+    """Coordinates according to COCO format. x,y is the center of the point.
+    x increases to the right, y increases downwards."""
     category_name: str
     x: float
     y: float
@@ -111,7 +116,7 @@ class Detections():
     point_detections: List[PointDetection] = field(default_factory=list)
     segmentation_detections: List[SegmentationDetection] = field(default_factory=list)
     classification_detections: List[ClassificationDetection] = field(default_factory=list)
-    tags: Optional[List[str]] = field(default_factory=list)
+    tags: List[str] = field(default_factory=list)
     date: Optional[str] = field(default_factory=current_datetime)
     image_id: Optional[str] = None  # used for detection of trainers
 
diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py
index 9d5c893e..41141395 100644
--- a/learning_loop_node/data_classes/general.py
+++ b/learning_loop_node/data_classes/general.py
@@ -34,10 +34,6 @@ def from_list(values: List[dict]) -> List['Category']:
         return [from_dict(data_class=Category, data=value) for value in values]
 
 
-def create_category(identifier: str, name: str, ctype: Union[CategoryType, str]):  # TODO: This is probably unused
-    return Category(id=identifier, name=name, description='', hotkey='', color='', type=ctype, point_size=None)
-
-
 @dataclass(**KWONLY_SLOTS)
 class Context():
     organization: str
diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index 2ce1c95b..4df5a289 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -72,7 +72,7 @@ class TrainerState(str, Enum):
 
 @dataclass(**KWONLY_SLOTS)
 class TrainingStatus():
-    id: str  # TODO this must not be changed, but tests wont detect it -> update tests!
+    id: str  # NOTE this must not be changed, but tests won't detect a change -> update tests!
     name: str
     state: Optional[str]
     errors: Optional[Dict]
@@ -87,7 +87,7 @@ class TrainingStatus():
     architecture: Optional[str] = None
     context: Optional[Context] = None
 
-    def short_str(self):
+    def short_str(self) -> str:
         prgr = f'{self.progress * 100:.0f}%' if self.progress else ''
         trtesk = f'{self.train_image_count}/{self.test_image_count}/{self.skipped_image_count}' if self.train_image_count else 'n.a.'
         cntxt = f'{self.context.organization}/{self.context.project}' if self.context else ''
@@ -106,14 +106,14 @@ class Training():
     training_folder: str  # f'{project_folder}/trainings/{trainings_id}'
     start_time: float = field(default_factory=time.time)
 
-    base_model_id: Optional[str] = None  # model uuid to download into base_model.json
+    base_model_id: Optional[str] = None  # model uuid to download (to continue training)
     data: Optional[TrainingData] = None
     training_number: Optional[int] = None
     training_state: Optional[str] = None
     model_id_for_detecting: Optional[str] = None
     hyperparameters: Optional[Dict] = None
 
-    def set_values_from_data(self, data: Dict):
+    def set_values_from_data(self, data: Dict) -> None:
         self.data = TrainingData(categories=Category.from_list(data['categories']))
         self.data.hyperparameter = Hyperparameter.from_data(data=data)
         self.training_number = data['training_number']
@@ -123,7 +123,7 @@ def set_values_from_data(self, data: Dict):
 
 @dataclass(**KWONLY_SLOTS)
 class TrainingOut():
-    confusion_matrix: Optional[Dict] = None
+    confusion_matrix: Optional[Dict] = None  # This is actually just class-wise metrics
     train_image_count: Optional[int] = None
     test_image_count: Optional[int] = None
     trainer_id: Optional[str] = None
@@ -131,8 +131,8 @@ class TrainingOut():
 
 
 @dataclass(**KWONLY_SLOTS)
-class BasicModel():
-    confusion_matrix: Optional[Dict] = None
+class TrainingStateData():
+    confusion_matrix: Optional[Dict] = None  # This is actually just class-wise metrics
     meta_information: Optional[Dict] = None
 
 
@@ -148,8 +148,8 @@ class Model():
 
 
 class Errors():
-    def __init__(self):
-        self._errors: Dict = {}
+    def __init__(self) -> None:
+        self._errors: Dict[str, str] = {}
 
     def set(self, key: str, value: str):
         self._errors[key] = value
@@ -158,7 +158,7 @@ def set(self, key: str, value: str):
     def errors(self) -> Dict:
         return self._errors
 
-    def reset(self, key: str):
+    def reset(self, key: str) -> None:
         try:
             del self._errors[key]
         except AttributeError:
@@ -166,7 +166,7 @@ def reset(self, key: str):
         except KeyError:
             pass
 
-    def reset_all(self):
+    def reset_all(self) -> None:
         self._errors = {}
 
     def has_error_for(self, key: str) -> bool:
diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py
index 840a0fe9..8a5633d4 100644
--- a/learning_loop_node/data_exchanger.py
+++ b/learning_loop_node/data_exchanger.py
@@ -161,8 +161,8 @@ async def upload_model_get_uuid(self, context: Context, files: List[str], traini
                 f'---- could not upload model for training {training_number} and format {mformat}. Details: {response.text}')
             response.raise_for_status()
             return None
-        else:
-            uploaded_model = response.json()
-            logging.info(
-                f'---- uploaded model for training {training_number} and format {mformat}. Model id is {uploaded_model}')
-            return uploaded_model['id']
+
+        uploaded_model = response.json()
+        logging.info(
+            f'---- uploaded model for training {training_number} and format {mformat}. Model id is {uploaded_model}')
+        return uploaded_model['id']
diff --git a/learning_loop_node/detector/__init__.py b/learning_loop_node/detector/__init__.py
index 8b137891..e69de29b 100644
--- a/learning_loop_node/detector/__init__.py
+++ b/learning_loop_node/detector/__init__.py
@@ -1 +0,0 @@
-
diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py
index 18b8ab6c..db657698 100644
--- a/learning_loop_node/detector/detector_node.py
+++ b/learning_loop_node/detector/detector_node.py
@@ -186,7 +186,9 @@ async def _check_for_update(self) -> None:
             if not update_to_model_id:
                 self.log.info('could not check for updates')
                 return
-            if self.detector_logic.is_initialized:  # TODO: solve race condition !!!
+
+            # TODO: solve race condition (it should not be required to recheck if model_info is not None, but it is!)
+            if self.detector_logic.is_initialized:
                 model_info = self.detector_logic._model_info  # pylint: disable=protected-access
                 if model_info is not None:
                     self.log.info(f'Current model: {model_info.version} with id {model_info.id}')
diff --git a/learning_loop_node/detector/inbox_filter/cam_observation_history.py b/learning_loop_node/detector/inbox_filter/cam_observation_history.py
index 88bbe881..a87c72ee 100644
--- a/learning_loop_node/detector/inbox_filter/cam_observation_history.py
+++ b/learning_loop_node/detector/inbox_filter/cam_observation_history.py
@@ -1,20 +1,17 @@
 import os
 from typing import List, Union
 
-from learning_loop_node.data_classes import (BoxDetection,
-                                             ClassificationDetection,
-                                             Detections, Observation,
-                                             PointDetection,
-                                             SegmentationDetection)
+from learning_loop_node.data_classes import (BoxDetection, ClassificationDetection, Detections, Observation,
+                                             PointDetection, SegmentationDetection)
 
 
 class CamObservationHistory:
-    def __init__(self):
+    def __init__(self) -> None:
         self.reset_time = 3600
         self.recent_observations: List[Observation] = []
         self.iou_threshold = 0.5
 
-    def forget_old_detections(self):
+    def forget_old_detections(self) -> None:
         self.recent_observations = [detection
                                     for detection in self.recent_observations
                                     if not detection.is_older_than(self.reset_time)]
diff --git a/learning_loop_node/detector/outbox.py b/learning_loop_node/detector/outbox.py
index 23138c85..ca1a200d 100644
--- a/learning_loop_node/detector/outbox.py
+++ b/learning_loop_node/detector/outbox.py
@@ -53,7 +53,6 @@ def save(self, image: bytes, detections: Optional[Detections] = None, tags: Opti
         with open(tmp + '/image.json', 'w') as f:
             json.dump(jsonable_encoder(asdict(detections)), f)
 
-        # TODO sometimes No such file or directory: '/tmp/learning_loop_lib_data/tmp/2023-09-07_13:27:38.399/image.jpg'
         with open(tmp + '/image.jpg', 'wb') as f:
             f.write(image)
 
diff --git a/learning_loop_node/detector/tests/conftest.py b/learning_loop_node/detector/tests/conftest.py
index ad183fe2..1611f265 100644
--- a/learning_loop_node/detector/tests/conftest.py
+++ b/learning_loop_node/detector/tests/conftest.py
@@ -12,7 +12,6 @@
 import uvicorn
 
 from learning_loop_node import DetectorNode
-from learning_loop_node.data_classes.general import Category, ModelInformation
 from learning_loop_node.detector.outbox import Outbox
 from learning_loop_node.globals import GLOBALS
 
diff --git a/learning_loop_node/detector/tests/test_client_communication.py b/learning_loop_node/detector/tests/test_client_communication.py
index 97e3f074..24fbd095 100644
--- a/learning_loop_node/detector/tests/test_client_communication.py
+++ b/learning_loop_node/detector/tests/test_client_communication.py
@@ -2,7 +2,7 @@
 import json
 
 import pytest
-import requests  # type: ignore
+import requests
 
 from learning_loop_node import DetectorNode
 from learning_loop_node.data_classes import ModelInformation
@@ -101,4 +101,4 @@ async def test_about_endpoint(test_detector_node: DetectorNode):
     assert response_dict['operation_mode'] == 'idle'
     assert response_dict['state'] == 'online'
     assert response_dict['target_model'] == '1.1'
-    assert any([c.name == 'purple point' for c in model_information.categories])
+    assert any(c.name == 'purple point' for c in model_information.categories)
diff --git a/learning_loop_node/detector/tests/test_outbox.py b/learning_loop_node/detector/tests/test_outbox.py
index 9db7dd09..adf56744 100644
--- a/learning_loop_node/detector/tests/test_outbox.py
+++ b/learning_loop_node/detector/tests/test_outbox.py
@@ -9,6 +9,8 @@
 from learning_loop_node.detector.detector_node import DetectorNode
 from learning_loop_node.detector.outbox import Outbox
 
+# pylint: disable=redefined-outer-name
+
 
 @pytest.fixture()
 def test_outbox():
diff --git a/learning_loop_node/globals.py b/learning_loop_node/globals.py
index eee9511a..336df3fa 100644
--- a/learning_loop_node/globals.py
+++ b/learning_loop_node/globals.py
@@ -1,8 +1,8 @@
 
 class Globals():
-    def __init__(self):
+    def __init__(self) -> None:
         self.data_folder: str = '/data'
-        self.detector_port: int = 5004  # TODO move to tests
+        self.detector_port: int = 5004  # NOTE used for tests
 
 
 GLOBALS = Globals()
diff --git a/learning_loop_node/helpers/gdrive_downloader.py b/learning_loop_node/helpers/gdrive_downloader.py
index 8e5b3120..deefed68 100755
--- a/learning_loop_node/helpers/gdrive_downloader.py
+++ b/learning_loop_node/helpers/gdrive_downloader.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import requests
+import requests  # type: ignore
 
 # https://stackoverflow.com/a/39225272/4082686
 
diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py
index 5b996092..0f75509b 100644
--- a/learning_loop_node/helpers/misc.py
+++ b/learning_loop_node/helpers/misc.py
@@ -55,7 +55,7 @@ def _handle_task_result(task: asyncio.Task,
         logger.exception(message, *message_args)
 
 
-def get_free_memory_mb() -> float:  # TODO check if this is used
+def get_free_memory_mb() -> float:  # NOTE used by yolov5
     pynvml.nvmlInit()
     h = pynvml.nvmlDeviceGetHandleByIndex(0)
     info = pynvml.nvmlDeviceGetMemoryInfo(h)
@@ -89,15 +89,7 @@ async def delete_corrupt_images(image_folder: str, check_jpeg: bool = False) ->
 
 
 def create_resource_paths(organization_name: str, project_name: str, image_ids: List[str]) -> Tuple[List[str], List[str]]:
-    # TODO: experimental:
     return [f'/{organization_name}/projects/{project_name}/images/{id}/main' for id in image_ids], image_ids
-    # if not image_ids:
-    #     return [], []
-    # url_ids: List[Tuple(str, str)] = [(f'/{organization_name}/projects/{project_name}/images/{id}/main', id)
-    #                                   for id in image_ids]
-    # urls, ids = list(map(list, zip(*url_ids)))
-
-    # return urls, ids
 
 
 def create_image_folder(project_folder: str) -> str:
@@ -140,17 +132,17 @@ async def wrapper_ensure_socket_response(*args, **kwargs):
 
             if isinstance(value, str):
                 return asdict(SocketResponse.for_success(value))
-            elif isinstance(value, bool):
+            if isinstance(value, bool):
                 return asdict(SocketResponse.from_bool(value))
-            elif isinstance(value, SocketResponse):
+            if isinstance(value, SocketResponse):
                 return value
-            elif (args[0] in ['connect', 'disconnect', 'connect_error']):
+            if (args[0] in ['connect', 'disconnect', 'connect_error']):
                 return value
-            elif value is None:
+            if value is None:
                 return None
-            else:
-                raise Exception(
-                    f"Return type for sio must be str, bool, SocketResponse or None', but was {type(value)}'")
+
+            raise Exception(
+                f"Return type for sio must be str, bool, SocketResponse or None', but was {type(value)}'")
         except Exception as e:
             logging.exception(f'An error occured for {args[0]}')
 
diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index 75c57189..99d9f70b 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -80,8 +80,15 @@ async def put(self, path, files: Optional[List[str]] = None, requires_login=True
         if files is None:
             return await self.async_client.put(api_prefix+path, **kwargs)
 
-        file_list = [('files', open(f, 'rb')) for f in files]  # TODO: does this properly close the files after upload?
-        return await self.async_client.put(api_prefix+path, files=file_list)
+        file_handles = [open(f, 'rb') for f in files]  # Open files and store handles
+        try:
+            file_list = [('files', fh) for fh in file_handles]  # Use file handles
+            response = await self.async_client.put(api_prefix+path, files=file_list)
+        finally:
+            for fh in file_handles:
+                fh.close()  # Ensure all files are closed
+
+        return response
 
     async def post(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response:
         if requires_login:
@@ -92,14 +99,3 @@ async def delete(self, path, requires_login=True, api_prefix='/api', **kwargs) -
         if requires_login:
             await self.ensure_login()
         return await self.async_client.delete(api_prefix+path, **kwargs)
-
-    # --------------------------------- unused?! --------------------------------- #TODO remove?
-
-    # def get_data(self, path):
-    #     return asyncio.get_event_loop().run_until_complete(self._get_data_async(path))
-
-    # async def _get_data_async(self, path) -> bytes:
-    #     response = await self.get(f'{self.project_path}{path}')
-    #     if response.status_code != 200:
-    #         raise LoopCommunicationException('bad response: ' + str(response))
-    #     return response.content
diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py
index 38742fa4..5424c110 100644
--- a/learning_loop_node/node.py
+++ b/learning_loop_node/node.py
@@ -62,7 +62,7 @@ def sio_client(self) -> AsyncClient:
 
     # --------------------------------------------------- APPLICATION LIFECYCLE ---------------------------------------------------
     @asynccontextmanager
-    async def lifespan(self, app: FastAPI):
+    async def lifespan(self, app: FastAPI):  # pylint: disable=unused-argument
         try:
             await self._on_startup()
             self.repeat_task = asyncio.create_task(self.repeat_loop())
diff --git a/learning_loop_node/tests/test_helper.py b/learning_loop_node/tests/test_helper.py
index e802c7a0..c52037ed 100644
--- a/learning_loop_node/tests/test_helper.py
+++ b/learning_loop_node/tests/test_helper.py
@@ -9,7 +9,6 @@
 from learning_loop_node.data_classes import Context
 from learning_loop_node.helpers.misc import create_image_folder, create_project_folder, create_training_folder
 from learning_loop_node.loop_communication import LoopCommunicator
-from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
 
 def get_files_in_folder(folder: str):
diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py
index c768332c..628ef022 100644
--- a/learning_loop_node/trainer/executor.py
+++ b/learning_loop_node/trainer/executor.py
@@ -11,7 +11,7 @@
 
 
 def create_signal_handler(sig=signal.SIGTERM):
-    if platform == "linux" or platform == "linux2":
+    if platform in ('linux', 'linux2'):
         # "The system will send a signal to the child once the parent exits for any reason (even sigkill)."
         # https://stackoverflow.com/a/19448096
         libc = ctypes.CDLL("libc.so.6")
diff --git a/learning_loop_node/trainer/io_helpers.py b/learning_loop_node/trainer/io_helpers.py
index 453add80..1ae3bd43 100644
--- a/learning_loop_node/trainer/io_helpers.py
+++ b/learning_loop_node/trainer/io_helpers.py
@@ -174,9 +174,9 @@ async def _upload_detections(self, context: Context, batch_detections: List[Dete
             msg = f'could not upload detections. {str(response)}'
             logging.error(msg)
             raise Exception(msg)
+
+        logging.info('successfully uploaded detections')
+        if up_progress > len(batch_detections):
+            self.save_detection_upload_progress(0)
         else:
-            logging.info('successfully uploaded detections')
-            if up_progress > len(batch_detections):
-                self.save_detection_upload_progress(0)
-            else:
-                self.save_detection_upload_progress(up_progress)
+            self.save_detection_upload_progress(up_progress)
diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py
index a796fc4d..726cdb8e 100644
--- a/learning_loop_node/trainer/rest/backdoor_controls.py
+++ b/learning_loop_node/trainer/rest/backdoor_controls.py
@@ -5,7 +5,6 @@
 from dataclasses import asdict
 from typing import TYPE_CHECKING, Dict
 
-from dacite import from_dict
 from fastapi import APIRouter, HTTPException, Request
 
 from ...data_classes import ErrorConfiguration, NodeState
diff --git a/learning_loop_node/trainer/rest/controls.py b/learning_loop_node/trainer/rest/controls.py
index b8fbbec8..6c92d9a8 100644
--- a/learning_loop_node/trainer/rest/controls.py
+++ b/learning_loop_node/trainer/rest/controls.py
@@ -7,6 +7,8 @@
 
 router = APIRouter()
 
+# pylint: disable=protected-access
+
 
 @router.post("/controls/detect/{organization}/{project}/{version}")
 async def operation_mode(organization: str, project: str, version: str, request: Request):
diff --git a/learning_loop_node/trainer/tests/conftest.py b/learning_loop_node/trainer/tests/conftest.py
index f07af98f..aca1919c 100644
--- a/learning_loop_node/trainer/tests/conftest.py
+++ b/learning_loop_node/trainer/tests/conftest.py
@@ -10,6 +10,8 @@
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_node import TrainerNode
 
+# pylint: disable=protected-access
+
 logging.basicConfig(level=logging.INFO)
 # show ouptut from uvicorn server https://stackoverflow.com/a/66132186/364388
 log_to_stderr(logging.INFO)
@@ -24,7 +26,7 @@ async def test_initialized_trainer_node():
 
     trainer = TestingTrainerLogic()
     node = TrainerNode(name='test', trainer_logic=trainer, uuid='NOD30000-0000-0000-0000-000000000000')
-    trainer._node = node  # pylint: disable=protected-access
+    trainer._node = node
     trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
                                details={'categories': [],
                                         'id': '917d5c7f-403d-7e92-f95f-577f79c2273a',  # version 1.2 of demo project
@@ -32,8 +34,6 @@ async def test_initialized_trainer_node():
                                         'resolution': 800,
                                         'flip_rl': False,
                                         'flip_ud': False})
-
-    # pylint: disable=protected-access
     await node._on_startup()
     yield node
     await node._on_shutdown()
@@ -44,9 +44,9 @@ async def test_initialized_trainer():
 
     trainer = TestingTrainerLogic()
     node = TrainerNode(name='test', trainer_logic=trainer, uuid='NODE-000-0000-0000-0000-000000000000')
-    # pylint: disable=protected-access
+
     await node._on_startup()
-    trainer._node = node  # pylint: disable=protected-access
+    trainer._node = node
     trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
                                details={'categories': [],
                                         'id': '917d5c7f-403d-7e92-f95f-577f79c2273a',  # version 1.2 of demo project
@@ -54,9 +54,7 @@ async def test_initialized_trainer():
                                         'resolution': 800,
                                         'flip_rl': False,
                                         'flip_ud': False})
-
     yield trainer
-    # await node._on_shutdown()
     try:
         await node._on_shutdown()
     except Exception:
@@ -66,10 +64,3 @@ async def test_initialized_trainer():
 def is_port_in_use(port):
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         return s.connect_ex(('localhost', port)) == 0
-
-
-# @pytest.fixture(autouse=True, scope='session')
-# def initialize_active_training():
-#     from learning_loop_node.trainer import active_training_module
-#     active_training_module.init('00000000-0000-0000-0000-000000000000')
-#     yield
diff --git a/learning_loop_node/trainer/tests/states/test_state_cleanup.py b/learning_loop_node/trainer/tests/states/test_state_cleanup.py
index 9fbf076d..f3911a54 100644
--- a/learning_loop_node/trainer/tests/states/test_state_cleanup.py
+++ b/learning_loop_node/trainer/tests/states/test_state_cleanup.py
@@ -1,6 +1,8 @@
 from learning_loop_node.trainer.tests.state_helper import create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 
+# pylint: disable=protected-access
+
 
 async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
@@ -18,7 +20,7 @@ async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic
 
     await trainer._clear_training()
 
-    assert trainer._training is None  # pylint: disable=protected-access
+    assert trainer._training is None
     assert trainer.node.last_training_io.exists() is False
     assert trainer.active_training_io.detections_exist() is False
     assert trainer.active_training_io.detection_upload_progress_exist() is False
diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py
index efd9b966..40a62e63 100644
--- a/learning_loop_node/trainer/tests/states/test_state_detecting.py
+++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py
@@ -6,6 +6,7 @@
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
+# pylint: disable=protected-access
 error_key = 'detecting'
 
 
@@ -13,7 +14,7 @@ def trainer_has_error(trainer: TrainerLogic):
     return trainer.errors.has_error_for(error_key)
 
 
-async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic):  # TODO Flaky test
+async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic):  # NOTE was a flaky test
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state='train_model_uploaded',
                                 model_id_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a')
diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
index f5ef302b..1679f70b 100644
--- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
@@ -6,6 +6,8 @@
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 
+# pylint: disable=protected-access
+
 
 async def test_downloading_is_successful(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
diff --git a/learning_loop_node/trainer/tests/states/test_state_prepare.py b/learning_loop_node/trainer/tests/states/test_state_prepare.py
index c6648ea4..d3222f9a 100644
--- a/learning_loop_node/trainer/tests/states/test_state_prepare.py
+++ b/learning_loop_node/trainer/tests/states/test_state_prepare.py
@@ -5,6 +5,7 @@
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
+# pylint: disable=protected-access
 error_key = 'prepare'
 
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
index 2fe586aa..6a292be5 100644
--- a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
+++ b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
@@ -10,6 +10,8 @@
 from ..state_helper import assert_training_state, create_active_training_file
 from ..testing_trainer_logic import TestingTrainerLogic
 
+# pylint: disable=protected-access
+
 error_key = 'sync_confusion_matrix'
 
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py
index 168a81d4..603d18e9 100644
--- a/learning_loop_node/trainer/tests/states/test_state_train.py
+++ b/learning_loop_node/trainer/tests/states/test_state_train.py
@@ -5,6 +5,8 @@
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 
+# pylint: disable=protected-access
+
 
 async def test_successful_training(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
@@ -19,7 +21,6 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'start_training'
 
-    # pylint: disable=protected-access
     assert trainer._executor is not None
     trainer._executor.stop()  # NOTE normally a training terminates itself
     await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
@@ -36,7 +37,7 @@ async def test_stop_running_training(test_initialized_trainer: TestingTrainerLog
 
     _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)  # pylint: disable=protected-access
+    await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)
     await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'start_training'
@@ -54,16 +55,15 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain
     # NOTE e.g. when a node-computer is restarted
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
     trainer._init_from_last_training()
-    trainer._can_resume = True  # pylint: disable=protected-access
+    trainer._can_resume = True
 
     _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)  # pylint: disable=protected-access
+    await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)
     await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'resume'
 
-    # pylint: disable=protected-access
     assert trainer._executor is not None
     trainer._executor.stop()  # NOTE normally a training terminates itself e.g
     await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
index 8567e69d..8918eece 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
@@ -10,6 +10,7 @@
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
+# pylint: disable=protected-access
 error_key = 'upload_detections'
 
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
index ac147065..36c625f4 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
@@ -7,6 +7,7 @@
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
+# pylint: disable=protected-access
 error_key = 'upload_model'
 
 
diff --git a/learning_loop_node/trainer/tests/test_errors.py b/learning_loop_node/trainer/tests/test_errors.py
index bdb40c95..507c494a 100644
--- a/learning_loop_node/trainer/tests/test_errors.py
+++ b/learning_loop_node/trainer/tests/test_errors.py
@@ -5,6 +5,8 @@
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 
+# pylint: disable=protected-access
+
 
 async def test_training_process_is_stopped_when_trainer_reports_error(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index c7faeca8..188d37d7 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -2,7 +2,7 @@
 import time
 from typing import Dict, List, Optional, Union
 
-from learning_loop_node.data_classes import BasicModel, Context, Detections, ModelInformation, PretrainedModel
+from learning_loop_node.data_classes import Context, Detections, ModelInformation, PretrainedModel, TrainingStateData
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
 
@@ -35,15 +35,17 @@ async def start_training(self, model: str = 'model.model') -> None:
         assert self._executor is not None
         self._executor.start('while true; do sleep 1; done')
 
-    async def start_training_from_scratch(self, base_model_id: str) -> None:
+    async def start_training_from_scratch(self) -> None:
+        base_model_id = self.training.base_model_id
+        assert base_model_id is not None
         await self.start_training(model=f'model_{base_model_id}.pt')
 
-    def get_new_best_model(self) -> Optional[BasicModel]:
+    def _get_new_best_model(self) -> Optional[TrainingStateData]:
         if self.has_new_model:
-            return BasicModel(confusion_matrix={})
+            return TrainingStateData(confusion_matrix={})
         return None
 
-    def on_model_published(self, basic_model: BasicModel) -> None:
+    def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
         pass
 
     async def _prepare(self) -> None:
@@ -54,9 +56,9 @@ async def _download_model(self) -> None:
         await super()._download_model()
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
 
-    async def upload_model(self) -> None:
+    async def _upload_model(self) -> None:
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
-        await super().upload_model()
+        await super()._upload_model()
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
 
     async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
@@ -66,7 +68,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
         assert isinstance(result, str)
         return result
 
-    def get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
+    def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
         time.sleep(1)  # NOTE reduce flakyness in Backend tests du to wrong order of events.
         fake_weight_file = '/tmp/weightfile.weights'
         with open(fake_weight_file, 'wb') as f:
@@ -87,7 +89,7 @@ async def _detect(self, model_information: ModelInformation, images:  List[str],
         detections: List[Detections] = []
         return detections
 
-    async def clear_training_data(self, training_folder: str) -> None:
+    async def _clear_training_data(self, training_folder: str) -> None:
         return
 
     def get_executor_error_from_log(self) -> Optional[str]:
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index c5b47df9..ee408cf9 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -66,7 +66,8 @@ async def _train(self) -> None:
             error = self.get_executor_error_from_log()
             if error:
                 raise TrainingError(cause=error)
-            # TODO check if this works:
+
+            # TODO check if this works to catch errors from the executor:
             # if self.executor.return_code != 0:
             #     self.errors.set(error_key, f'Executor return code was {self.executor.return_code}')
             #     raise TrainingError(cause=f'Executor return code was {self.executor.return_code}')
@@ -85,9 +86,7 @@ async def _start_training(self):
         else:
             base_model_id = self.training.base_model_id
             if not is_valid_uuid4(base_model_id):  # TODO this check was done earlier!
-                assert isinstance(base_model_id, str)
-                # TODO this could be removed here and accessed via self.training.base_model_id
-                self.start_training_task = self.start_training_from_scratch(base_model_id)
+                self.start_training_task = self.start_training_from_scratch()
             else:
                 self.start_training_task = self.start_training()
         await self.start_training_task
@@ -146,7 +145,7 @@ async def stop(self) -> None:
                 except asyncio.CancelledError:
                     pass
                 logging.info('cancelled training task')
-                self.may_restart()
+                self._may_restart()
 
     def get_log(self) -> str:
         return self.executor.get_log()
@@ -158,9 +157,10 @@ async def start_training(self) -> None:
         '''Should be used to start a training.'''
 
     @abstractmethod
-    async def start_training_from_scratch(self, base_model_id: str) -> None:
+    async def start_training_from_scratch(self) -> None:
         '''Should be used to start a training from scratch.
-        base_model_id is the id of a pretrained model provided by self.provided_pretrained_models.'''
+        NOTE: base_model_id is no longer a parameter; read it from self.training.base_model_id.
+        It is the id of a pretrained model provided by self.provided_pretrained_models.'''
 
     @abstractmethod
     def can_resume(self) -> bool:
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index d9abff34..a526fa62 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -10,8 +10,8 @@
 
 from fastapi.encoders import jsonable_encoder
 
-from ..data_classes import (BasicModel, Context, Errors, PretrainedModel, TrainerState, Training, TrainingData,
-                            TrainingOut)
+from ..data_classes import (Context, Errors, PretrainedModel, TrainerState, Training, TrainingData, TrainingOut,
+                            TrainingStateData)
 from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4
 from .downloader import TrainingsDownloader
 from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO
@@ -22,7 +22,7 @@
 
 class TrainerLogicGeneric(ABC):
 
-    def __init__(self, model_format: str):
+    def __init__(self, model_format: str) -> None:
 
         # NOTE: model_format is used in the file path for the model on the server:
         # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
@@ -86,8 +86,7 @@ def state(self) -> str:
         """
         if (not self.training_active) or (self.training.training_state is None):
             return TrainerState.Idle.value
-        else:
-            return self.training.training_state
+        return self.training.training_state
 
     @property
     def training_uptime(self) -> Optional[float]:
@@ -245,7 +244,7 @@ async def _training_loop(self) -> None:
                 await self._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions)
             elif tstate == TrainerState.ReadyForCleanup:  # -> RESTART or TrainingFinished
                 await self._clear_training()
-                self.may_restart()
+                self._may_restart()
 
     async def _perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False):
         await asyncio.sleep(0.1)
@@ -305,7 +304,7 @@ async def _sync_confusion_matrix(self) -> None:
         """
         error_key = 'sync_confusion_matrix'
         try:
-            new_best_model = self.get_new_best_model()
+            new_best_model = self._get_new_best_model()
             if new_best_model and self.training.data:
                 new_training = TrainingOut(trainer_id=self.node.uuid,
                                            confusion_matrix=new_best_model.confusion_matrix,
@@ -319,7 +318,7 @@ async def _sync_confusion_matrix(self) -> None:
                 if isinstance(result,  dict) and result['success']:
                     logging.info(
                         f'successfully updated training {asdict(new_training)}')
-                    self.on_model_published(new_best_model)
+                    self._on_metrics_published(new_best_model)
                 else:
                     raise Exception(
                         f'Error for update_training: Response from loop was : {result}')
@@ -346,7 +345,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
         Note that trainer may train with different classes, which is why we send an initial model.json file.
         """
         # NOTE: I guess this is in executor because originally the conversion happened here..
-        files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files)
+        files = await asyncio.get_running_loop().run_in_executor(None, self._get_latest_model_files)
         if files is None:
             return None
 
@@ -385,7 +384,7 @@ async def _clear_training(self):
         self.active_training_io.delete_detections()
         self.active_training_io.delete_detection_upload_progress()
         self.active_training_io.delete_detections_upload_file_index()
-        await self.clear_training_data(self.training.training_folder)
+        await self._clear_training_data(self.training.training_folder)
         self.last_training_io.delete()
 
         await self.node.send_status()
@@ -393,15 +392,6 @@ async def _clear_training(self):
 
     # ---------------------------------------- OTHER METHODS ----------------------------------------
 
-    def may_restart(self) -> None:
-        """If the environment variable RESTART_AFTER_TRAINING is set, the trainer will restart after a training.
-        """
-        if self._environment_vars.restart_after_training:
-            logging.info('restarting')
-            sys.exit(0)
-        else:
-            logging.info('not restarting')
-
     async def on_shutdown(self) -> None:
         self.shutdown_event.set()
         await self.stop()
@@ -420,8 +410,16 @@ async def stop(self):
                 except asyncio.CancelledError:
                     pass
                 logging.info('cancelled training task')
-                self.may_restart()
+                self._may_restart()
 
+    def _may_restart(self) -> None:
+        """If the environment variable RESTART_AFTER_TRAINING is set, the trainer will restart after a training.
+        """
+        if self._environment_vars.restart_after_training:
+            logging.info('restarting')
+            sys.exit(0)
+        else:
+            logging.info('not restarting')
     # ---------------------------------------- ABSTRACT METHODS ----------------------------------------
 
     @abstractmethod
@@ -443,9 +441,9 @@ async def _do_detections(self) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def get_new_best_model(self) -> Optional[BasicModel]:
+    def _get_new_best_model(self) -> Optional[TrainingStateData]:
         """Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe.
-        Returns None if no new model could be found. Otherwise BasicModel(confusion_matrix, meta_information).
+        Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information).
         `confusion_matrix` contains a dict of all classes:
             - The classes must be identified by their id, not their name.
             - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives).
@@ -454,17 +452,17 @@ def get_new_best_model(self) -> Optional[BasicModel]:
         raise NotImplementedError
 
     @abstractmethod
-    def on_model_published(self, basic_model: BasicModel) -> None:
-        """Called after the confusion matrix corresponding to BasicModel has been successfully send to the Learning Loop.
+    def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
+        """Called after the metrics corresponding to TrainingStateData have been successfully sent to the Learning Loop.
         The respective files for this model should be stored so they can be later uploaded in get_latest_model_files.
         """
         raise NotImplementedError
 
     @abstractmethod
-    def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]:
+    def _get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]:
         """Called when the Learning Loop requests to backup the latest model for the training.
         This function is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop.
-        In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model
+        In the simplest implementation this method just renames the weight file (e.g. stored in TrainingStateData.meta_information) into a file name like latest_published_model
         Should return a list of file paths which describe the model.
         These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.)
         and will be stored in the Learning Loop unter the format of this trainer.
@@ -477,7 +475,7 @@ def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str
         raise NotImplementedError
 
     @abstractmethod
-    async def clear_training_data(self, training_folder: str) -> None:
+    async def _clear_training_data(self, training_folder: str) -> None:
         """Called after a training has finished. Deletes all data that is not needed anymore after a training run. 
         This can be old weightfiles or any additional files.
         """
diff --git a/learning_loop_node/trainer/training_syncronizer.py b/learning_loop_node/trainer/training_syncronizer.py
deleted file mode 100644
index 97041bb9..00000000
--- a/learning_loop_node/trainer/training_syncronizer.py
+++ /dev/null
@@ -1,53 +0,0 @@
-
-import asyncio
-import logging
-from dataclasses import asdict
-from typing import TYPE_CHECKING
-
-import socketio
-from dacite import from_dict
-from fastapi.encoders import jsonable_encoder
-
-from ..data_classes import TrainingOut
-from ..data_classes.socket_response import SocketResponse
-
-if TYPE_CHECKING:
-    from .trainer_logic import TrainerLogic
-
-
-class TrainingSyncronizer:
-    def __init__(self, trainer_node_uuid: str, sio_client: socketio.AsyncClient):
-        self.trainer_node_uuid = trainer_node_uuid
-        self.sio_client = sio_client
-
-    async def sync_model(model, current_training):
-        new_training = TrainingOut(
-            trainer_id=self.trainer_node_uuid,
-            confusion_matrix=model.confusion_matrix,
-            train_image_count=current_training.data.train_image_count(),
-            test_image_count=current_training.data.test_image_count(),
-            hyperparameters=trainer.hyperparameters)
-
-        await asyncio.sleep(0.1)  # NOTE needed for tests.
-
-        result = await self.sio_client.call('update_training', (current_training.context.organization, current_training.context.project, jsonable_encoder(new_training)))
-        response = from_dict(data_class=SocketResponse, data=result)
-
-        return response
-
-
-async def try_sync_model(mo):
-    try:
-        model = trainer.get_new_model()
-    except Exception as exc:
-        logging.exception('error while getting new model')
-        raise Exception(f'Could not get new model: {str(exc)}') from exc
-    logging.debug(f'new model {model}')
-
-    if model:
-        response = await sync_model(trainer, trainer_node_uuid, sio_client, model)
-
-        if not response.success:
-            error_msg = f'Error for update_training: Response from loop was : {asdict(response)}'
-            logging.error(error_msg)
-            raise Exception(error_msg)
diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py
index e88a2de3..3d992f4e 100644
--- a/mock_trainer/app_code/mock_trainer_logic.py
+++ b/mock_trainer/app_code/mock_trainer_logic.py
@@ -4,9 +4,9 @@
 import time
 from typing import Dict, List, Optional, Union
 
-from learning_loop_node.data_classes import (BasicModel, BoxDetection, CategoryType, ClassificationDetection,
-                                             Detections, ErrorConfiguration, ModelInformation, Point, PointDetection,
-                                             PretrainedModel, SegmentationDetection, Shape)
+from learning_loop_node.data_classes import (BoxDetection, CategoryType, ClassificationDetection, Detections,
+                                             ErrorConfiguration, ModelInformation, Point, PointDetection,
+                                             PretrainedModel, SegmentationDetection, Shape, TrainingStateData)
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
 from . import progress_simulator
@@ -35,7 +35,7 @@ async def start_training(self) -> None:
             raise Exception('Could not start training')
         self.executor.start('while true; do sleep 1; done')
 
-    async def start_training_from_scratch(self, base_model_id: str) -> None:
+    async def start_training_from_scratch(self) -> None:
         self.current_iteration = 0
         self.executor.start('while true; do sleep 1; done')
 
@@ -44,7 +44,7 @@ def get_executor_error_from_log(self) -> Optional[str]:
             return 'mocked crash'
         return None
 
-    def get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
+    def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
         if self.error_configuration.save_model:
             raise Exception()
 
@@ -66,37 +66,34 @@ async def _detect(self, model_information: ModelInformation, images:  List[str],
         for image in images:
             image_id = image.split('/')[-1].replace('.jpg', '')
 
-            box_detections = []
-            point_detections = []
-            segmentation_detections = []
-            classification_detections = []
-            det_entry = {
-                'image_id': image_id, 'box_detections': box_detections, 'point_detections': point_detections,
-                'segmentation_detections': segmentation_detections,
-                'classification_detections': classification_detections}
+            box_detections: List[BoxDetection] = []
+            point_detections: List[PointDetection] = []
+            segmentation_detections: List[SegmentationDetection] = []
+            classification_detections: List[ClassificationDetection] = []
+
             for c in model_information.categories:
                 if c.type == CategoryType.Box:
-                    d = BoxDetection(category_name=c.name, x=1, y=2, width=30, height=40,
-                                     model_name=model_information.version, confidence=.99, category_id=c.id)
-                    box_detections.append(d)
+                    bd = BoxDetection(category_name=c.name, x=1, y=2, width=30, height=40,
+                                      model_name=model_information.version, confidence=.99, category_id=c.id)
+                    box_detections.append(bd)
                 elif c.type == CategoryType.Point:
-                    d = PointDetection(category_name=c.name, x=100, y=200,
-                                       model_name=model_information.version, confidence=.97, category_id=c.id)
-                    point_detections.append(d)
+                    pd = PointDetection(category_name=c.name, x=100, y=200,
+                                        model_name=model_information.version, confidence=.97, category_id=c.id)
+                    point_detections.append(pd)
                 elif c.type == CategoryType.Segmentation:
-                    d = SegmentationDetection(category_name=c.name, shape=Shape(points=[Point(x=1, y=2), Point(
+                    sd = SegmentationDetection(category_name=c.name, shape=Shape(points=[Point(x=1, y=2), Point(
                         x=3, y=4)]), model_name=model_information.version, confidence=.96, category_id=c.id)
-                    segmentation_detections.append(d)
+                    segmentation_detections.append(sd)
                 elif c.type == CategoryType.Classification:
-                    d = ClassificationDetection(category_name=c.name, model_name=model_information.version,
-                                                confidence=.95, category_id=c.id)
-                    classification_detections.append(d)
+                    cd = ClassificationDetection(category_name=c.name, model_name=model_information.version,
+                                                 confidence=.95, category_id=c.id)
+                    classification_detections.append(cd)
             detections.append(Detections(box_detections=box_detections, point_detections=point_detections,
                                          segmentation_detections=segmentation_detections,
                                          classification_detections=classification_detections, image_id=image_id))
         return detections
 
-    async def clear_training_data(self, training_folder: str):
+    async def _clear_training_data(self, training_folder: str):
         pass
 
     @property
@@ -111,7 +108,7 @@ def training_progress(self) -> float:
         print(f'prog. is {self.current_iteration} / {self.max_iterations} = {self.current_iteration / self.max_iterations}')
         return self.current_iteration / self.max_iterations
 
-    def get_new_best_model(self) -> Optional[BasicModel]:
+    def _get_new_best_model(self) -> Optional[TrainingStateData]:
         logging.warning('get_new_model called')
         if self.error_configuration.get_new_model:
             raise Exception('Could not get new model')
@@ -120,7 +117,7 @@ def get_new_best_model(self) -> Optional[BasicModel]:
         self.current_iteration += 1
         return progress_simulator.increment_time(self, self.latest_known_confusion_matrix)
 
-    def on_model_published(self, basic_model: BasicModel) -> None:
+    def _on_metrics_published(self, basic_model: TrainingStateData) -> None:
         assert isinstance(basic_model.confusion_matrix, Dict)
         self.latest_known_confusion_matrix = basic_model.confusion_matrix
 
diff --git a/mock_trainer/app_code/progress_simulator.py b/mock_trainer/app_code/progress_simulator.py
index 042a0b29..76f8be52 100644
--- a/mock_trainer/app_code/progress_simulator.py
+++ b/mock_trainer/app_code/progress_simulator.py
@@ -1,11 +1,11 @@
 import random
 from typing import Dict, Optional
 
-from learning_loop_node.data_classes import BasicModel
+from learning_loop_node.data_classes import TrainingStateData
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
 
-def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -> Optional[BasicModel]:
+def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -> Optional[TrainingStateData]:
     if not trainer._training or not trainer._training.data:  # pylint: disable=protected-access
         return None
 
@@ -23,7 +23,7 @@ def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -
             'fn': max(random.randint(10-maximum, 10-minimum), 2),
         }
 
-    new_model = BasicModel(
+    new_model = TrainingStateData(
         confusion_matrix=confusion_matrix,
     )
 
diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py
index fecbe868..9f08b779 100644
--- a/mock_trainer/app_code/tests/test_mock_trainer.py
+++ b/mock_trainer/app_code/tests/test_mock_trainer.py
@@ -16,7 +16,7 @@ async def create_mock_trainer() -> MockTrainerLogic:
 
 async def test_get_model_files(setup_test_project2):
     mock_trainer = await create_mock_trainer()
-    files = mock_trainer.get_latest_model_files()
+    files = mock_trainer._get_latest_model_files()
 
     assert isinstance(files, Dict)
 
@@ -38,5 +38,5 @@ async def test_get_new_model(setup_test_project2):
         images_folder="",
         training_folder="",)
     mock_trainer.training.data = TrainingData(image_data=[], categories=[])
-    model = mock_trainer.get_new_best_model()
+    model = mock_trainer._get_new_best_model()
     assert model is not None

From 14a27eab1958e5a2629010c5aaca50cb3e4231d1 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 22 Mar 2024 20:51:41 +0100
Subject: [PATCH 21/62] Solve all linting errors in mock nodes

---
 mock_detector/app_code/tests/test_detector.py    |  2 ++
 mock_trainer/app_code/mock_trainer_logic.py      |  6 +++---
 mock_trainer/app_code/tests/conftest.py          |  3 ++-
 mock_trainer/app_code/tests/test_detections.py   | 10 ++++++----
 mock_trainer/app_code/tests/test_mock_trainer.py |  3 +++
 5 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/mock_detector/app_code/tests/test_detector.py b/mock_detector/app_code/tests/test_detector.py
index 3d05d99e..75816212 100644
--- a/mock_detector/app_code/tests/test_detector.py
+++ b/mock_detector/app_code/tests/test_detector.py
@@ -5,6 +5,8 @@
 from learning_loop_node.detector.detector_node import DetectorNode
 from learning_loop_node.globals import GLOBALS
 
+# pylint: disable=unused-argument
+
 
 @pytest.fixture(scope="session")
 def event_loop(request):
diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py
index 3d992f4e..4f0d2708 100644
--- a/mock_trainer/app_code/mock_trainer_logic.py
+++ b/mock_trainer/app_code/mock_trainer_logic.py
@@ -117,9 +117,9 @@ def _get_new_best_model(self) -> Optional[TrainingStateData]:
         self.current_iteration += 1
         return progress_simulator.increment_time(self, self.latest_known_confusion_matrix)
 
-    def _on_metrics_published(self, basic_model: TrainingStateData) -> None:
-        assert isinstance(basic_model.confusion_matrix, Dict)
-        self.latest_known_confusion_matrix = basic_model.confusion_matrix
+    def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
+        assert isinstance(training_state_data.confusion_matrix, Dict)
+        self.latest_known_confusion_matrix = training_state_data.confusion_matrix
 
     @property
     def model_architecture(self) -> str:
diff --git a/mock_trainer/app_code/tests/conftest.py b/mock_trainer/app_code/tests/conftest.py
index 86c62dc2..6c23ca7e 100644
--- a/mock_trainer/app_code/tests/conftest.py
+++ b/mock_trainer/app_code/tests/conftest.py
@@ -1,5 +1,4 @@
 import asyncio
-import logging
 import shutil
 
 import pytest
@@ -7,6 +6,8 @@
 from learning_loop_node.globals import GLOBALS
 from learning_loop_node.loop_communication import LoopCommunicator
 
+# pylint: disable=redefined-outer-name
+
 
 @pytest.fixture()
 async def glc():
diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py
index 5b5aa461..7b909db3 100644
--- a/mock_trainer/app_code/tests/test_detections.py
+++ b/mock_trainer/app_code/tests/test_detections.py
@@ -12,8 +12,10 @@
 
 from ..mock_trainer_logic import MockTrainerLogic
 
+# pylint: disable=protected-access,redefined-outer-name,unused-argument
 
-async def test_all(setup_test_project1, glc: LoopCommunicator):  # pylint: disable=unused-argument, redefined-outer-name
+
+async def test_all(setup_test_project1, glc: LoopCommunicator):
     assert_image_count(0)
     assert GLOBALS.data_folder == '/tmp/learning_loop_lib_data'
 
@@ -28,14 +30,14 @@ async def test_all(setup_test_project1, glc: LoopCommunicator):  # pylint: disab
                'resolution': 800,
                'flip_rl': False,
                'flip_ud': False}
-    trainer._node = node  # pylint: disable=protected-access
+    trainer._node = node
     trainer._init_new_training(context=context, details=details)
 
     project_folder = create_project_folder(context)
     training = generate_training(project_folder, context)
     training.model_id_for_detecting = latest_model_id
-    trainer._training = training  # pylint: disable=protected-access
-    await trainer._do_detections()  # pylint: disable=protected-access
+    trainer._training = training
+    await trainer._do_detections()
     detections = trainer.active_training_io.load_detections()
 
     assert_image_count(10)  # TODO This assert fails frequently on Drone
diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py
index 9f08b779..60029db2 100644
--- a/mock_trainer/app_code/tests/test_mock_trainer.py
+++ b/mock_trainer/app_code/tests/test_mock_trainer.py
@@ -7,6 +7,9 @@
 
 from ..mock_trainer_logic import MockTrainerLogic
 
+# pylint: disable=protected-access
+# pylint: disable=unused-argument
+
 
 async def create_mock_trainer() -> MockTrainerLogic:
     mock_trainer = MockTrainerLogic(model_format='mocked')

From dfcb37f008070e6f4e735173696a825f2f0841af Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Mon, 25 Mar 2024 11:59:23 +0100
Subject: [PATCH 22/62] Further improvements to documentation and refactoring

---
 learning_loop_node/data_classes/__init__.py   | 11 +++++++++++
 learning_loop_node/trainer/trainer_logic.py   |  4 ++--
 .../trainer/trainer_logic_generic.py          | 19 +++++++++++++------
 learning_loop_node/trainer/trainer_node.py    |  2 +-
 4 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/learning_loop_node/data_classes/__init__.py b/learning_loop_node/data_classes/__init__.py
index 32188896..524cb8bb 100644
--- a/learning_loop_node/data_classes/__init__.py
+++ b/learning_loop_node/data_classes/__init__.py
@@ -6,3 +6,14 @@
 from .socket_response import SocketResponse
 from .training import (Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData,
                        TrainingError, TrainingOut, TrainingStateData, TrainingStatus)
+
+__all__ = [
+    'AnnotationData', 'AnnotationEventType', 'SegmentationAnnotation', 'ToolOutput', 'UserInput',
+    'BoxDetection', 'ClassificationDetection', 'Detections', 'Observation', 'Point', 'PointDetection',
+    'SegmentationDetection', 'Shape',
+    'AnnotationNodeStatus', 'Category', 'CategoryType', 'Context', 'DetectionStatus', 'ErrorConfiguration',
+    'ModelInformation', 'NodeState', 'NodeStatus',
+    'SocketResponse',
+    'Errors', 'Hyperparameter', 'Model', 'PretrainedModel', 'TrainerState', 'Training', 'TrainingData',
+    'TrainingError', 'TrainingOut', 'TrainingStateData', 'TrainingStatus',
+]
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index ee408cf9..35587a97 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -154,11 +154,11 @@ def get_log(self) -> str:
 
     @abstractmethod
     async def start_training(self) -> None:
-        '''Should be used to start a training.'''
+        '''Should be used to start a training on executer, e.g. self.executor.start(cmd).'''
 
     @abstractmethod
     async def start_training_from_scratch(self) -> None:
-        '''Should be used to start a training from scratch.
+        '''Should be used to start a training from scratch on executer, e.g. self.executor.start(cmd).
         NOTE base_model_id is now accessible via self.training.base_model_id 
         the id of a pretrained model provided by self.provided_pretrained_models.'''
 
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index a526fa62..0eada30b 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -10,8 +10,8 @@
 
 from fastapi.encoders import jsonable_encoder
 
-from ..data_classes import (Context, Errors, PretrainedModel, TrainerState, Training, TrainingData, TrainingOut,
-                            TrainingStateData)
+from ..data_classes import (Context, Errors, Hyperparameter, PretrainedModel, TrainerState, Training, TrainingData,
+                            TrainingOut, TrainingStateData)
 from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4
 from .downloader import TrainingsDownloader
 from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO
@@ -61,6 +61,14 @@ def training(self) -> Training:
         assert self._training is not None, 'training must be initialized, call `init` first'
         return self._training
 
+    @property
+    def hyperparameter(self) -> Hyperparameter:
+        assert self.training_data is not None, 'Training should have data'
+        assert self.training_data.hyperparameter is not None, 'Training.data should have hyperparameter'
+        return self.training_data.hyperparameter
+
+    # ---------------------------------------- PROPERTIES ----------------------------------------
+
     @property
     def training_data(self) -> Optional[TrainingData]:
         if self.training_active and self.training.data:
@@ -72,7 +80,6 @@ def training_context(self) -> Optional[Context]:
         if self.training_active:
             return self.training.context
         return None
-    # ---------------------------------------- PROPERTIES ----------------------------------------
 
     @property
     def training_active(self) -> bool:
@@ -97,7 +104,7 @@ def training_uptime(self) -> Optional[float]:
         return None
 
     @property
-    def hyperparameters(self) -> Optional[Dict]:
+    def hyperparameters_for_state_sync(self) -> Optional[Dict]:
         """Used in sync_confusion_matrix and send_status to provide information about the training configuration.
         """
         if self._training and self._training.data and self._training.data.hyperparameter:
@@ -310,7 +317,7 @@ async def _sync_confusion_matrix(self) -> None:
                                            confusion_matrix=new_best_model.confusion_matrix,
                                            train_image_count=self.training.data.train_image_count(),
                                            test_image_count=self.training.data.test_image_count(),
-                                           hyperparameters=self.hyperparameters)
+                                           hyperparameters=self.hyperparameters_for_state_sync)
                 await asyncio.sleep(0.1)  # NOTE needed for tests.
 
                 result = await self.node.sio_client.call('update_training', (
@@ -447,7 +454,7 @@ def _get_new_best_model(self) -> Optional[TrainingStateData]:
         `confusion_matrix` contains a dict of all classes:
             - The classes must be identified by their id, not their name.
             - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives).
-        `meta_information` can hold any data which is helpful for self.on_model_published to store weight file etc for later upload via self.get_model_files
+        `meta_information` can hold any data which is helpful for self._on_metrics_published to store weight file etc for later upload via self.get_model_files
         """
         raise NotImplementedError
 
diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py
index 6112d449..f69cf103 100644
--- a/learning_loop_node/trainer/trainer_node.py
+++ b/learning_loop_node/trainer/trainer_node.py
@@ -84,7 +84,7 @@ async def send_status(self):
             status.train_image_count = data.train_image_count()
             status.test_image_count = data.test_image_count()
             status.skipped_image_count = data.skipped_image_count
-            status.hyperparameters = self.trainer_logic.hyperparameters
+            status.hyperparameters = self.trainer_logic.hyperparameters_for_state_sync
             status.errors = self.trainer_logic.errors.errors
             status.context = self.trainer_logic.training_context
 

From 70aa44d30400139b7363b8559ae3e7094f122d99 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Mon, 25 Mar 2024 18:03:29 +0100
Subject: [PATCH 23/62] Further refactoring and API improvements

---
 learning_loop_node/data_classes/general.py    |  2 +
 learning_loop_node/data_classes/training.py   | 12 +++-
 learning_loop_node/tests/test_executor.py     |  4 +-
 learning_loop_node/trainer/executor.py        | 34 +++++-----
 .../trainer/rest/backdoor_controls.py         |  4 +-
 .../trainer/tests/states/test_state_train.py  |  4 +-
 .../trainer/tests/testing_trainer_logic.py    | 16 ++---
 learning_loop_node/trainer/trainer_logic.py   | 63 ++++++++++---------
 .../trainer/trainer_logic_generic.py          | 22 ++++---
 mock_trainer/app_code/mock_trainer_logic.py   | 12 ++--
 .../app_code/tests/test_mock_trainer.py       |  4 +-
 11 files changed, 96 insertions(+), 81 deletions(-)

diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py
index 41141395..5c616841 100644
--- a/learning_loop_node/data_classes/general.py
+++ b/learning_loop_node/data_classes/general.py
@@ -60,6 +60,8 @@ def context(self):
 
     @staticmethod
     def load_from_disk(model_root_path: str) -> Optional['ModelInformation']:
+        """Load model.json from model_root_path and return ModelInformation object.
+        """
         model_info_file_path = f'{model_root_path}/model.json'
         if not os.path.exists(model_info_file_path):
             logging.warning(f"could not find model information file '{model_info_file_path}'")
diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index 4df5a289..c192ba41 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -3,6 +3,7 @@
 import time
 from dataclasses import dataclass, field
 from enum import Enum
+from pathlib import Path
 from typing import Dict, List, Optional
 
 # pylint: disable=no-name-in-module
@@ -106,13 +107,18 @@ class Training():
     training_folder: str  # f'{project_folder}/trainings/{trainings_id}'
     start_time: float = field(default_factory=time.time)
 
-    base_model_id: Optional[str] = None  # model uuid to download (to continue training)
+    # model uuid to download (to continue training) | is '' when training from scratch
+    base_model_id: Optional[str] = None
     data: Optional[TrainingData] = None
     training_number: Optional[int] = None
     training_state: Optional[str] = None
     model_id_for_detecting: Optional[str] = None
     hyperparameters: Optional[Dict] = None
 
+    @property
+    def training_folder_path(self) -> Path:
+        return Path(self.training_folder)
+
     def set_values_from_data(self, data: Dict) -> None:
         self.data = TrainingData(categories=Category.from_list(data['categories']))
         self.data.hyperparameter = Hyperparameter.from_data(data=data)
@@ -132,8 +138,8 @@ class TrainingOut():
 
 @dataclass(**KWONLY_SLOTS)
 class TrainingStateData():
-    confusion_matrix: Optional[Dict] = None  # This is actually just class-wise metrics
-    meta_information: Optional[Dict] = None
+    confusion_matrix: Dict = field(default_factory=dict)
+    meta_information: Dict = field(default_factory=dict)
 
 
 @dataclass(**KWONLY_SLOTS)
diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py
index b661c818..1079ea1c 100644
--- a/learning_loop_node/tests/test_executor.py
+++ b/learning_loop_node/tests/test_executor.py
@@ -32,7 +32,7 @@ def test_executor_lifecycle():
 
     executor.start(cmd)
 
-    assert executor.is_process_running()
+    assert executor.is_running()
     assert_process_is_running('some_executable.sh')
 
     sleep(1)
@@ -40,7 +40,7 @@ def test_executor_lifecycle():
 
     executor.stop()
 
-    assert not executor.is_process_running()
+    assert not executor.is_running()
     sleep(1)
     assert_process_is_running('some_executable.sh', False)
 
diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py
index 628ef022..e8dc66ae 100644
--- a/learning_loop_node/trainer/executor.py
+++ b/learning_loop_node/trainer/executor.py
@@ -28,33 +28,33 @@ class Executor:
     def __init__(self, base_path: str) -> None:
         self.path = base_path
         os.makedirs(self.path, exist_ok=True)
-        self.process: Optional[subprocess.Popen[bytes]] = None
+        self._process: Optional[subprocess.Popen[bytes]] = None
 
-    def start(self, cmd: str):
+    def start(self, cmd: str) -> None:
         with open(f'{self.path}/last_training.log', 'a') as f:
             f.write(f'\nStarting executor with command: {cmd}\n')
+
         # pylint: disable=subprocess-popen-preexec-fn
-        self.process = subprocess.Popen(
+        self._process = subprocess.Popen(
             f'cd {self.path}; {cmd} >> last_training.log 2>&1',
             shell=True,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             executable='/bin/bash',
-            preexec_fn=create_signal_handler(),
-        )
+            preexec_fn=create_signal_handler())
 
-    def is_process_running(self):
-        if self.process is None:
+    def is_running(self) -> bool:
+        if self._process is None:
             return False
 
-        if self.process.poll() is not None:
+        if self._process.poll() is not None:
             return False
 
         try:
-            psutil.Process(self.process.pid)
+            psutil.Process(self._process.pid)
         except psutil.NoSuchProcess:
             # self.process.terminate() # TODO does this make sense?
-            # self.process = None
+            self._process = None
             return False
 
         return True
@@ -82,24 +82,24 @@ def get_log_by_lines(self, since_last_start=False) -> List[str]:  # TODO do not
             return []
 
     def stop(self):
-        if self.process is None:
+        if self._process is None:
             logging.info('no process running ... nothing to stop')
             return
 
         logging.info('terminating process')
 
         try:
-            os.killpg(os.getpgid(self.process.pid), signal.SIGTERM)
+            os.killpg(os.getpgid(self._process.pid), signal.SIGTERM)
         except ProcessLookupError:
             pass
 
-        self.process.terminate()
-        _, _ = self.process.communicate(timeout=3)
+        self._process.terminate()
+        _, _ = self._process.communicate(timeout=3)
 
     @property
     def return_code(self):
-        if not self.process:
+        if not self._process:
             return None
-        if self.is_process_running():
+        if self.is_running():
             return None
-        return self.process.poll()
+        return self._process.poll()
diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py
index 726cdb8e..e3b17ed3 100644
--- a/learning_loop_node/trainer/rest/backdoor_controls.py
+++ b/learning_loop_node/trainer/rest/backdoor_controls.py
@@ -97,7 +97,7 @@ async def add_steps(request: Request):
 
     assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic'
 
-    if not trainer_logic._executor or not trainer_logic._executor.is_process_running():  # pylint: disable=protected-access
+    if not trainer_logic._executor or not trainer_logic._executor.is_running():  # pylint: disable=protected-access
         training = trainer_logic._training  # pylint: disable=protected-access
         logging.error(f'cannot add steps when not running, state: {training.training_state if training else "None"}')
         raise HTTPException(status_code=409, detail="trainer is not running")
@@ -126,7 +126,7 @@ async def kill_process(request: Request):
     trainer_node = trainer_node_from_request(request)
     trainer_logic = trainer_node.trainer_logic
     assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic'
-    if not trainer_logic._executor or not trainer_logic._executor.is_process_running():
+    if not trainer_logic._executor or not trainer_logic._executor.is_running():
         raise HTTPException(status_code=409, detail="trainer is not running")
     trainer_logic._executor.stop()
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py
index 603d18e9..3c6b579a 100644
--- a/learning_loop_node/trainer/tests/states/test_state_train.py
+++ b/learning_loop_node/trainer/tests/states/test_state_train.py
@@ -37,7 +37,7 @@ async def test_stop_running_training(test_initialized_trainer: TestingTrainerLog
 
     _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)
+    await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01)
     await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'start_training'
@@ -59,7 +59,7 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain
 
     _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)
+    await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01)
     await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
     assert trainer.start_training_task.__name__ == 'resume'
diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index 188d37d7..d4daaff1 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -31,16 +31,16 @@ def provided_pretrained_models(self) -> List[PretrainedModel]:
             PretrainedModel(name='large', label='Large', description='a large model')]
 
     # pylint: disable=unused-argument
-    async def start_training(self, model: str = 'model.model') -> None:
+    async def _start_training_from_base_model(self, model: str = 'model.model') -> None:
         assert self._executor is not None
         self._executor.start('while true; do sleep 1; done')
 
-    async def start_training_from_scratch(self) -> None:
+    async def _start_training_from_scratch(self) -> None:
         base_model_id = self.training.base_model_id
         assert base_model_id is not None
-        await self.start_training(model=f'model_{base_model_id}.pt')
+        await self._start_training_from_base_model(model=f'model_{base_model_id}.pt')
 
-    def _get_new_best_model(self) -> Optional[TrainingStateData]:
+    def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
         if self.has_new_model:
             return TrainingStateData(confusion_matrix={})
         return None
@@ -79,11 +79,11 @@ def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
             f.write('zweiundvierzig')
         return {'mocked': [fake_weight_file, more_data_file], 'mocked_2': [fake_weight_file, more_data_file]}
 
-    def can_resume(self) -> bool:
+    def _can_resume(self) -> bool:
         return self._can_resume
 
-    async def resume(self) -> None:
-        return await self.start_training()
+    async def _resume(self) -> None:
+        return await self._start_training_from_base_model()
 
     async def _detect(self, model_information: ModelInformation, images:  List[str], model_folder: str) -> List[Detections]:
         detections: List[Detections] = []
@@ -92,5 +92,5 @@ async def _detect(self, model_information: ModelInformation, images:  List[str],
     async def _clear_training_data(self, training_folder: str) -> None:
         return
 
-    def get_executor_error_from_log(self) -> Optional[str]:
+    def _get_executor_error_from_log(self) -> Optional[str]:
         return self.error_msg
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 35587a97..985479b8 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -26,15 +26,21 @@ def __init__(self, model_format: str) -> None:
         self._executor: Optional[Executor] = None
         self.start_training_task: Optional[Coroutine] = None
 
+    # ---------------------------------------- IMPLEMENTED ABSTRACT PROPERTIES ----------------------------------------
+
     @property
     def detection_progress(self) -> Optional[float]:
         return self._detection_progress
 
+    # ---------------------------------------- PROPERTIES ----------------------------------------
+
     @property
     def executor(self) -> Executor:
         assert self._executor is not None, 'executor must be set, call `run_training` first'
         return self._executor
 
+    # ---------------------------------------- IMPLEMENTED ABSTRACT METHODS ----------------------------------------
+
     async def _train(self) -> None:
         previous_state = TrainerState.TrainModelDownloaded
         error_key = 'run_training'
@@ -46,11 +52,11 @@ async def _train(self) -> None:
 
             last_sync_time = datetime.now()
             while True:
-                if not self.executor.is_process_running():
+                if not self.executor.is_running():
                     break
                 if (datetime.now() - last_sync_time).total_seconds() > 5:
                     last_sync_time = datetime.now()
-                    if self.get_executor_error_from_log():
+                    if self._get_executor_error_from_log():
                         break
                     self.errors.reset(error_key)
                     try:
@@ -63,34 +69,20 @@ async def _train(self) -> None:
                 else:
                     await asyncio.sleep(0.1)
 
-            error = self.get_executor_error_from_log()
+            error = self._get_executor_error_from_log()
             if error:
                 raise TrainingError(cause=error)
 
-            # TODO check if this works to catch errors from the executor:
-            # if self.executor.return_code != 0:
-            #     self.errors.set(error_key, f'Executor return code was {self.executor.return_code}')
-            #     raise TrainingError(cause=f'Executor return code was {self.executor.return_code}')
+            if self.executor.return_code != 0:  # TODO check if this works to catch errors from the executor:
+                raise TrainingError(cause=f'Executor returned with error code: {self.executor.return_code}')
 
         except TrainingError:
             logging.exception('Error in TrainingProcess')
-            if self.executor.is_process_running():
+            if self.executor.is_running():
                 self.executor.stop()
             self.training.training_state = previous_state
             raise
 
-    async def _start_training(self):
-        self.start_training_task = None  # NOTE: this is used i.e. by tests
-        if self.can_resume():
-            self.start_training_task = self.resume()
-        else:
-            base_model_id = self.training.base_model_id
-            if not is_valid_uuid4(base_model_id):  # TODO this check was done earlier!
-                self.start_training_task = self.start_training_from_scratch()
-            else:
-                self.start_training_task = self.start_training()
-        await self.start_training_task
-
     async def _do_detections(self) -> None:
         context = self.training.context
         model_id = self.training.model_id_for_detecting
@@ -131,11 +123,27 @@ async def _do_detections(self) -> None:
             batch_detections = await self._detect(model_information, batch_images, tmp_folder)
             self.active_training_io.save_detections(batch_detections, idx)
 
+    # ---------------------------------------- METHODS ----------------------------------------
+
+    async def _start_training(self):
+        self.start_training_task = None  # NOTE: this is used i.e. by tests
+        if self._can_resume():
+            self.start_training_task = self._resume()
+        else:
+            base_model_id = self.training.base_model_id
+            if not is_valid_uuid4(base_model_id):
+                self.start_training_task = self._start_training_from_scratch()
+            else:
+                self.start_training_task = self._start_training_from_base_model()
+        await self.start_training_task
+
+    # ---------------------------------------- OVERWRITTEN METHODS ----------------------------------------
+
     async def stop(self) -> None:
         """If executor is running, stop it. Else cancel training task."""
         if not self.training_active:
             return
-        if self._executor and self._executor.is_process_running():
+        if self._executor and self._executor.is_running():
             self.executor.stop()
         elif self.training_task:
             logging.info('cancelling training task')
@@ -147,32 +155,29 @@ async def stop(self) -> None:
                 logging.info('cancelled training task')
                 self._may_restart()
 
-    def get_log(self) -> str:
-        return self.executor.get_log()
-
     # ---------------------------------------- ABSTRACT METHODS ----------------------------------------
 
     @abstractmethod
-    async def start_training(self) -> None:
+    async def _start_training_from_base_model(self) -> None:
         '''Should be used to start a training on executer, e.g. self.executor.start(cmd).'''
 
     @abstractmethod
-    async def start_training_from_scratch(self) -> None:
+    async def _start_training_from_scratch(self) -> None:
         '''Should be used to start a training from scratch on executer, e.g. self.executor.start(cmd).
         NOTE base_model_id is now accessible via self.training.base_model_id 
         the id of a pretrained model provided by self.provided_pretrained_models.'''
 
     @abstractmethod
-    def can_resume(self) -> bool:
+    def _can_resume(self) -> bool:
         '''Override this method to return True if the trainer can resume training.'''
 
     @abstractmethod
-    async def resume(self) -> None:
+    async def _resume(self) -> None:
         '''Is called when self.can_resume() returns True.
         One may resume the training on a previously trained model stored by self.on_model_published(basic_model).'''
 
     @abstractmethod
-    def get_executor_error_from_log(self) -> Optional[str]:
+    def _get_executor_error_from_log(self) -> Optional[str]:
         '''Should be used to provide error informations to the Learning Loop by extracting data from self.executor.get_log().'''
 
     @abstractmethod
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index 0eada30b..e1798c46 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -293,17 +293,17 @@ async def _download_model(self) -> None:
         """If training is continued, the model is downloaded from the Learning Loop to the training_folder.
         The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training.
         """
-        model_id = self.training.base_model_id
+        base_model_id = self.training.base_model_id
         # TODO this checks if we continue a training -> make more explicit
-        if model_id and is_valid_uuid4(self.training.base_model_id):
+        if base_model_id and is_valid_uuid4(self.training.base_model_id):
             logging.info('loading model from Learning Loop')
-            logging.info(f'downloading model {model_id} as {self.model_format}')
-            await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, model_id, self.model_format)
+            logging.info(f'downloading model {base_model_id} as {self.model_format}')
+            await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_id, self.model_format)
             shutil.move(f'{self.training.training_folder}/model.json',
                         f'{self.training.training_folder}/base_model.json')
         else:
             logging.info(
-                f'base_model_id {model_id} is not a valid uuid4 (or no base model was not provided), skipping download')
+                f'base_model_id {base_model_id} is not a valid uuid4 (or no base model was not provided), skipping download')
 
     async def _sync_confusion_matrix(self) -> None:
         """Syncronizes the confusion matrix with the Learning Loop via the update_training endpoint.
@@ -311,7 +311,7 @@ async def _sync_confusion_matrix(self) -> None:
         """
         error_key = 'sync_confusion_matrix'
         try:
-            new_best_model = self._get_new_best_model()
+            new_best_model = self._get_new_best_training_state()
             if new_best_model and self.training.data:
                 new_training = TrainingOut(trainer_id=self.node.uuid,
                                            confusion_matrix=new_best_model.confusion_matrix,
@@ -441,15 +441,15 @@ async def _train(self) -> None:
 
     @abstractmethod
     async def _do_detections(self) -> None:
-        """Should be used to execute detections.
+        """Should be used to infer detections of all images and save them to drive.
         active_training_io.save_detections(...) should be used to store the detections.
         asyncio.CancelledError should be catched and re-raised.
         """
         raise NotImplementedError
 
     @abstractmethod
-    def _get_new_best_model(self) -> Optional[TrainingStateData]:
-        """Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe.
+    def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
+        """Is called frequently by `_sync_confusion_matrix` to check if a new "best" model is available.
         Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information).
         `confusion_matrix` contains a dict of all classes:
             - The classes must be identified by their id, not their name.
@@ -461,6 +461,8 @@ def _get_new_best_model(self) -> Optional[TrainingStateData]:
     @abstractmethod
     def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
         """Called after the metrics corresponding to TrainingStateData have been successfully send to the Learning Loop.
+        Receives the TrainingStateData object which was returned by self._get_new_best_training_state. 
+        If above function returns None, this function is not called.
         The respective files for this model should be stored so they can be later uploaded in get_latest_model_files.
         """
         raise NotImplementedError
@@ -468,7 +470,7 @@ def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
     @abstractmethod
     def _get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]:
         """Called when the Learning Loop requests to backup the latest model for the training.
-        This function is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop.
+        This function is used to generate and gather all files needed for transfering the actual data from the trainer node to the Learning Loop.
         In the simplest implementation this method just renames the weight file (e.g. stored in TrainingStateData.meta_information) into a file name like latest_published_model
         Should return a list of file paths which describe the model.
         These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.)
diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py
index 4f0d2708..f4fb3fc8 100644
--- a/mock_trainer/app_code/mock_trainer_logic.py
+++ b/mock_trainer/app_code/mock_trainer_logic.py
@@ -23,23 +23,23 @@ def __init__(self, model_format: str) -> None:
         self.current_iteration = 0
         self.provide_new_model = True
 
-    def can_resume(self) -> bool:
+    def _can_resume(self) -> bool:
         return False
 
-    async def resume(self) -> None:
+    async def _resume(self) -> None:
         pass
 
-    async def start_training(self) -> None:
+    async def _start_training_from_base_model(self) -> None:
         self.current_iteration = 0
         if self.error_configuration.begin_training:
             raise Exception('Could not start training')
         self.executor.start('while true; do sleep 1; done')
 
-    async def start_training_from_scratch(self) -> None:
+    async def _start_training_from_scratch(self) -> None:
         self.current_iteration = 0
         self.executor.start('while true; do sleep 1; done')
 
-    def get_executor_error_from_log(self) -> Optional[str]:
+    def _get_executor_error_from_log(self) -> Optional[str]:
         if self.error_configuration.crash_training:
             return 'mocked crash'
         return None
@@ -108,7 +108,7 @@ def training_progress(self) -> float:
         print(f'prog. is {self.current_iteration} / {self.max_iterations} = {self.current_iteration / self.max_iterations}')
         return self.current_iteration / self.max_iterations
 
-    def _get_new_best_model(self) -> Optional[TrainingStateData]:
+    def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
         logging.warning('get_new_model called')
         if self.error_configuration.get_new_model:
             raise Exception('Could not get new model')
diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py
index 60029db2..0946f991 100644
--- a/mock_trainer/app_code/tests/test_mock_trainer.py
+++ b/mock_trainer/app_code/tests/test_mock_trainer.py
@@ -30,7 +30,7 @@ async def test_get_model_files(setup_test_project2):
 
 async def test_get_new_model(setup_test_project2):
     mock_trainer = await create_mock_trainer()
-    await mock_trainer.start_training()
+    await mock_trainer._start_training_from_base_model()
 
     model = Model(uuid=(str(uuid4())))
     context = Context(organization="", project="")
@@ -41,5 +41,5 @@ async def test_get_new_model(setup_test_project2):
         images_folder="",
         training_folder="",)
     mock_trainer.training.data = TrainingData(image_data=[], categories=[])
-    model = mock_trainer._get_new_best_model()
+    model = mock_trainer._get_new_best_training_state()
     assert model is not None

From 44466e956ba2cb41396dccbaecec84a3fdcd5585 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Mon, 25 Mar 2024 18:45:38 +0100
Subject: [PATCH 24/62] Further refactoring and API improvements

---
 learning_loop_node/data_classes/training.py   |  2 +-
 learning_loop_node/trainer/executor.py        |  1 +
 .../trainer/trainer_logic_generic.py          | 29 +++++++++++--------
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index c192ba41..f88a7a00 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -107,7 +107,7 @@ class Training():
     training_folder: str  # f'{project_folder}/trainings/{trainings_id}'
     start_time: float = field(default_factory=time.time)
 
-    # model uuid to download (to continue training) | is '' when training from scratch
+    # model uuid to download (to continue training) | is not a uuid when training from scratch (blank or pt-name ?!)
     base_model_id: Optional[str] = None
     data: Optional[TrainingData] = None
     training_number: Optional[int] = None
diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py
index e8dc66ae..e823c0d4 100644
--- a/learning_loop_node/trainer/executor.py
+++ b/learning_loop_node/trainer/executor.py
@@ -31,6 +31,7 @@ def __init__(self, base_path: str) -> None:
         self._process: Optional[subprocess.Popen[bytes]] = None
 
     def start(self, cmd: str) -> None:
+        logging.info(f'Starting executor with command: {cmd}')
         with open(f'{self.path}/last_training.log', 'a') as f:
             f.write(f'\nStarting executor with command: {cmd}\n')
 
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index e1798c46..34535bbc 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -25,6 +25,7 @@ class TrainerLogicGeneric(ABC):
     def __init__(self, model_format: str) -> None:
 
         # NOTE: model_format is used in the file path for the model on the server:
+        # It acts as a key for list of files (cf. _get_latest_model_files)
         # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
         self.model_format: str = model_format
         self.errors = Errors()
@@ -294,16 +295,17 @@ async def _download_model(self) -> None:
         The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training.
         """
         base_model_id = self.training.base_model_id
+
         # TODO this checks if we continue a training -> make more explicit
-        if base_model_id and is_valid_uuid4(self.training.base_model_id):
-            logging.info('loading model from Learning Loop')
-            logging.info(f'downloading model {base_model_id} as {self.model_format}')
-            await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_id, self.model_format)
-            shutil.move(f'{self.training.training_folder}/model.json',
-                        f'{self.training.training_folder}/base_model.json')
-        else:
-            logging.info(
-                f'base_model_id {base_model_id} is not a valid uuid4 (or no base model was not provided), skipping download')
+        if not base_model_id or not is_valid_uuid4(base_model_id):
+            logging.info(f'skipping model download. No base model id provided: {base_model_id}')
+            return
+
+        logging.info('loading model from Learning Loop')
+        logging.info(f'downloading model {base_model_id} as {self.model_format}')
+        await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_id, self.model_format)
+        shutil.move(f'{self.training.training_folder}/model.json',
+                    f'{self.training.training_folder}/base_model.json')
 
     async def _sync_confusion_matrix(self) -> None:
         """Syncronizes the confusion matrix with the Learning Loop via the update_training endpoint.
@@ -468,11 +470,12 @@ def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def _get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]:
+    def _get_latest_model_files(self) -> Dict[str, List[str]]:
         """Called when the Learning Loop requests to backup the latest model for the training.
-        This function is used to generate and gather all files needed for transfering the actual data from the trainer node to the Learning Loop.
+        This function is used to __generate and gather__ all files needed for transferring the actual data from the trainer node to the Learning Loop.
         In the simplest implementation this method just renames the weight file (e.g. stored in TrainingStateData.meta_information) into a file name like latest_published_model
-        Should return a list of file paths which describe the model.
+
+        The function should return a list of file paths which describe the model per format.
         These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.)
         and will be stored in the Learning Loop unter the format of this trainer.
         Note: by convention the weightfile should be named "model.<extension>" where extension is the file format of the weightfile.
@@ -480,6 +483,8 @@ def _get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[st
 
         If a trainer can also generate other formats (for example for an detector),
         a dictionary mapping format -> list of files can be returned.
+
+        If the function returns an empty dict, something went wrong and the model upload will be skipped.
         """
         raise NotImplementedError
 

From 793e8bbff7fd3085a01f1874a11e48bbb67b4d6f Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Tue, 26 Mar 2024 09:19:01 +0100
Subject: [PATCH 25/62] Further refactoring and API improvements

---
 learning_loop_node/data_classes/training.py   |  2 +-
 learning_loop_node/data_exchanger.py          |  6 ++----
 .../tests/states/test_state_detecting.py      |  4 ++--
 .../tests/states/test_state_upload_model.py   |  4 ++--
 learning_loop_node/trainer/trainer_logic.py   | 20 +++++++++----------
 .../trainer/trainer_logic_generic.py          |  8 ++++----
 .../app_code/tests/test_detections.py         |  2 +-
 7 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index f88a7a00..f65503b5 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -112,7 +112,7 @@ class Training():
     data: Optional[TrainingData] = None
     training_number: Optional[int] = None
     training_state: Optional[str] = None
-    model_id_for_detecting: Optional[str] = None
+    model_uuid_for_detecting: Optional[str] = None
     hyperparameters: Optional[Dict] = None
 
     @property
diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py
index 8a5633d4..0d4d2add 100644
--- a/learning_loop_node/data_exchanger.py
+++ b/learning_loop_node/data_exchanger.py
@@ -157,12 +157,10 @@ async def upload_model_get_uuid(self, context: Context, files: List[str], traini
         """Used by the trainers. Function returns the new model uuid to use for detection."""
         response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/trainings/{training_number}/models/latest/{mformat}/file', files=files)
         if response.status_code != 200:
-            logging.error(
-                f'---- could not upload model for training {training_number} and format {mformat}. Details: {response.text}')
+            logging.error(f'Could not upload model for training {training_number}, format {mformat}: {response.text}')
             response.raise_for_status()
             return None
 
         uploaded_model = response.json()
-        logging.info(
-            f'---- uploaded model for training {training_number} and format {mformat}. Model id is {uploaded_model}')
+        logging.info(f'Uploaded model for training {training_number}, format {mformat}. Response is: {uploaded_model}')
         return uploaded_model['id']
diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py
index 40a62e63..770429c8 100644
--- a/learning_loop_node/trainer/tests/states/test_state_detecting.py
+++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py
@@ -37,7 +37,7 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded)
     trainer._init_from_last_training()
-    trainer.training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242'
+    trainer.training.model_uuid_for_detecting = '12345678-bobo-7e92-f95f-424242424242'
 
     _ = asyncio.get_running_loop().create_task(trainer._run())
 
@@ -64,7 +64,7 @@ async def test_model_not_downloadable_error(test_initialized_trainer: TestingTra
 
     assert trainer_has_error(trainer)
     assert trainer.training.training_state == TrainerState.TrainModelUploaded
-    assert trainer.training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000'
+    assert trainer.training.model_uuid_for_detecting == '00000000-0000-0000-0000-000000000000'
     assert trainer.node.last_training_io.load() == trainer.training
 
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
index 36c625f4..b2bfa4c7 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
@@ -30,7 +30,7 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer
 
     assert trainer_has_error(trainer) is False
     assert trainer.training.training_state == TrainerState.TrainModelUploaded
-    assert trainer.training.model_id_for_detecting is not None
+    assert trainer.training.model_uuid_for_detecting is not None
     assert trainer.node.last_training_io.load() == trainer.training
 
 
@@ -68,7 +68,7 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai
 
     assert trainer_has_error(trainer)
     assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
-    assert trainer.training.model_id_for_detecting is None
+    assert trainer.training.model_uuid_for_detecting is None
     assert trainer.node.last_training_io.load() == trainer.training
 
 
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 985479b8..32377b01 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -18,10 +18,10 @@
 class TrainerLogic(TrainerLogicGeneric):
 
     def __init__(self, model_format: str) -> None:
+        """This class is the base class for all trainers that use an executor to run training processes.
+        The executor is used to run the training process in a separate process."""
+
         super().__init__(model_format)
-        self.model_format: str = model_format
-        # NOTE: String to be used in the file path for the model on the server:
-        # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
         self._detection_progress: Optional[float] = None
         self._executor: Optional[Executor] = None
         self.start_training_task: Optional[Coroutine] = None
@@ -49,9 +49,10 @@ async def _train(self) -> None:
 
         try:
             await self._start_training()
-
             last_sync_time = datetime.now()
+
             while True:
+                await asyncio.sleep(0.1)
                 if not self.executor.is_running():
                     break
                 if (datetime.now() - last_sync_time).total_seconds() > 5:
@@ -65,19 +66,16 @@ async def _train(self) -> None:
                         logging.warning('CancelledError in run_training')
                         raise
                     except Exception:
-                        pass
-                else:
-                    await asyncio.sleep(0.1)
+                        logging.error('Error in sync_confusion_matrix (this error is ignored)')
 
-            error = self._get_executor_error_from_log()
-            if error:
+            if error := self._get_executor_error_from_log():
                 raise TrainingError(cause=error)
 
             if self.executor.return_code != 0:  # TODO check if this works to catch errors from the executor:
                 raise TrainingError(cause=f'Executor returned with error code: {self.executor.return_code}')
 
         except TrainingError:
-            logging.exception('Error in TrainingProcess')
+            logging.exception('Exception in trainer_logic._train')
             if self.executor.is_running():
                 self.executor.stop()
             self.training.training_state = previous_state
@@ -85,7 +83,7 @@ async def _train(self) -> None:
 
     async def _do_detections(self) -> None:
         context = self.training.context
-        model_id = self.training.model_id_for_detecting
+        model_id = self.training.model_uuid_for_detecting
         assert model_id, 'model_id must be set'
         tmp_folder = f'/tmp/model_for_auto_detections_{model_id}_{self.model_format}'
 
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index 34535bbc..44cc7463 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -340,12 +340,12 @@ async def _sync_confusion_matrix(self) -> None:
     async def _upload_model(self) -> None:
         """Uploads the latest model to the Learning Loop.
         """
-        new_model_id = await self._upload_model_return_new_model_uuid(self.training.context)
-        if new_model_id is None:
+        new_model_uuid = await self._upload_model_return_new_model_uuid(self.training.context)
+        if new_model_uuid is None:
             self.training.training_state = TrainerState.ReadyForCleanup
             logging.error('could not upload model - maybe training failed.. cleaning up')
-        logging.info(f'Successfully uploaded model and received new model id: {new_model_id}')
-        self.training.model_id_for_detecting = new_model_id
+        logging.info(f'Successfully uploaded model and received new model id: {new_model_uuid}')
+        self.training.model_uuid_for_detecting = new_model_uuid
 
     async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
         """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file.
diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py
index 7b909db3..a1e3b471 100644
--- a/mock_trainer/app_code/tests/test_detections.py
+++ b/mock_trainer/app_code/tests/test_detections.py
@@ -35,7 +35,7 @@ async def test_all(setup_test_project1, glc: LoopCommunicator):
 
     project_folder = create_project_folder(context)
     training = generate_training(project_folder, context)
-    training.model_id_for_detecting = latest_model_id
+    training.model_uuid_for_detecting = latest_model_id
     trainer._training = training
     await trainer._do_detections()
     detections = trainer.active_training_io.load_detections()

From 161dde5b4bf810a9d7c91c74ce55ba01a74ec8f7 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Tue, 26 Mar 2024 09:40:03 +0100
Subject: [PATCH 26/62] fix tests

---
 .../trainer/tests/states/test_state_detecting.py       | 10 ++++------
 .../trainer/tests/states/test_state_train.py           |  9 +++------
 .../trainer/tests/testing_trainer_logic.py             |  6 +++---
 learning_loop_node/trainer/trainer_logic.py            |  7 +++++--
 4 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py
index 770429c8..5492f8dc 100644
--- a/learning_loop_node/trainer/tests/states/test_state_detecting.py
+++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py
@@ -17,12 +17,10 @@ def trainer_has_error(trainer: TrainerLogic):
 async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic):  # NOTE was a flaky test
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state='train_model_uploaded',
-                                model_id_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a')
+                                model_uuid_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a')
     # trainer.load_active_training()
     _ = asyncio.get_running_loop().create_task(
-        trainer._perform_state('do_detections', TrainerState.Detecting,
-                               TrainerState.Detected, trainer._do_detections)
-    )
+        trainer._perform_state('do_detections', TrainerState.Detecting, TrainerState.Detected, trainer._do_detections))
 
     await assert_training_state(trainer.training, TrainerState.Detecting, timeout=1, interval=0.001)
     await assert_training_state(trainer.training, TrainerState.Detected, timeout=10, interval=0.001)
@@ -45,7 +43,7 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer
     await trainer.stop()
     await asyncio.sleep(0.1)
 
-    assert trainer._training is None  # pylint: disable=protected-access
+    assert trainer._training is None
     assert trainer.active_training_io.detections_exist() is False
     assert trainer.node.last_training_io.exists() is False
 
@@ -53,7 +51,7 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer
 async def test_model_not_downloadable_error(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded,
-                                model_id_for_detecting='00000000-0000-0000-0000-000000000000')  # bad model id
+                                model_uuid_for_detecting='00000000-0000-0000-0000-000000000000')  # bad model id
     trainer._init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer._run())
diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py
index 3c6b579a..f5ac282f 100644
--- a/learning_loop_node/trainer/tests/states/test_state_train.py
+++ b/learning_loop_node/trainer/tests/states/test_state_train.py
@@ -19,7 +19,6 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic
     await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
     assert trainer.start_training_task is not None
-    assert trainer.start_training_task.__name__ == 'start_training'
 
     assert trainer._executor is not None
     trainer._executor.stop()  # NOTE normally a training terminates itself
@@ -38,12 +37,11 @@ async def test_stop_running_training(test_initialized_trainer: TestingTrainerLog
     _ = asyncio.get_running_loop().create_task(trainer._run())
 
     await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01)
-    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.01)
     assert trainer.start_training_task is not None
-    assert trainer.start_training_task.__name__ == 'start_training'
 
     await trainer.stop()
-    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=2, interval=0.01)
 
     assert trainer.training.training_state == TrainerState.TrainingFinished
     assert trainer.node.last_training_io.load() == trainer.training
@@ -55,14 +53,13 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain
     # NOTE e.g. when a node-computer is restarted
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
     trainer._init_from_last_training()
-    trainer._can_resume = True
+    trainer._can_resume_flag = True
 
     _ = asyncio.get_running_loop().create_task(trainer._run())
 
     await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01)
     await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
-    assert trainer.start_training_task.__name__ == 'resume'
 
     assert trainer._executor is not None
     trainer._executor.stop()  # NOTE normally a training terminates itself e.g
diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index d4daaff1..73b1b5a9 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -11,7 +11,7 @@ class TestingTrainerLogic(TrainerLogic):
 
     def __init__(self, can_resume: bool = False) -> None:
         super().__init__('mocked')
-        self._can_resume: bool = can_resume
+        self._can_resume_flag: bool = can_resume
         self.has_new_model: bool = False
         self.error_msg: Optional[str] = None
 
@@ -68,7 +68,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
         assert isinstance(result, str)
         return result
 
-    def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
+    def _get_latest_model_files(self) -> Dict[str, List[str]]:
         time.sleep(1)  # NOTE reduce flakyness in Backend tests du to wrong order of events.
         fake_weight_file = '/tmp/weightfile.weights'
         with open(fake_weight_file, 'wb') as f:
@@ -80,7 +80,7 @@ def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
         return {'mocked': [fake_weight_file, more_data_file], 'mocked_2': [fake_weight_file, more_data_file]}
 
     def _can_resume(self) -> bool:
-        return self._can_resume
+        return self._can_resume_flag
 
     async def _resume(self) -> None:
         return await self._start_training_from_base_model()
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 32377b01..c2e73eb1 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -71,8 +71,9 @@ async def _train(self) -> None:
             if error := self._get_executor_error_from_log():
                 raise TrainingError(cause=error)
 
-            if self.executor.return_code != 0:  # TODO check if this works to catch errors from the executor:
-                raise TrainingError(cause=f'Executor returned with error code: {self.executor.return_code}')
+            # NOTE: This is problematic, because the return code is not 0 when the executor was stopped, e.g. via self.stop()
+            # if self.executor.return_code != 0:
+            #     raise TrainingError(cause=f'Executor returned with error code: {self.executor.return_code}')
 
         except TrainingError:
             logging.exception('Exception in trainer_logic._train')
@@ -139,6 +140,8 @@ async def _start_training(self):
 
     async def stop(self) -> None:
         """If executor is running, stop it. Else cancel training task."""
+        print('===============> stop received in trainer_logic.', flush=True)
+
         if not self.training_active:
             return
         if self._executor and self._executor.is_running():

From 875f184a84870d82200076f9f9ddc0a821bf09e0 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Tue, 26 Mar 2024 10:47:00 +0100
Subject: [PATCH 27/62] Refactoring

---
 learning_loop_node/data_classes/training.py               | 7 ++++---
 learning_loop_node/trainer/tests/testing_trainer_logic.py | 2 +-
 learning_loop_node/trainer/trainer_logic.py               | 2 +-
 learning_loop_node/trainer/trainer_logic_generic.py       | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index f65503b5..ecb3025c 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -107,8 +107,9 @@ class Training():
     training_folder: str  # f'{project_folder}/trainings/{trainings_id}'
     start_time: float = field(default_factory=time.time)
 
-    # model uuid to download (to continue training) | is not a uuid when training from scratch (blank or pt-name ?!)
-    base_model_id: Optional[str] = None
+    # model uuid to download (to continue training) | is not a uuid when training from scratch (blank or pt-name from provided_pretrained_models->name)
+    base_model_uuid_or_name: Optional[str] = None
+
     data: Optional[TrainingData] = None
     training_number: Optional[int] = None
     training_state: Optional[str] = None
@@ -123,7 +124,7 @@ def set_values_from_data(self, data: Dict) -> None:
         self.data = TrainingData(categories=Category.from_list(data['categories']))
         self.data.hyperparameter = Hyperparameter.from_data(data=data)
         self.training_number = data['training_number']
-        self.base_model_id = data['id']
+        self.base_model_uuid_or_name = data['id']
         self.training_state = TrainerState.Initialized
 
 
diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index 73b1b5a9..02ffa2d4 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -36,7 +36,7 @@ async def _start_training_from_base_model(self, model: str = 'model.model') -> N
         self._executor.start('while true; do sleep 1; done')
 
     async def _start_training_from_scratch(self) -> None:
-        base_model_id = self.training.base_model_id
+        base_model_id = self.training.base_model_uuid_or_name
         assert base_model_id is not None
         await self._start_training_from_base_model(model=f'model_{base_model_id}.pt')
 
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index c2e73eb1..0e67d992 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -129,7 +129,7 @@ async def _start_training(self):
         if self._can_resume():
             self.start_training_task = self._resume()
         else:
-            base_model_id = self.training.base_model_id
+            base_model_id = self.training.base_model_uuid_or_name
             if not is_valid_uuid4(base_model_id):
                 self.start_training_task = self._start_training_from_scratch()
             else:
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index 44cc7463..d329e488 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -294,7 +294,7 @@ async def _download_model(self) -> None:
         """If training is continued, the model is downloaded from the Learning Loop to the training_folder.
         The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training.
         """
-        base_model_id = self.training.base_model_id
+        base_model_id = self.training.base_model_uuid_or_name
 
         # TODO this checks if we continue a training -> make more explicit
         if not base_model_id or not is_valid_uuid4(base_model_id):

From 101c7faf084358bc1d5f2296d91b44840f9c8b90 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Tue, 26 Mar 2024 10:56:55 +0100
Subject: [PATCH 28/62] Refactoring, fix tests

---
 learning_loop_node/helpers/misc.py                 |  2 ++
 .../states/test_state_download_train_model.py      |  2 +-
 .../trainer/tests/testing_trainer_logic.py         |  5 ++---
 .../trainer/trainer_logic_generic.py               | 14 ++++++++------
 4 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py
index 0f75509b..aea20e60 100644
--- a/learning_loop_node/helpers/misc.py
+++ b/learning_loop_node/helpers/misc.py
@@ -152,6 +152,8 @@ async def wrapper_ensure_socket_response(*args, **kwargs):
 
 
 def is_valid_uuid4(val):
+    if not val:
+        return False
     try:
         _ = UUID(str(val)).version
         return True
diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
index 1679f70b..282a2288 100644
--- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
@@ -50,7 +50,7 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogi
 async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.DataDownloaded,
-                                base_model_id='00000000-0000-0000-0000-000000000000')  # bad model id)
+                                base_model_uuid_or_name='00000000-0000-0000-0000-000000000000')  # bad model id)
     trainer._init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer._run())
diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index 02ffa2d4..dacfd2b6 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -36,9 +36,8 @@ async def _start_training_from_base_model(self, model: str = 'model.model') -> N
         self._executor.start('while true; do sleep 1; done')
 
     async def _start_training_from_scratch(self) -> None:
-        base_model_id = self.training.base_model_uuid_or_name
-        assert base_model_id is not None
-        await self._start_training_from_base_model(model=f'model_{base_model_id}.pt')
+        assert self.training.base_model_uuid_or_name is not None, 'base_model_uuid_or_name must be set'
+        await self._start_training_from_base_model(model=f'model_{self.training.base_model_uuid_or_name}.pt')
 
     def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
         if self.has_new_model:
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index d329e488..e9abdc1a 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -157,7 +157,9 @@ def model_architecture(self) -> Optional[str]:
     @property
     @abstractmethod
     def provided_pretrained_models(self) -> List[PretrainedModel]:
-        """Returns the list of provided pretrained models"""
+        """Returns the list of provided pretrained models.
+        The names of the models will come back as model_uuid_or_name in the training details.
+        """
         raise NotImplementedError
 
     # ---------------------------------------- METHODS ----------------------------------------
@@ -294,16 +296,16 @@ async def _download_model(self) -> None:
         """If training is continued, the model is downloaded from the Learning Loop to the training_folder.
         The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training.
         """
-        base_model_id = self.training.base_model_uuid_or_name
+        base_model_uuid = self.training.base_model_uuid_or_name
 
         # TODO this checks if we continue a training -> make more explicit
-        if not base_model_id or not is_valid_uuid4(base_model_id):
-            logging.info(f'skipping model download. No base model id provided: {base_model_id}')
+        if not is_valid_uuid4(base_model_uuid):
+            logging.info(f'skipping model download. No base model provided (in form of uuid): {base_model_uuid}')
             return
 
         logging.info('loading model from Learning Loop')
-        logging.info(f'downloading model {base_model_id} as {self.model_format}')
-        await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_id, self.model_format)
+        logging.info(f'downloading model {base_model_uuid} as {self.model_format}')
+        await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_uuid, self.model_format)
         shutil.move(f'{self.training.training_folder}/model.json',
                     f'{self.training.training_folder}/base_model.json')
 

From 7b64f9cd14659ac8c898a83f9708bdb920b6a223 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Tue, 26 Mar 2024 13:00:09 +0100
Subject: [PATCH 29/62] Minor fixes

---
 learning_loop_node/__init__.py                      | 4 ++--
 learning_loop_node/trainer/trainer_logic_generic.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/learning_loop_node/__init__.py b/learning_loop_node/__init__.py
index 5f4433bc..2fa5362e 100644
--- a/learning_loop_node/__init__.py
+++ b/learning_loop_node/__init__.py
@@ -1,6 +1,4 @@
 import logging
-import os
-import sys
 
 # from . import log_conf
 from .detector.detector_logic import DetectorLogic
@@ -8,4 +6,6 @@
 from .globals import GLOBALS
 from .trainer.trainer_node import TrainerNode
 
+__all__ = ['TrainerNode', 'DetectorNode', 'DetectorLogic', 'GLOBALS']
+
 logging.info('>>>>>>>>>>>>>>>>>> LOOP INITIALIZED <<<<<<<<<<<<<<<<<<<<<<<')
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index e9abdc1a..6334e82e 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -456,7 +456,7 @@ def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
         """Is called frequently by `_sync_confusion_matrix` to check if a new "best" model is availabe.
         Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information).
         `confusion_matrix` contains a dict of all classes:
-            - The classes must be identified by their id, not their name.
+            - The classes must be identified by their uuid, not their name.
             - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives).
         `meta_information` can hold any data which is helpful for self._on_metrics_published to store weight file etc for later upload via self.get_model_files
         """

From 676484f9bf2921bcdf79a42d40012693a5fcdafd Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Tue, 26 Mar 2024 18:06:35 +0100
Subject: [PATCH 30/62] Prevent deadlock when training is stopped before a
 valid model was created

---
 learning_loop_node/trainer/io_helpers.py    | 3 ++-
 learning_loop_node/trainer/trainer_logic.py | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/learning_loop_node/trainer/io_helpers.py b/learning_loop_node/trainer/io_helpers.py
index 1ae3bd43..4849d67a 100644
--- a/learning_loop_node/trainer/io_helpers.py
+++ b/learning_loop_node/trainer/io_helpers.py
@@ -147,7 +147,8 @@ async def upload_detetions(self):
         num_files = self.get_number_of_detection_files()
         print(f'num_files: {num_files}', flush=True)
         if not num_files:
-            raise Exception('no detection files found')
+            logging.error('no detection files found')
+            return
         current_json_file_index = self.load_detections_upload_file_index()
         for i in range(current_json_file_index, num_files):
             detections = self.load_detections(i)
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 0e67d992..8cbba5e2 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -85,7 +85,9 @@ async def _train(self) -> None:
     async def _do_detections(self) -> None:
         context = self.training.context
         model_id = self.training.model_uuid_for_detecting
-        assert model_id, 'model_id must be set'
+        if not model_id:
+            logging.error('model_id is not set! Cannot do detections.')
+            return
         tmp_folder = f'/tmp/model_for_auto_detections_{model_id}_{self.model_format}'
 
         shutil.rmtree(tmp_folder, ignore_errors=True)

From 4a6636f921efe65059e28221e3ad0e16f0cceba5 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 15:59:37 +0200
Subject: [PATCH 31/62] make _get_latest_model_files async and don't run it on
 threadpool

---
 .../trainer/trainer_logic_generic.py            | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index 6334e82e..27819b2b 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -6,7 +6,7 @@
 import time
 from abc import ABC, abstractmethod
 from dataclasses import asdict
-from typing import TYPE_CHECKING, Callable, Coroutine, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Callable, Coroutine, Dict, List, Optional
 
 from fastapi.encoders import jsonable_encoder
 
@@ -327,12 +327,10 @@ async def _sync_confusion_matrix(self) -> None:
                 result = await self.node.sio_client.call('update_training', (
                     self.training.context.organization, self.training.context.project, jsonable_encoder(new_training)))
                 if isinstance(result,  dict) and result['success']:
-                    logging.info(
-                        f'successfully updated training {asdict(new_training)}')
+                    logging.info(f'successfully updated training {asdict(new_training)}')
                     self._on_metrics_published(new_best_model)
                 else:
-                    raise Exception(
-                        f'Error for update_training: Response from loop was : {result}')
+                    raise Exception(f'Error for update_training: Response from loop was : {result}')
         except Exception as e:
             logging.exception('Error during confusion matrix syncronization')
             self.errors.set(error_key, str(e))
@@ -353,10 +351,9 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
         """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file.
         Note that with the latest trainers the conversion to (.wts) is done by the trainer.
         The conversion from .wts to .engine is done by the detector (needs to be done on target hardware).
-        Note that trainer may train with different classes, which is why we send an initial model.json file.
-        """
-        # NOTE: I guess this is in executor because originally the conversion happened here..
-        files = await asyncio.get_running_loop().run_in_executor(None, self._get_latest_model_files)
+        Note that trainer may train with different classes, which is why we send an initial model.json file."""
+
+        files = await self._get_latest_model_files()
         if files is None:
             return None
 
@@ -472,7 +469,7 @@ def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def _get_latest_model_files(self) -> Dict[str, List[str]]:
+    async def _get_latest_model_files(self) -> Dict[str, List[str]]:
         """Called when the Learning Loop requests to backup the latest model for the training.
         This function is used to __generate and gather__ all files needed for transfering the actual data from the trainer node to the Learning Loop.
         In the simplest implementation this method just renames the weight file (e.g. stored in TrainingStateData.meta_information) into a file name like latest_published_model

From 9cddd10092bf88fc55b020ad713b37881ae0e6ee Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 16:08:44 +0200
Subject: [PATCH 32/62] make sure no old cookies are used

---
 learning_loop_node/loop_communication.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index 99d9f70b..57feaf4b 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -24,6 +24,7 @@ def __init__(self) -> None:
         self.project: str = environment_reader.project()  # used by mock_detector
         self.base_url: str = f'http{"s" if "learning-loop.ai" in host else ""}://' + host
         self.async_client: httpx.AsyncClient = httpx.AsyncClient(base_url=self.base_url, timeout=Timeout(60.0))
+        self.async_client.cookies.clear()
 
         logging.info(f'Loop interface initialized with base_url: {self.base_url} / user: {self.username}')
 

From ccde1c34ad5fe0f9997a277e3c6935e43d084f5c Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 16:09:00 +0200
Subject: [PATCH 33/62] refactoring

---
 learning_loop_node/detector/detector_node.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py
index db657698..92b5fa21 100644
--- a/learning_loop_node/detector/detector_node.py
+++ b/learning_loop_node/detector/detector_node.py
@@ -223,8 +223,7 @@ async def _check_for_update(self) -> None:
                     await self.data_exchanger.download_model(target_model_folder,
                                                              Context(organization=self.organization,
                                                                      project=self.project),
-                                                             update_to_model_id,
-                                                             self.detector_logic.model_format)
+                                                             update_to_model_id, self.detector_logic.model_format)
                     try:
                         os.unlink(model_symlink)
                         os.remove(model_symlink)

From a4e2167e8bfe3dabb78cb112364449b7e25f6d1d Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 16:09:37 +0200
Subject: [PATCH 34/62] simplify process executor and use async process api

---
 learning_loop_node/trainer/executor.py | 149 +++++++++++--------------
 1 file changed, 67 insertions(+), 82 deletions(-)

diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py
index e823c0d4..2e50e498 100644
--- a/learning_loop_node/trainer/executor.py
+++ b/learning_loop_node/trainer/executor.py
@@ -1,106 +1,91 @@
-
-import ctypes
+import asyncio
 import logging
 import os
-import signal
-import subprocess
-from sys import platform
+import shlex
+from io import BufferedWriter
 from typing import List, Optional
 
-import psutil
-
-
-def create_signal_handler(sig=signal.SIGTERM):
-    if platform in ('linux', 'linux2'):
-        # "The system will send a signal to the child once the parent exits for any reason (even sigkill)."
-        # https://stackoverflow.com/a/19448096
-        libc = ctypes.CDLL("libc.so.6")
-
-        def callable_():
-            os.setsid()
-            return libc.prctl(1, sig)
-
-        return callable_
-    return os.setsid
-
 
 class Executor:
-    def __init__(self, base_path: str) -> None:
+    def __init__(self, base_path: str, log_name='last_training.log') -> None:
+        """An executor that runs a command in a separate async subprocess.
+        The log of the process is written to `log_name` (default: 'last_training.log') in the base_path.
+        The process is executed in the base_path directory.
+        The process should be awaited to finish using `wait` or stopped using `stop` to 
+        avoid zombie processes and close the log file."""
+
         self.path = base_path
+        self.log_file_path = f'{self.path}/{log_name}'
+        self.log_file: None | BufferedWriter = None
+        self._process: Optional[asyncio.subprocess.Process] = None  # pylint: disable=no-member
         os.makedirs(self.path, exist_ok=True)
-        self._process: Optional[subprocess.Popen[bytes]] = None
-
-    def start(self, cmd: str) -> None:
-        logging.info(f'Starting executor with command: {cmd}')
-        with open(f'{self.path}/last_training.log', 'a') as f:
-            f.write(f'\nStarting executor with command: {cmd}\n')
-
-        # pylint: disable=subprocess-popen-preexec-fn
-        self._process = subprocess.Popen(
-            f'cd {self.path}; {cmd} >> last_training.log 2>&1',
-            shell=True,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            executable='/bin/bash',
-            preexec_fn=create_signal_handler())
 
-    def is_running(self) -> bool:
-        if self._process is None:
-            return False
+    async def start(self, cmd: str, env: Optional[dict[str, str]] = None) -> None:
+        """Start the process with the given command and environment variables."""
 
-        if self._process.poll() is not None:
-            return False
+        full_env = os.environ.copy()
+        if env is not None:
+            full_env.update(env)
 
-        try:
-            psutil.Process(self._process.pid)
-        except psutil.NoSuchProcess:
-            # self.process.terminate() # TODO does this make sense?
-            self._process = None
-            return False
+        logging.info(f'Starting executor with command: {cmd} in {self.path} - logging to {self.log_file_path}')
+        self.log_file = open(self.log_file_path, 'ab')
 
-        return True
+        self._process = await asyncio.create_subprocess_exec(
+            *shlex.split(cmd),
+            cwd=self.path,
+            stdout=self.log_file,
+            stderr=asyncio.subprocess.STDOUT,  # Merge stderr with stdout
+            env=full_env
+        )
+
+    def is_running(self) -> bool:
+        """Check if the process is still running."""
+        return self._process is not None and self._process.returncode is None
 
     def get_log(self) -> str:
-        try:
-            with open(f'{self.path}/last_training.log') as f:
-                return f.read()
-        except Exception:
+        """Get the log of the process as a string."""
+        if not os.path.exists(self.log_file_path):
             return ''
+        with open(self.log_file_path, 'r') as f:
+            return f.read()
 
-    def get_log_by_lines(self, since_last_start=False) -> List[str]:  # TODO do not read whole log again
-        try:
-            with open(f'{self.path}/last_training.log') as f:
-                lines = f.readlines()
-            if since_last_start:
-                lines_since_last_start = []
-                for line in reversed(lines):
-                    lines_since_last_start.append(line)
-                    if line.startswith('Starting executor with command:'):
-                        break
-                return list(reversed(lines_since_last_start))
-            return lines
-        except Exception:
+    def get_log_by_lines(self, tail: Optional[int] = None) -> List[str]:
+        """Get the log of the process as a list of lines."""
+        if not os.path.exists(self.log_file_path):
             return []
+        with open(self.log_file_path) as f:
+            lines = f.readlines()
+        if tail is not None:
+            lines = lines[-tail:]
+        return lines
 
-    def stop(self):
-        if self._process is None:
-            logging.info('no process running ... nothing to stop')
-            return
+    def close_log(self):
+        """Close the log file."""
+        if self.log_file is not None:
+            self.log_file.close()
+            self.log_file = None
 
-        logging.info('terminating process')
+    async def wait(self) -> Optional[int]:
+        """Wait for the process to finish. Returns the return code of the process."""
 
-        try:
-            os.killpg(os.getpgid(self._process.pid), signal.SIGTERM)
-        except ProcessLookupError:
-            pass
+        if not self._process:
+            logging.info('No process started... nothing to wait for')
+            return None
+        return_code = await self._process.wait()
+        self.close_log()
+        return return_code
 
-        self._process.terminate()
-        _, _ = self._process.communicate(timeout=3)
+    async def stop(self) -> Optional[int]:
+        """Stop the process and wait for it to finish. Returns the return code of the process."""
 
-    @property
-    def return_code(self):
-        if not self._process:
+        if self._process is None:
+            logging.info('No process started... nothing to stop')
             return None
-        if self.is_running():
+
+        try:
+            self._process.terminate()
+        except ProcessLookupError:
+            logging.info('Process not found... nothing to stop')
             return None
-        return self._process.poll()
+
+        return await self.wait()

From 4c254bb5050d94dc8eae6153299b3311be063cec Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 16:34:07 +0200
Subject: [PATCH 35/62] Refactor executor

---
 learning_loop_node/trainer/executor.py | 69 ++++++++++++++++----------
 1 file changed, 44 insertions(+), 25 deletions(-)

diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py
index 2e50e498..0ffa4da6 100644
--- a/learning_loop_node/trainer/executor.py
+++ b/learning_loop_node/trainer/executor.py
@@ -19,6 +19,13 @@ def __init__(self, base_path: str, log_name='last_training.log') -> None:
         self.log_file: None | BufferedWriter = None
         self._process: Optional[asyncio.subprocess.Process] = None  # pylint: disable=no-member
         os.makedirs(self.path, exist_ok=True)
+        return None
+
+    def _get_running_process(self) -> Optional[asyncio.subprocess.Process]:  # pylint: disable=no-member
+        """Get the running process if available."""
+        if self._process is not None and self._process.returncode is None:
+            return self._process
+        return None
 
     async def start(self, cmd: str, env: Optional[dict[str, str]] = None) -> None:
         """Start the process with the given command and environment variables."""
@@ -42,6 +49,43 @@ def is_running(self) -> bool:
         """Check if the process is still running."""
         return self._process is not None and self._process.returncode is None
 
+    def terminate(self) -> None:
+        """Terminate the process."""
+
+        if process := self._get_running_process():
+            try:
+                process.terminate()
+                return
+            except ProcessLookupError:
+                logging.error('No process to terminate')
+        self._process = None
+
+    async def wait(self) -> Optional[int]:
+        """Wait for the process to finish. Returns the return code of the process or None if no process is running."""
+
+        if not self._process:
+            logging.info('No process to wait for')
+            return None
+
+        return_code = await self._process.wait()
+
+        self.close_log()
+        self._process = None
+
+        return return_code
+
+    async def stop_and_wait(self) -> Optional[int]:
+        """Terminate the process and wait for it to finish. Returns the return code of the process."""
+
+        if not self.is_running():
+            logging.info('No process to stop')
+            return None
+
+        self.terminate()
+        return await self.wait()
+
+    # -------------------------------------------------------------------------------------------- LOGGING
+
     def get_log(self) -> str:
         """Get the log of the process as a string."""
         if not os.path.exists(self.log_file_path):
@@ -64,28 +108,3 @@ def close_log(self):
         if self.log_file is not None:
             self.log_file.close()
             self.log_file = None
-
-    async def wait(self) -> Optional[int]:
-        """Wait for the process to finish. Returns the return code of the process."""
-
-        if not self._process:
-            logging.info('No process started... nothing to wait for')
-            return None
-        return_code = await self._process.wait()
-        self.close_log()
-        return return_code
-
-    async def stop(self) -> Optional[int]:
-        """Stop the process and wait for it to finish. Returns the return code of the process."""
-
-        if self._process is None:
-            logging.info('No process started... nothing to stop')
-            return None
-
-        try:
-            self._process.terminate()
-        except ProcessLookupError:
-            logging.info('Process not found... nothing to stop')
-            return None
-
-        return await self.wait()

From c8d64afdb74df8b22bc7f98120cf4b2b2d9a2120 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 16:34:59 +0200
Subject: [PATCH 36/62] adapt backdoor controls to async api

---
 learning_loop_node/trainer/rest/backdoor_controls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py
index e3b17ed3..e2dafc26 100644
--- a/learning_loop_node/trainer/rest/backdoor_controls.py
+++ b/learning_loop_node/trainer/rest/backdoor_controls.py
@@ -128,7 +128,7 @@ async def kill_process(request: Request):
     assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic'
     if not trainer_logic._executor or not trainer_logic._executor.is_running():
         raise HTTPException(status_code=409, detail="trainer is not running")
-    trainer_logic._executor.stop()
+    await trainer_logic._executor.stop_and_wait()
 
 
 @router.post("/force_status_update")

From 25966bd0cc515c4ea5634a3da34ab20a1da5959c Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 16:36:21 +0200
Subject: [PATCH 37/62] adapt trainer_logic to async executor api

---
 learning_loop_node/trainer/trainer_logic.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 8cbba5e2..286f05b3 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -77,8 +77,7 @@ async def _train(self) -> None:
 
         except TrainingError:
             logging.exception('Exception in trainer_logic._train')
-            if self.executor.is_running():
-                self.executor.stop()
+            await self.executor.stop_and_wait()
             self.training.training_state = previous_state
             raise
 
@@ -123,6 +122,7 @@ async def _do_detections(self) -> None:
             batch_images = images[i:i+batch_size]
             batch_detections = await self._detect(model_information, batch_images, tmp_folder)
             self.active_training_io.save_detections(batch_detections, idx)
+            break
 
     # ---------------------------------------- METHODS ----------------------------------------
 
@@ -147,7 +147,7 @@ async def stop(self) -> None:
         if not self.training_active:
             return
         if self._executor and self._executor.is_running():
-            self.executor.stop()
+            await self.executor.stop_and_wait()
         elif self.training_task:
             logging.info('cancelling training task')
             if self.training_task.cancel():

From 5d99f839d49ecef9412624f5398dd6aebf87c1e4 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 16:37:07 +0200
Subject: [PATCH 38/62] adapt tests to async executor api

---
 learning_loop_node/tests/test_executor.py                | 9 +++++----
 .../trainer/tests/states/test_state_train.py             | 4 ++--
 .../trainer/tests/testing_trainer_logic.py               | 9 ++++-----
 mock_trainer/app_code/mock_trainer_logic.py              | 4 ++--
 mock_trainer/app_code/tests/test_mock_trainer.py         | 2 +-
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py
index 1079ea1c..ab359c3c 100644
--- a/learning_loop_node/tests/test_executor.py
+++ b/learning_loop_node/tests/test_executor.py
@@ -21,16 +21,17 @@ def cleanup():
     cleanup_process.communicate()
 
 
-def test_executor_lifecycle():
+@pytest.mark.asyncio
+async def test_executor_lifecycle():
     assert_process_is_running('some_executable.sh', False)
 
     executor = Executor('/tmp/test_executor/' + str(uuid4()))
     cmd = executor.path + '/some_executable.sh'
     with open(cmd, 'w') as f:
-        f.write('while true; do echo "some output"; sleep 1; done')
+        f.write('/bin/bash -c "while true; do sleep 1; done"')
     os.chmod(cmd, 0o755)
 
-    executor.start(cmd)
+    await executor.start(cmd)
 
     assert executor.is_running()
     assert_process_is_running('some_executable.sh')
@@ -38,7 +39,7 @@ def test_executor_lifecycle():
     sleep(1)
     assert 'some output' in executor.get_log()
 
-    executor.stop()
+    await executor.stop_and_wait()
 
     assert not executor.is_running()
     sleep(1)
diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py
index f5ac282f..66fa2639 100644
--- a/learning_loop_node/trainer/tests/states/test_state_train.py
+++ b/learning_loop_node/trainer/tests/states/test_state_train.py
@@ -21,7 +21,7 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic
     assert trainer.start_training_task is not None
 
     assert trainer._executor is not None
-    trainer._executor.stop()  # NOTE normally a training terminates itself
+    await trainer._executor.stop_and_wait()  # NOTE normally a training terminates itself
     await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer.training.training_state == TrainerState.TrainingFinished
@@ -62,7 +62,7 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain
     assert trainer.start_training_task is not None
 
     assert trainer._executor is not None
-    trainer._executor.stop()  # NOTE normally a training terminates itself e.g
+    await trainer._executor.stop_and_wait()  # NOTE normally a training terminates itself e.g
     await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer.training.training_state == TrainerState.TrainingFinished
diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index dacfd2b6..62eba09a 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -25,15 +25,14 @@ def model_architecture(self) -> str:
 
     @property
     def provided_pretrained_models(self) -> List[PretrainedModel]:
-        return [
-            PretrainedModel(name='small', label='Small', description='a small model'),
-            PretrainedModel(name='medium', label='Medium', description='a medium model'),
-            PretrainedModel(name='large', label='Large', description='a large model')]
+        return [PretrainedModel(name='small', label='Small', description='a small model'),
+                PretrainedModel(name='medium', label='Medium', description='a medium model'),
+                PretrainedModel(name='large', label='Large', description='a large model')]
 
     # pylint: disable=unused-argument
     async def _start_training_from_base_model(self, model: str = 'model.model') -> None:
         assert self._executor is not None
-        self._executor.start('while true; do sleep 1; done')
+        await self._executor.start('/bin/bash -c "while true; do sleep 1; done"')
 
     async def _start_training_from_scratch(self) -> None:
         assert self.training.base_model_uuid_or_name is not None, 'base_model_uuid_or_name must be set'
diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py
index f4fb3fc8..51840904 100644
--- a/mock_trainer/app_code/mock_trainer_logic.py
+++ b/mock_trainer/app_code/mock_trainer_logic.py
@@ -33,11 +33,11 @@ async def _start_training_from_base_model(self) -> None:
         self.current_iteration = 0
         if self.error_configuration.begin_training:
             raise Exception('Could not start training')
-        self.executor.start('while true; do sleep 1; done')
+        await self.executor.start('/bin/bash -c "while true; do sleep 1; done"')
 
     async def _start_training_from_scratch(self) -> None:
         self.current_iteration = 0
-        self.executor.start('while true; do sleep 1; done')
+        await self.executor.start('/bin/bash -c "while true; do sleep 1; done"')
 
     def _get_executor_error_from_log(self) -> Optional[str]:
         if self.error_configuration.crash_training:
diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py
index 0946f991..a5d397f5 100644
--- a/mock_trainer/app_code/tests/test_mock_trainer.py
+++ b/mock_trainer/app_code/tests/test_mock_trainer.py
@@ -13,7 +13,7 @@
 
 async def create_mock_trainer() -> MockTrainerLogic:
     mock_trainer = MockTrainerLogic(model_format='mocked')
-    mock_trainer._executor = Executor(GLOBALS.data_folder)  # pylint: disable=protected-access
+    mock_trainer._executor = Executor(GLOBALS.data_folder)
     return mock_trainer
 
 

From 1f699f90edc762e10846b1d2c7f807a5ff6b43c3 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 16:38:48 +0200
Subject: [PATCH 39/62] adapt mock_trainer_logic to new api

---
 mock_trainer/app_code/mock_trainer_logic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py
index 51840904..d293758e 100644
--- a/mock_trainer/app_code/mock_trainer_logic.py
+++ b/mock_trainer/app_code/mock_trainer_logic.py
@@ -2,7 +2,7 @@
 import asyncio
 import logging
 import time
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional
 
 from learning_loop_node.data_classes import (BoxDetection, CategoryType, ClassificationDetection, Detections,
                                              ErrorConfiguration, ModelInformation, Point, PointDetection,
@@ -44,7 +44,7 @@ def _get_executor_error_from_log(self) -> Optional[str]:
             return 'mocked crash'
         return None
 
-    def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
+    async def _get_latest_model_files(self) -> Dict[str, List[str]]:
         if self.error_configuration.save_model:
             raise Exception()
 

From ed5cb3a9889840015dd9ebb09ca1bd56c3fa6b21 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 16:39:37 +0200
Subject: [PATCH 40/62] adapt testing_trainer_logic to new api

---
 learning_loop_node/trainer/tests/testing_trainer_logic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index 62eba09a..fc71f277 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -66,7 +66,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
         assert isinstance(result, str)
         return result
 
-    def _get_latest_model_files(self) -> Dict[str, List[str]]:
+    async def _get_latest_model_files(self) -> Dict[str, List[str]]:
         time.sleep(1)  # NOTE reduce flakyness in Backend tests du to wrong order of events.
         fake_weight_file = '/tmp/weightfile.weights'
         with open(fake_weight_file, 'wb') as f:

From 8266fa0bb1148486455dcfcfed6dc22216a54b84 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 16:40:30 +0200
Subject: [PATCH 41/62] fix typing error

---
 learning_loop_node/trainer/trainer_logic_generic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index 27819b2b..f790bbd9 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -299,7 +299,7 @@ async def _download_model(self) -> None:
         base_model_uuid = self.training.base_model_uuid_or_name
 
         # TODO this checks if we continue a training -> make more explicit
-        if not is_valid_uuid4(base_model_uuid):
+        if not base_model_uuid or not is_valid_uuid4(base_model_uuid):
             logging.info(f'skipping model download. No base model provided (in form of uuid): {base_model_uuid}')
             return
 

From f740e910b40c8fa1bc9d8b06e8bf334315f7684d Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 17:06:40 +0200
Subject: [PATCH 42/62] minor refactoring, fix of tests

---
 learning_loop_node/trainer/executor.py                    | 1 -
 learning_loop_node/trainer/tests/test_errors.py           | 5 ++++-
 learning_loop_node/trainer/tests/testing_trainer_logic.py | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py
index 0ffa4da6..082407ad 100644
--- a/learning_loop_node/trainer/executor.py
+++ b/learning_loop_node/trainer/executor.py
@@ -19,7 +19,6 @@ def __init__(self, base_path: str, log_name='last_training.log') -> None:
         self.log_file: None | BufferedWriter = None
         self._process: Optional[asyncio.subprocess.Process] = None  # pylint: disable=no-member
         os.makedirs(self.path, exist_ok=True)
-        return None
 
     def _get_running_process(self) -> Optional[asyncio.subprocess.Process]:  # pylint: disable=no-member
         """Get the running process if available."""
diff --git a/learning_loop_node/trainer/tests/test_errors.py b/learning_loop_node/trainer/tests/test_errors.py
index 507c494a..9a9c1cd8 100644
--- a/learning_loop_node/trainer/tests/test_errors.py
+++ b/learning_loop_node/trainer/tests/test_errors.py
@@ -1,6 +1,8 @@
 import asyncio
 import re
 
+import pytest
+
 from learning_loop_node.data_classes import TrainerState
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
@@ -19,6 +21,7 @@ async def test_training_process_is_stopped_when_trainer_reports_error(test_initi
     await assert_training_state(trainer.training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
 
 
+@pytest.mark.skip(reason='The since_last_start flag is deprecated.')
 async def test_log_can_provide_only_data_for_current_run(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
@@ -39,4 +42,4 @@ async def test_log_can_provide_only_data_for_current_run(test_initialized_traine
 
     assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) > 1
     # Here only the current run is provided
-    assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines(since_last_start=True)))) == 1
+    # assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines(since_last_start=True)))) == 1
diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index fc71f277..50171e08 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -1,6 +1,6 @@
 import asyncio
 import time
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional
 
 from learning_loop_node.data_classes import Context, Detections, ModelInformation, PretrainedModel, TrainingStateData
 from learning_loop_node.trainer.trainer_logic import TrainerLogic

From 0ed1673188708878f26289765ce5d883f6b41842 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 3 Apr 2024 17:47:40 +0200
Subject: [PATCH 43/62] simplify state string and do all detections

---
 learning_loop_node/data_classes/training.py | 2 +-
 learning_loop_node/trainer/trainer_logic.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index ecb3025c..d530ae7a 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -94,7 +94,7 @@ def short_str(self) -> str:
         cntxt = f'{self.context.organization}/{self.context.project}' if self.context else ''
         hyps = f'({self.hyperparameters})' if self.hyperparameters else ''
         arch = f'.{self.architecture} - ' if self.architecture else ''
-        return f'[{str(self.state)} {prgr}. {self.name}({self.id}). Tr/Ts/Tsk: {trtesk} {cntxt}{arch}{hyps}]'
+        return f'[{str(self.state).rsplit(".", maxsplit=1)[-1]} {prgr}. {self.name}({self.id}). Tr/Ts/Tsk: {trtesk} {cntxt}{arch}{hyps}]'
 
 
 @dataclass(**KWONLY_SLOTS)
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 286f05b3..93128b4b 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -122,7 +122,6 @@ async def _do_detections(self) -> None:
             batch_images = images[i:i+batch_size]
             batch_detections = await self._detect(model_information, batch_images, tmp_folder)
             self.active_training_io.save_detections(batch_detections, idx)
-            break
 
     # ---------------------------------------- METHODS ----------------------------------------
 

From dd6b998d4f58936a8bdf52ab0642e0ea35f4c43f Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Thu, 4 Apr 2024 13:18:47 +0200
Subject: [PATCH 44/62] fix test test_executor_lifecycle

---
 learning_loop_node/tests/test_executor.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py
index ab359c3c..38d50b4b 100644
--- a/learning_loop_node/tests/test_executor.py
+++ b/learning_loop_node/tests/test_executor.py
@@ -25,10 +25,10 @@ def cleanup():
 async def test_executor_lifecycle():
     assert_process_is_running('some_executable.sh', False)
 
-    executor = Executor('/tmp/test_executor/' + str(uuid4()))
-    cmd = executor.path + '/some_executable.sh'
-    with open(cmd, 'w') as f:
-        f.write('/bin/bash -c "while true; do sleep 1; done"')
+    executor = Executor('/tmp/test_executor/' + str(uuid4())+'/')
+    cmd = 'bash some_executable.sh'
+    with open(executor.path+'some_executable.sh', 'w') as f:
+        f.write('/bin/bash -c "while true; do sleep 1; echo some output; done"')
     os.chmod(cmd, 0o755)
 
     await executor.start(cmd)
@@ -49,6 +49,7 @@ async def test_executor_lifecycle():
 def assert_process_is_running(process_name, running=True):
     if running:
         for process in psutil.process_iter():
+            print(process.name(), process.cmdline())
             process_name_match = process_name in process.name()
             process_cmd_match = process_name in str(process.cmdline())
             if process_name_match or process_cmd_match:

From c156d0b6434630ed45670b63f906550a0779a069 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Thu, 4 Apr 2024 13:23:10 +0200
Subject: [PATCH 45/62] fix test test_go_to_cleanup_if_no_detections_exist

---
 .../tests/states/test_state_upload_detections.py      | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
index 8918eece..e2784514 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
@@ -133,7 +133,9 @@ async def test_bad_status_from_LearningLoop(test_initialized_trainer: TestingTra
     assert trainer.node.last_training_io.load() == trainer.training
 
 
-async def test_other_errors(test_initialized_trainer: TestingTrainerLogic):
+async def test_go_to_cleanup_if_no_detections_exist(test_initialized_trainer: TestingTrainerLogic):
+    """This test simulates a situation where the detection file is missing.
+    In this case, the trainer should report an error and move to the ReadyForCleanup state."""
     trainer = test_initialized_trainer
 
     # e.g. missing detection file
@@ -141,12 +143,7 @@ async def test_other_errors(test_initialized_trainer: TestingTrainerLogic):
     trainer._init_from_last_training()
 
     _ = asyncio.get_running_loop().create_task(trainer._run())
-    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
-    await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001)
-
-    assert trainer_has_error(trainer)
-    assert trainer.training.training_state == TrainerState.Detected
-    assert trainer.node.last_training_io.load() == trainer.training
+    await assert_training_state(trainer.training, TrainerState.ReadyForCleanup, timeout=1, interval=0.001)
 
 
 async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic):

From 185b9edcad44513075e43806f00957c5f315dee4 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Thu, 4 Apr 2024 18:56:15 +0200
Subject: [PATCH 46/62] deactivate asyncio warnings

---
 learning_loop_node/node.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py
index 5424c110..9418123e 100644
--- a/learning_loop_node/node.py
+++ b/learning_loop_node/node.py
@@ -78,7 +78,7 @@ async def lifespan(self, app: FastAPI):  # pylint: disable=unused-argument
 
     async def _on_startup(self):
         self.log.info('received "startup" lifecycle-event')
-        activate_asyncio_warnings()
+        # activate_asyncio_warnings()
         if self.needs_login:
             await self.loop_communicator.backend_ready()
             self.log.info('ensuring login')

From 6510f88503a791e13cdfdd0e8341901fce7f3dd0 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 5 Apr 2024 11:06:26 +0200
Subject: [PATCH 47/62] set request throttle to avoid Error 429

---
 learning_loop_node/data_exchanger.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py
index 0d4d2add..c9815e24 100644
--- a/learning_loop_node/data_exchanger.py
+++ b/learning_loop_node/data_exchanger.py
@@ -6,6 +6,7 @@
 from glob import glob
 from http import HTTPStatus
 from io import BytesIO
+from time import time
 from typing import Dict, List, Optional
 
 import aiofiles  # type: ignore
@@ -108,13 +109,15 @@ async def download_images(self, image_uuids: List[str], image_folder: str, chunk
             chunk_ids = image_uuids[i:i+chunk_size]
             tasks = []
             for j, chunk_j in enumerate(chunk_paths):
+                start = time()
                 tasks.append(create_task(self._download_one_image(chunk_j, chunk_ids[j], image_folder)))
+                await asyncio.sleep(max(0, 0.02 - (time() - start)))  # prevent too many requests at once
             await asyncio.gather(*tasks)
 
     async def _download_one_image(self, path: str, image_id: str, image_folder: str) -> None:
         response = await self.loop_communicator.get(path)
         if response.status_code != HTTPStatus.OK:
-            logging.error(f'bad status code {response.status_code} for {path}')
+            logging.error(f'bad status code {response.status_code} for {path}. Details: {response.text}')
             return
         filename = f'{image_folder}/{image_id}.jpg'
         async with aiofiles.open(filename, 'wb') as f:

From d811b53d47365f6e94399019b9ef29c75b2274f1 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 5 Apr 2024 11:07:47 +0200
Subject: [PATCH 48/62] make inference_batch_size an attribute that can be
 overwritten by children

---
 learning_loop_node/trainer/trainer_logic.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 93128b4b..108349ad 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -25,6 +25,7 @@ def __init__(self, model_format: str) -> None:
         self._detection_progress: Optional[float] = None
         self._executor: Optional[Executor] = None
         self.start_training_task: Optional[Coroutine] = None
+        self.inference_batch_size = 10
 
     # ---------------------------------------- IMPLEMENTED ABSTRACT PROPERTIES ----------------------------------------
 
@@ -116,10 +117,9 @@ async def _do_detections(self) -> None:
             self.active_training_io.save_detections([], 0)
         num_images = len(images)
 
-        batch_size = 200
-        for idx, i in enumerate(range(0, num_images, batch_size)):
+        for idx, i in enumerate(range(0, num_images, self.inference_batch_size)):
             self._detection_progress = 0.5 + (i/num_images)*0.5
-            batch_images = images[i:i+batch_size]
+            batch_images = images[i:i+self.inference_batch_size]
             batch_detections = await self._detect(model_information, batch_images, tmp_folder)
             self.active_training_io.save_detections(batch_detections, idx)
 

From a55788303ade5f44caad512763281b231c8eb7c5 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Fri, 5 Apr 2024 18:44:22 +0200
Subject: [PATCH 49/62] add 'model_size' to ModelInformation. Required when
 continuing training

Fix tests
---
 learning_loop_node/data_classes/general.py                  | 1 +
 learning_loop_node/tests/test_executor.py                   | 2 +-
 learning_loop_node/trainer/tests/states/test_state_train.py | 6 +++---
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py
index 5c616841..3ef5e412 100644
--- a/learning_loop_node/data_classes/general.py
+++ b/learning_loop_node/data_classes/general.py
@@ -53,6 +53,7 @@ class ModelInformation():
     categories: List[Category]
     resolution: Optional[int] = None
     model_root_path: Optional[str] = None
+    model_size: Optional[str] = None
 
     @property
     def context(self):
diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py
index 38d50b4b..7a69dca4 100644
--- a/learning_loop_node/tests/test_executor.py
+++ b/learning_loop_node/tests/test_executor.py
@@ -29,7 +29,7 @@ async def test_executor_lifecycle():
     cmd = 'bash some_executable.sh'
     with open(executor.path+'some_executable.sh', 'w') as f:
         f.write('/bin/bash -c "while true; do sleep 1; echo some output; done"')
-    os.chmod(cmd, 0o755)
+    os.chmod(executor.path+'some_executable.sh', 0o755)
 
     await executor.start(cmd)
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py
index 66fa2639..4e1d200c 100644
--- a/learning_loop_node/trainer/tests/states/test_state_train.py
+++ b/learning_loop_node/trainer/tests/states/test_state_train.py
@@ -16,12 +16,12 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic
 
     _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
-    await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
+    await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.01)
     assert trainer.start_training_task is not None
 
     assert trainer._executor is not None
-    await trainer._executor.stop_and_wait()  # NOTE normally a training terminates itself
+    await trainer.stop()  # NOTE normally a training terminates itself
     await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer.training.training_state == TrainerState.TrainingFinished

From d4551f33471cfc190da4186cd26243c96192afc1 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Mon, 8 Apr 2024 11:25:49 +0200
Subject: [PATCH 50/62] Improve logs. Cleanup tmp-dir after model download.

---
 learning_loop_node/data_exchanger.py        | 9 ++++++---
 learning_loop_node/trainer/trainer_logic.py | 4 ++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py
index c9815e24..92b899b4 100644
--- a/learning_loop_node/data_exchanger.py
+++ b/learning_loop_node/data_exchanger.py
@@ -126,8 +126,10 @@ async def _download_one_image(self, path: str, image_id: str, image_folder: str)
             os.remove(filename)
 
     async def download_model(self, target_folder: str, context: Context, model_uuid: str, model_format: str) -> List[str]:
-        """Downloads a model and returns the paths of the downloaded files."""
-        logging.info(f'Downloading model {model_uuid} to {target_folder}..')
+        """Downloads a model (and additional meta data like model.json) and returns the paths of the downloaded files.
+        Used before training a model (in case of resuming a training) or before detecting images.
+        """
+        logging.info(f'Downloading model data for uuid {model_uuid} from the loop to {target_folder}..')
 
         path = f'/{context.organization}/projects/{context.project}/models/{model_uuid}/{model_format}/file'
         response = await self.loop_communicator.get(path, requires_login=False)
@@ -153,7 +155,8 @@ async def download_model(self, target_folder: str, context: Context, model_uuid:
             new_file = shutil.move(file, target_folder)
             created_files.append(new_file)
 
-        logging.info(f'---- downloaded model {model_uuid}/{model_format} to {tmp_path}. Moved to {target_folder}.')
+        shutil.rmtree(tmp_path, ignore_errors=True)
+        logging.info(f'Downloaded model {model_uuid}({model_format}) to {target_folder}.')
         return created_files
 
     async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]:
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 108349ad..ea32b6dc 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -130,8 +130,8 @@ async def _start_training(self):
         if self._can_resume():
             self.start_training_task = self._resume()
         else:
-            base_model_id = self.training.base_model_uuid_or_name
-            if not is_valid_uuid4(base_model_id):
+            base_model_uuid_or_name = self.training.base_model_uuid_or_name
+            if not is_valid_uuid4(base_model_uuid_or_name):
                 self.start_training_task = self._start_training_from_scratch()
             else:
                 self.start_training_task = self._start_training_from_base_model()

From 26c9a33531eabed0a820d31b959aa33ee9559660 Mon Sep 17 00:00:00 2001
From: Niklas Neugebauer <68709968+NiklasNeugebauer@users.noreply.github.com>
Date: Mon, 8 Apr 2024 17:01:55 +0200
Subject: [PATCH 51/62] Update data_exchanger.py

clarify docstring
---
 learning_loop_node/data_exchanger.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py
index 92b899b4..9e8ffdb8 100644
--- a/learning_loop_node/data_exchanger.py
+++ b/learning_loop_node/data_exchanger.py
@@ -127,7 +127,7 @@ async def _download_one_image(self, path: str, image_id: str, image_folder: str)
 
     async def download_model(self, target_folder: str, context: Context, model_uuid: str, model_format: str) -> List[str]:
         """Downloads a model (and additional meta data like model.json) and returns the paths of the downloaded files.
-        Used before training a model (in case of resuming a training) or before detecting images.
+        Used before training a model (when continuing a finished training) or before detecting images.
         """
         logging.info(f'Downloading model data for uuid {model_uuid} from the loop to {target_folder}..')
 

From 1fa6c4ebae6c5025db3dcd97a1cbe9ca91f41f07 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Tue, 9 Apr 2024 11:29:03 +0200
Subject: [PATCH 52/62] Handle file opening error

---
 .syncignore                              |  5 +++++
 learning_loop_node/loop_communication.py | 10 +++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 .syncignore

diff --git a/.syncignore b/.syncignore
new file mode 100644
index 00000000..7bf361fc
--- /dev/null
+++ b/.syncignore
@@ -0,0 +1,5 @@
+.git/
+__pycache__/
+.DS_Store
+*.tmp
+.env
\ No newline at end of file
diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index 57feaf4b..901532fd 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -81,7 +81,15 @@ async def put(self, path, files: Optional[List[str]] = None, requires_login=True
         if files is None:
             return await self.async_client.put(api_prefix+path, **kwargs)
 
-        file_handles = [open(f, 'rb') for f in files]  # Open files and store handles
+        file_handles = []
+        for f in files:
+            try:
+                file_handles.append(open(f, 'rb'))
+            except FileNotFoundError:
+                for fh in file_handles:
+                    fh.close()  # Ensure all files are closed
+                return httpx.Response(404, content=b'File not found')
+
         try:
             file_list = [('files', fh) for fh in file_handles]  # Use file handles
             response = await self.async_client.put(api_prefix+path, files=file_list)

From 7cd73e1a0125e974aa2899564ba985853553fff8 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Tue, 9 Apr 2024 11:54:32 +0200
Subject: [PATCH 53/62] (re)enable todo/fixme warnings in pylint

---
 .vscode/settings.json | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index ff950a35..aec19884 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -9,7 +9,6 @@
     "--disable=C0111", // Missing docstring (in function/class/method)
     "--disable=C0114", // Missing module docstring
     "--disable=C0301", // Line too long (exceeds character limit)
-    "--disable=W0511", // TODO/FIXME not being used
     "--disable=W0718", // Catching too general exception
     "--disable=W0719", // Raising too general exception
     "--disable=W1203", // Use % formatting in logging functions and pass the % parameters as arguments

From 20adbc7b90e9201940ea3fd2d935a2a166d228bc Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Wed, 10 Apr 2024 10:21:25 +0200
Subject: [PATCH 54/62] improve handling of authorization in rest communication

---
 learning_loop_node/loop_communication.py | 29 +++++++++++++++++++-----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index 901532fd..62ecccd9 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -31,14 +31,14 @@ def __init__(self) -> None:
     def websocket_url(self) -> str:
         return f'ws{"s" if "learning-loop.ai" in self.host else ""}://' + self.host
 
-    async def ensure_login(self) -> None:
+    async def ensure_login(self, relogin=False) -> None:
         """aiohttp client session needs to be created on the event loop"""
 
         assert not self.async_client.is_closed, 'async client must not be used after shutdown'
-        if not self.async_client.cookies.keys():
+        if not self.async_client.cookies.keys() or relogin:
+            self.async_client.cookies.clear()
             response = await self.async_client.post('/api/login', data={'username': self.username, 'password': self.password})
             if response.status_code != 200:
-                self.async_client.cookies.clear()
                 logging.info(f'Login failed with response: {response}')
                 raise LoopCommunicationException('Login failed with response: ' + str(response))
             self.async_client.cookies.update(response.cookies)
@@ -50,6 +50,7 @@ async def logout(self) -> None:
         if response.status_code != 200:
             logging.info(f'Logout failed with response: {response}')
             raise LoopCommunicationException('Logout failed with response: ' + str(response))
+        self.async_client.cookies.clear()
 
     def get_cookies(self) -> Cookies:
         return self.async_client.cookies
@@ -73,7 +74,12 @@ async def backend_ready(self) -> bool:
     async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/api') -> httpx.Response:
         if requires_login:
             await self.ensure_login()
-        return await self.async_client.get(api_prefix+path)
+
+        response = await self.async_client.get(api_prefix+path)
+
+        if response.status_code == 401:
+            await self.ensure_login(relogin=True)
+        return response
 
     async def put(self, path, files: Optional[List[str]] = None, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response:
         if requires_login:
@@ -97,14 +103,25 @@ async def put(self, path, files: Optional[List[str]] = None, requires_login=True
             for fh in file_handles:
                 fh.close()  # Ensure all files are closed
 
+        if response.status_code == 401:
+            await self.ensure_login(relogin=True)
         return response
 
     async def post(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response:
         if requires_login:
             await self.ensure_login()
-        return await self.async_client.post(api_prefix+path, **kwargs)
+        response = await self.async_client.post(api_prefix+path, **kwargs)
+
+        if response.status_code == 401:
+            await self.ensure_login(relogin=True)
+        return response
 
     async def delete(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response:
         if requires_login:
             await self.ensure_login()
-        return await self.async_client.delete(api_prefix+path, **kwargs)
+
+        response = await self.async_client.delete(api_prefix+path, **kwargs)
+
+        if response.status_code == 401:
+            await self.ensure_login(relogin=True)
+        return response

From f2e41a525da801c785206c82cd3dd4bca4593ac7 Mon Sep 17 00:00:00 2001
From: Niklas Neugebauer <68709968+NiklasNeugebauer@users.noreply.github.com>
Date: Thu, 11 Apr 2024 10:27:37 +0200
Subject: [PATCH 55/62] create executable_path variable to avoid redundancy

---
 learning_loop_node/tests/test_executor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py
index 7a69dca4..1842f71e 100644
--- a/learning_loop_node/tests/test_executor.py
+++ b/learning_loop_node/tests/test_executor.py
@@ -27,9 +27,10 @@ async def test_executor_lifecycle():
 
     executor = Executor('/tmp/test_executor/' + str(uuid4())+'/')
     cmd = 'bash some_executable.sh'
-    with open(executor.path+'some_executable.sh', 'w') as f:
+    executable_path = executor.path+'some_executable.sh'
+    with open(executable_path, 'w') as f:
         f.write('/bin/bash -c "while true; do sleep 1; echo some output; done"')
-    os.chmod(executor.path+'some_executable.sh', 0o755)
+    os.chmod(executable_path, 0o755)
 
     await executor.start(cmd)
 

From 719c664d0f248f6de83629b5106ad738a11839d6 Mon Sep 17 00:00:00 2001
From: Niklas Neugebauer <niklas@zauberzeug.com>
Date: Thu, 11 Apr 2024 11:37:17 +0200
Subject: [PATCH 56/62] Retry http requests if login required and 401 was
 thrown

---
 learning_loop_node/loop_communication.py | 56 +++++++++++++++++++-----
 1 file changed, 44 insertions(+), 12 deletions(-)

diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index 62ecccd9..0642c3c1 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -1,6 +1,6 @@
 import asyncio
 import logging
-from typing import List, Optional
+from typing import Awaitable, Callable, List, Optional
 
 import httpx
 from httpx import Cookies, Timeout
@@ -71,19 +71,40 @@ async def backend_ready(self) -> bool:
                 logging.info(f'backend not ready: {e}')
             await asyncio.sleep(10)
 
+    async def retry_on_401(self, func: Callable[..., Awaitable[httpx.Response]], *args, **kwargs) -> httpx.Response:
+        response = await func(*args, **kwargs)
+        if response.status_code == 401:
+            await self.ensure_login(relogin=True)
+            response = await func(*args, **kwargs)
+        return response
+
     async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/api') -> httpx.Response:
         if requires_login:
             await self.ensure_login()
 
+        # retry on 401 if required
+        if requires_login:
+            return await self.retry_on_401(self._get, path, api_prefix)
+        else:
+            return await self._get(path, api_prefix)
+
+    async def _get(self, path: str, api_prefix: str = '/api') -> httpx.Response:
+
         response = await self.async_client.get(api_prefix+path)
 
-        if response.status_code == 401:
-            await self.ensure_login(relogin=True)
         return response
 
-    async def put(self, path, files: Optional[List[str]] = None, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response:
+    async def put(self, path: str, files: Optional[List[str]] = None, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response:
         if requires_login:
             await self.ensure_login()
+
+        # retry on 401 if required
+        if requires_login:
+            return await self.retry_on_401(self._put, path, api_prefix, **kwargs)
+        else:
+            return await self._put(path, files, api_prefix, **kwargs)
+
+    async def _put(self, path, files: Optional[List[str]] = None, api_prefix='/api', **kwargs) -> httpx.Response:
         if files is None:
             return await self.async_client.put(api_prefix+path, **kwargs)
 
@@ -103,25 +124,36 @@ async def put(self, path, files: Optional[List[str]] = None, requires_login=True
             for fh in file_handles:
                 fh.close()  # Ensure all files are closed
 
-        if response.status_code == 401:
-            await self.ensure_login(relogin=True)
         return response
 
-    async def post(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response:
+    async def post(self, path: str, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response:
         if requires_login:
             await self.ensure_login()
+
+        # retry on 401 if required
+        if requires_login:
+            return await self.retry_on_401(self._post, path, api_prefix, **kwargs)
+        else:
+            return await self._post(path, api_prefix, **kwargs)
+
+    async def _post(self, path, api_prefix='/api', **kwargs) -> httpx.Response:
+
         response = await self.async_client.post(api_prefix+path, **kwargs)
 
-        if response.status_code == 401:
-            await self.ensure_login(relogin=True)
         return response
 
-    async def delete(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response:
+    async def delete(self, path: str, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response:
         if requires_login:
             await self.ensure_login()
 
+        # retry on 401 if required
+        if requires_login:
+            return await self.retry_on_401(self._delete, path, api_prefix, **kwargs)
+        else:
+            return await self._delete(path, api_prefix, **kwargs)
+
+    async def _delete(self, path, api_prefix='/api', **kwargs) -> httpx.Response:
+
         response = await self.async_client.delete(api_prefix+path, **kwargs)
 
-        if response.status_code == 401:
-            await self.ensure_login(relogin=True)
         return response

From 9bde621711004c3b7ad22a5b026165db8feb3aa4 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Thu, 11 Apr 2024 11:45:54 +0200
Subject: [PATCH 57/62] Major refactoring and API changes. Resolve linting
 hints (#14)

* improve code documentation, abstraction layers and api

* fix all mypy and linting issues

* solve all linting errors in mock nodes

* further improvements of documentation and refactoring

* Further refactoring and API improvements

* Further refactoring and API improvements

* Further refactoring and API improvements

* fix tests

* Refactoring

* Refactoring, fix tests

* Minor fixes

* Prevent deadlock when training is stopped before a valid model was created

* make _get_latest_model_files async and don't run it on threadpool

* make sure no old cookies are used

* refactoring

* simplify process executor and use async process api

* Refactor executor

* adapt backdoor controls to async api

* adapt trainer_logic to async executor api

* adapt tests to async executor api

* adapt mock_trainer_logic to new api

* adapt testing_trainer_logic to new api

* fix typing error

* minor refactoring, fix of tests

* simplify state string and do all detections

* fix test test_executor_lifecycle

* fix test test_go_to_cleanup_if_no_detections_exist

* deactivate asyncio warnings

* set request throttle to avoid Error 429

* make inference_batch_size an attribute that can be overwritten by children

* add 'model_size' to ModelInformation. Required when continuing training

Fix tests

* Improve logs. Cleanup tmp-dir after model download.

* Update data_exchanger.py

clarify docstring

* Handle file opening error

* (re)enable todo/fixme warnings in pylint

* create executable_path variable to avoid redundancy

---------

Co-authored-by: Niklas Neugebauer <68709968+NiklasNeugebauer@users.noreply.github.com>
---
 .syncignore                                   |   5 +
 .vscode/settings.json                         |  10 +-
 learning_loop_node/__init__.py                |   4 +-
 .../annotation/annotator_logic.py             |   4 +-
 learning_loop_node/data_classes/__init__.py   |  15 +-
 learning_loop_node/data_classes/detections.py |   9 +-
 learning_loop_node/data_classes/general.py    |   7 +-
 learning_loop_node/data_classes/training.py   |  54 ++-
 learning_loop_node/data_exchanger.py          |  30 +-
 learning_loop_node/detector/__init__.py       |   1 -
 learning_loop_node/detector/detector_node.py  |   7 +-
 .../inbox_filter/cam_observation_history.py   |  11 +-
 learning_loop_node/detector/outbox.py         |   1 -
 learning_loop_node/detector/tests/conftest.py |   1 -
 .../tests/test_client_communication.py        |   4 +-
 .../detector/tests/test_outbox.py             |   2 +
 learning_loop_node/globals.py                 |   4 +-
 .../helpers/gdrive_downloader.py              |   2 +-
 learning_loop_node/helpers/misc.py            |  31 +-
 learning_loop_node/loop_communication.py      |  31 +-
 learning_loop_node/node.py                    |   4 +-
 learning_loop_node/py.typed                   |   0
 learning_loop_node/tests/test_executor.py     |  23 +-
 learning_loop_node/tests/test_helper.py       |   1 -
 learning_loop_node/trainer/executor.py        | 170 +++----
 learning_loop_node/trainer/io_helpers.py      |  23 +-
 .../trainer/rest/backdoor_controls.py         |   7 +-
 learning_loop_node/trainer/rest/controls.py   |   2 +
 learning_loop_node/trainer/tests/conftest.py  |  47 +-
 .../trainer/tests/state_helper.py             |   2 +-
 .../tests/states/test_state_cleanup.py        |   8 +-
 .../tests/states/test_state_detecting.py      |  45 +-
 .../states/test_state_download_train_model.py |  44 +-
 .../tests/states/test_state_prepare.py        |  29 +-
 .../test_state_sync_confusion_matrix.py       |  40 +-
 .../trainer/tests/states/test_state_train.py  |  55 +--
 .../states/test_state_upload_detections.py    |  52 +-
 .../tests/states/test_state_upload_model.py   |  35 +-
 .../trainer/tests/test_errors.py              |  25 +-
 .../trainer/tests/testing_trainer_logic.py    |  46 +-
 learning_loop_node/trainer/trainer_logic.py   | 145 +++---
 .../trainer/trainer_logic_abstraction.py      | 146 ------
 .../trainer/trainer_logic_generic.py          | 456 ++++++++++++------
 learning_loop_node/trainer/trainer_node.py    |   6 +-
 .../trainer/training_syncronizer.py           |  53 --
 mock_detector/app_code/tests/test_detector.py |   2 +
 mock_trainer/app_code/mock_trainer_logic.py   |  69 ++-
 mock_trainer/app_code/progress_simulator.py   |  10 +-
 mock_trainer/app_code/tests/conftest.py       |   3 +-
 .../app_code/tests/test_detections.py         |  14 +-
 .../app_code/tests/test_mock_trainer.py       |  13 +-
 51 files changed, 922 insertions(+), 886 deletions(-)
 create mode 100644 .syncignore
 create mode 100644 learning_loop_node/py.typed
 delete mode 100644 learning_loop_node/trainer/trainer_logic_abstraction.py
 delete mode 100644 learning_loop_node/trainer/training_syncronizer.py

diff --git a/.syncignore b/.syncignore
new file mode 100644
index 00000000..7bf361fc
--- /dev/null
+++ b/.syncignore
@@ -0,0 +1,5 @@
+.git/
+__pycache__/
+.DS_Store
+*.tmp
+.env
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 45eb6e46..aec19884 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -12,7 +12,15 @@
     "--disable=W0718", // Catching too general exception
     "--disable=W0719", // Raising too general exception
     "--disable=W1203", // Use % formatting in logging functions and pass the % parameters as arguments
-    "--disable=W1514" // Using open without explicitly specifying an encoding
+    "--disable=W1514", // Using open without explicitly specifying an encoding
+    "--disable=R0902", // Too many instance attributes
+    "--disable=R0903", // Too few public methods
+    "--disable=R0912", // Too many branches
+    "--disable=R0913", // Too many arguments
+    "--disable=R0914", // Too many local variables
+    "--disable=R0915", // Too many statements
+    "--disable=R1732", // Consider using with for resource-allocating operations
+    "--disable=R0801" // Similar lines in 2 files
   ],
   "[python]": {
     "editor.defaultFormatter": "ms-python.autopep8",
diff --git a/learning_loop_node/__init__.py b/learning_loop_node/__init__.py
index 5f4433bc..2fa5362e 100644
--- a/learning_loop_node/__init__.py
+++ b/learning_loop_node/__init__.py
@@ -1,6 +1,4 @@
 import logging
-import os
-import sys
 
 # from . import log_conf
 from .detector.detector_logic import DetectorLogic
@@ -8,4 +6,6 @@
 from .globals import GLOBALS
 from .trainer.trainer_node import TrainerNode
 
+__all__ = ['TrainerNode', 'DetectorNode', 'DetectorLogic', 'GLOBALS']
+
 logging.info('>>>>>>>>>>>>>>>>>> LOOP INITIALIZED <<<<<<<<<<<<<<<<<<<<<<<')
diff --git a/learning_loop_node/annotation/annotator_logic.py b/learning_loop_node/annotation/annotator_logic.py
index 932abce9..a80cc13b 100644
--- a/learning_loop_node/annotation/annotator_logic.py
+++ b/learning_loop_node/annotation/annotator_logic.py
@@ -7,10 +7,10 @@
 
 class AnnotatorLogic():
 
-    def __init__(self):
+    def __init__(self) -> None:
         self._node: Optional[Node] = None
 
-    def init(self, node: Node):
+    def init(self, node: Node) -> None:
         self._node = node
 
     @abstractmethod
diff --git a/learning_loop_node/data_classes/__init__.py b/learning_loop_node/data_classes/__init__.py
index 0e0a10e9..524cb8bb 100644
--- a/learning_loop_node/data_classes/__init__.py
+++ b/learning_loop_node/data_classes/__init__.py
@@ -4,5 +4,16 @@
 from .general import (AnnotationNodeStatus, Category, CategoryType, Context, DetectionStatus, ErrorConfiguration,
                       ModelInformation, NodeState, NodeStatus)
 from .socket_response import SocketResponse
-from .training import (BasicModel, Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData,
-                       TrainingError, TrainingOut, TrainingStatus)
+from .training import (Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData,
+                       TrainingError, TrainingOut, TrainingStateData, TrainingStatus)
+
+__all__ = [
+    'AnnotationData', 'AnnotationEventType', 'SegmentationAnnotation', 'ToolOutput', 'UserInput',
+    'BoxDetection', 'ClassificationDetection', 'Detections', 'Observation', 'Point', 'PointDetection',
+    'SegmentationDetection', 'Shape',
+    'AnnotationNodeStatus', 'Category', 'CategoryType', 'Context', 'DetectionStatus', 'ErrorConfiguration',
+    'ModelInformation', 'NodeState', 'NodeStatus',
+    'SocketResponse',
+    'Errors', 'Hyperparameter', 'Model', 'PretrainedModel', 'TrainerState', 'Training', 'TrainingData',
+    'TrainingError', 'TrainingOut', 'TrainingStateData', 'TrainingStatus',
+]
diff --git a/learning_loop_node/data_classes/detections.py b/learning_loop_node/data_classes/detections.py
index 21924720..0872b256 100644
--- a/learning_loop_node/data_classes/detections.py
+++ b/learning_loop_node/data_classes/detections.py
@@ -13,8 +13,11 @@
 
 @dataclass(**KWONLY_SLOTS)
 class BoxDetection():
+    """Coordinates according to COCO format. x,y is the top left corner of the box.
+    x increases to the right, y increases downwards.
+    """
     category_name: str
-    x: int  # TODO add definition of x,y,w,h
+    x: int
     y: int
     width: int
     height: int
@@ -47,6 +50,8 @@ def __str__(self):
 
 @dataclass(**KWONLY_SLOTS)
 class PointDetection():
+    """Coordinates according to COCO format. x,y is the center of the point.
+    x increases to the right, y increases downwards."""
     category_name: str
     x: float
     y: float
@@ -111,7 +116,7 @@ class Detections():
     point_detections: List[PointDetection] = field(default_factory=list)
     segmentation_detections: List[SegmentationDetection] = field(default_factory=list)
     classification_detections: List[ClassificationDetection] = field(default_factory=list)
-    tags: Optional[List[str]] = field(default_factory=list)
+    tags: List[str] = field(default_factory=list)
     date: Optional[str] = field(default_factory=current_datetime)
     image_id: Optional[str] = None  # used for detection of trainers
 
diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py
index 9d5c893e..3ef5e412 100644
--- a/learning_loop_node/data_classes/general.py
+++ b/learning_loop_node/data_classes/general.py
@@ -34,10 +34,6 @@ def from_list(values: List[dict]) -> List['Category']:
         return [from_dict(data_class=Category, data=value) for value in values]
 
 
-def create_category(identifier: str, name: str, ctype: Union[CategoryType, str]):  # TODO: This is probably unused
-    return Category(id=identifier, name=name, description='', hotkey='', color='', type=ctype, point_size=None)
-
-
 @dataclass(**KWONLY_SLOTS)
 class Context():
     organization: str
@@ -57,6 +53,7 @@ class ModelInformation():
     categories: List[Category]
     resolution: Optional[int] = None
     model_root_path: Optional[str] = None
+    model_size: Optional[str] = None
 
     @property
     def context(self):
@@ -64,6 +61,8 @@ def context(self):
 
     @staticmethod
     def load_from_disk(model_root_path: str) -> Optional['ModelInformation']:
+        """Load model.json from model_root_path and return ModelInformation object.
+        """
         model_info_file_path = f'{model_root_path}/model.json'
         if not os.path.exists(model_info_file_path):
             logging.warning(f"could not find model information file '{model_info_file_path}'")
diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py
index 449cc85b..d530ae7a 100644
--- a/learning_loop_node/data_classes/training.py
+++ b/learning_loop_node/data_classes/training.py
@@ -3,6 +3,7 @@
 import time
 from dataclasses import dataclass, field
 from enum import Enum
+from pathlib import Path
 from typing import Dict, List, Optional
 
 # pylint: disable=no-name-in-module
@@ -17,6 +18,14 @@ class Hyperparameter():
     flip_rl: bool
     flip_ud: bool
 
+    @staticmethod
+    def from_data(data: Dict):
+        return Hyperparameter(
+            resolution=data['resolution'],
+            flip_rl=data.get('flip_rl', False),
+            flip_ud=data.get('flip_ud', False)
+        )
+
 
 @dataclass(**KWONLY_SLOTS)
 class TrainingData():
@@ -64,7 +73,7 @@ class TrainerState(str, Enum):
 
 @dataclass(**KWONLY_SLOTS)
 class TrainingStatus():
-    id: str  # TODO this must not be changed, but tests wont detect it -> update tests!
+    id: str  # NOTE this must not be changed, but tests won't detect a change -> update tests!
     name: str
     state: Optional[str]
     errors: Optional[Dict]
@@ -79,13 +88,13 @@ class TrainingStatus():
     architecture: Optional[str] = None
     context: Optional[Context] = None
 
-    def short_str(self):
+    def short_str(self) -> str:
         prgr = f'{self.progress * 100:.0f}%' if self.progress else ''
         trtesk = f'{self.train_image_count}/{self.test_image_count}/{self.skipped_image_count}' if self.train_image_count else 'n.a.'
         cntxt = f'{self.context.organization}/{self.context.project}' if self.context else ''
         hyps = f'({self.hyperparameters})' if self.hyperparameters else ''
         arch = f'.{self.architecture} - ' if self.architecture else ''
-        return f'[{str(self.state)} {prgr}. {self.name}({self.id}). Tr/Ts/Tsk: {trtesk} {cntxt}{arch}{hyps}]'
+        return f'[{str(self.state).rsplit(".", maxsplit=1)[-1]} {prgr}. {self.name}({self.id}). Tr/Ts/Tsk: {trtesk} {cntxt}{arch}{hyps}]'
 
 
 @dataclass(**KWONLY_SLOTS)
@@ -93,22 +102,35 @@ class Training():
     id: str
     context: Context
 
-    project_folder: str
-    images_folder: str
-    training_folder: str
+    project_folder: str  # f'{GLOBALS.data_folder}/{context.organization}/{context.project}'
+    images_folder: str  # f'{project_folder}/images'
+    training_folder: str  # f'{project_folder}/trainings/{trainings_id}'
     start_time: float = field(default_factory=time.time)
 
-    base_model_id: Optional[str] = None
+    # model uuid to download (to continue training) | is not a uuid when training from scratch (blank or pt-name from provided_pretrained_models->name)
+    base_model_uuid_or_name: Optional[str] = None
+
     data: Optional[TrainingData] = None
     training_number: Optional[int] = None
     training_state: Optional[str] = None
-    model_id_for_detecting: Optional[str] = None
+    model_uuid_for_detecting: Optional[str] = None
     hyperparameters: Optional[Dict] = None
 
+    @property
+    def training_folder_path(self) -> Path:
+        return Path(self.training_folder)
+
+    def set_values_from_data(self, data: Dict) -> None:
+        self.data = TrainingData(categories=Category.from_list(data['categories']))
+        self.data.hyperparameter = Hyperparameter.from_data(data=data)
+        self.training_number = data['training_number']
+        self.base_model_uuid_or_name = data['id']
+        self.training_state = TrainerState.Initialized
+
 
 @dataclass(**KWONLY_SLOTS)
 class TrainingOut():
-    confusion_matrix: Optional[Dict] = None
+    confusion_matrix: Optional[Dict] = None  # This is actually just class-wise metrics
     train_image_count: Optional[int] = None
     test_image_count: Optional[int] = None
     trainer_id: Optional[str] = None
@@ -116,9 +138,9 @@ class TrainingOut():
 
 
 @dataclass(**KWONLY_SLOTS)
-class BasicModel():
-    confusion_matrix: Optional[Dict] = None
-    meta_information: Optional[Dict] = None
+class TrainingStateData():
+    confusion_matrix: Dict = field(default_factory=dict)
+    meta_information: Dict = field(default_factory=dict)
 
 
 @dataclass(**KWONLY_SLOTS)
@@ -133,8 +155,8 @@ class Model():
 
 
 class Errors():
-    def __init__(self):
-        self._errors: Dict = {}
+    def __init__(self) -> None:
+        self._errors: Dict[str, str] = {}
 
     def set(self, key: str, value: str):
         self._errors[key] = value
@@ -143,7 +165,7 @@ def set(self, key: str, value: str):
     def errors(self) -> Dict:
         return self._errors
 
-    def reset(self, key: str):
+    def reset(self, key: str) -> None:
         try:
             del self._errors[key]
         except AttributeError:
@@ -151,7 +173,7 @@ def reset(self, key: str):
         except KeyError:
             pass
 
-    def reset_all(self):
+    def reset_all(self) -> None:
         self._errors = {}
 
     def has_error_for(self, key: str) -> bool:
diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py
index ab53b243..9e8ffdb8 100644
--- a/learning_loop_node/data_exchanger.py
+++ b/learning_loop_node/data_exchanger.py
@@ -6,6 +6,7 @@
 from glob import glob
 from http import HTTPStatus
 from io import BytesIO
+from time import time
 from typing import Dict, List, Optional
 
 import aiofiles  # type: ignore
@@ -108,13 +109,15 @@ async def download_images(self, image_uuids: List[str], image_folder: str, chunk
             chunk_ids = image_uuids[i:i+chunk_size]
             tasks = []
             for j, chunk_j in enumerate(chunk_paths):
+                start = time()
                 tasks.append(create_task(self._download_one_image(chunk_j, chunk_ids[j], image_folder)))
+                await asyncio.sleep(max(0, 0.02 - (time() - start)))  # prevent too many requests at once
             await asyncio.gather(*tasks)
 
     async def _download_one_image(self, path: str, image_id: str, image_folder: str) -> None:
         response = await self.loop_communicator.get(path)
         if response.status_code != HTTPStatus.OK:
-            logging.error(f'bad status code {response.status_code} for {path}')
+            logging.error(f'bad status code {response.status_code} for {path}. Details: {response.text}')
             return
         filename = f'{image_folder}/{image_id}.jpg'
         async with aiofiles.open(filename, 'wb') as f:
@@ -122,11 +125,13 @@ async def _download_one_image(self, path: str, image_id: str, image_folder: str)
         if not await is_valid_image(filename, self.check_jpeg):
             os.remove(filename)
 
-    async def download_model(self, target_folder: str, context: Context, model_id: str, model_format: str) -> List[str]:
-        """Downloads a model and returns the paths of the downloaded files."""
-        logging.info(f'Downloading model {model_id} to {target_folder}..')
+    async def download_model(self, target_folder: str, context: Context, model_uuid: str, model_format: str) -> List[str]:
+        """Downloads a model (and additional meta data like model.json) and returns the paths of the downloaded files.
+        Used before training a model (when continuing a finished training) or before detecting images.
+        """
+        logging.info(f'Downloading model data for uuid {model_uuid} from the loop to {target_folder}..')
 
-        path = f'/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
+        path = f'/{context.organization}/projects/{context.project}/models/{model_uuid}/{model_format}/file'
         response = await self.loop_communicator.get(path, requires_login=False)
         if response.status_code != 200:
             content = response.json()
@@ -150,19 +155,18 @@ async def download_model(self, target_folder: str, context: Context, model_id: s
             new_file = shutil.move(file, target_folder)
             created_files.append(new_file)
 
-        logging.info(f'---- downloaded model {model_id}/{model_format} to {tmp_path}. Moved to {target_folder}.')
+        shutil.rmtree(tmp_path, ignore_errors=True)
+        logging.info(f'Downloaded model {model_uuid}({model_format}) to {target_folder}.')
         return created_files
 
     async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]:
         """Used by the trainers. Function returns the new model uuid to use for detection."""
         response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/trainings/{training_number}/models/latest/{mformat}/file', files=files)
         if response.status_code != 200:
-            logging.error(
-                f'---- could not upload model for training {training_number} and format {mformat}. Details: {response.text}')
+            logging.error(f'Could not upload model for training {training_number}, format {mformat}: {response.text}')
             response.raise_for_status()
             return None
-        else:
-            uploaded_model = response.json()
-            logging.info(
-                f'---- uploaded model for training {training_number} and format {mformat}. Model id is {uploaded_model}')
-            return uploaded_model['id']
+
+        uploaded_model = response.json()
+        logging.info(f'Uploaded model for training {training_number}, format {mformat}. Response is: {uploaded_model}')
+        return uploaded_model['id']
diff --git a/learning_loop_node/detector/__init__.py b/learning_loop_node/detector/__init__.py
index 8b137891..e69de29b 100644
--- a/learning_loop_node/detector/__init__.py
+++ b/learning_loop_node/detector/__init__.py
@@ -1 +0,0 @@
-
diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py
index 18b8ab6c..92b5fa21 100644
--- a/learning_loop_node/detector/detector_node.py
+++ b/learning_loop_node/detector/detector_node.py
@@ -186,7 +186,9 @@ async def _check_for_update(self) -> None:
             if not update_to_model_id:
                 self.log.info('could not check for updates')
                 return
-            if self.detector_logic.is_initialized:  # TODO: solve race condition !!!
+
+            # TODO: solve race condition (it should not be required to recheck if model_info is not None, but it is!)
+            if self.detector_logic.is_initialized:
                 model_info = self.detector_logic._model_info  # pylint: disable=protected-access
                 if model_info is not None:
                     self.log.info(f'Current model: {model_info.version} with id {model_info.id}')
@@ -221,8 +223,7 @@ async def _check_for_update(self) -> None:
                     await self.data_exchanger.download_model(target_model_folder,
                                                              Context(organization=self.organization,
                                                                      project=self.project),
-                                                             update_to_model_id,
-                                                             self.detector_logic.model_format)
+                                                             update_to_model_id, self.detector_logic.model_format)
                     try:
                         os.unlink(model_symlink)
                         os.remove(model_symlink)
diff --git a/learning_loop_node/detector/inbox_filter/cam_observation_history.py b/learning_loop_node/detector/inbox_filter/cam_observation_history.py
index 88bbe881..a87c72ee 100644
--- a/learning_loop_node/detector/inbox_filter/cam_observation_history.py
+++ b/learning_loop_node/detector/inbox_filter/cam_observation_history.py
@@ -1,20 +1,17 @@
 import os
 from typing import List, Union
 
-from learning_loop_node.data_classes import (BoxDetection,
-                                             ClassificationDetection,
-                                             Detections, Observation,
-                                             PointDetection,
-                                             SegmentationDetection)
+from learning_loop_node.data_classes import (BoxDetection, ClassificationDetection, Detections, Observation,
+                                             PointDetection, SegmentationDetection)
 
 
 class CamObservationHistory:
-    def __init__(self):
+    def __init__(self) -> None:
         self.reset_time = 3600
         self.recent_observations: List[Observation] = []
         self.iou_threshold = 0.5
 
-    def forget_old_detections(self):
+    def forget_old_detections(self) -> None:
         self.recent_observations = [detection
                                     for detection in self.recent_observations
                                     if not detection.is_older_than(self.reset_time)]
diff --git a/learning_loop_node/detector/outbox.py b/learning_loop_node/detector/outbox.py
index 23138c85..ca1a200d 100644
--- a/learning_loop_node/detector/outbox.py
+++ b/learning_loop_node/detector/outbox.py
@@ -53,7 +53,6 @@ def save(self, image: bytes, detections: Optional[Detections] = None, tags: Opti
         with open(tmp + '/image.json', 'w') as f:
             json.dump(jsonable_encoder(asdict(detections)), f)
 
-        # TODO sometimes No such file or directory: '/tmp/learning_loop_lib_data/tmp/2023-09-07_13:27:38.399/image.jpg'
         with open(tmp + '/image.jpg', 'wb') as f:
             f.write(image)
 
diff --git a/learning_loop_node/detector/tests/conftest.py b/learning_loop_node/detector/tests/conftest.py
index ad183fe2..1611f265 100644
--- a/learning_loop_node/detector/tests/conftest.py
+++ b/learning_loop_node/detector/tests/conftest.py
@@ -12,7 +12,6 @@
 import uvicorn
 
 from learning_loop_node import DetectorNode
-from learning_loop_node.data_classes.general import Category, ModelInformation
 from learning_loop_node.detector.outbox import Outbox
 from learning_loop_node.globals import GLOBALS
 
diff --git a/learning_loop_node/detector/tests/test_client_communication.py b/learning_loop_node/detector/tests/test_client_communication.py
index 97e3f074..24fbd095 100644
--- a/learning_loop_node/detector/tests/test_client_communication.py
+++ b/learning_loop_node/detector/tests/test_client_communication.py
@@ -2,7 +2,7 @@
 import json
 
 import pytest
-import requests  # type: ignore
+import requests
 
 from learning_loop_node import DetectorNode
 from learning_loop_node.data_classes import ModelInformation
@@ -101,4 +101,4 @@ async def test_about_endpoint(test_detector_node: DetectorNode):
     assert response_dict['operation_mode'] == 'idle'
     assert response_dict['state'] == 'online'
     assert response_dict['target_model'] == '1.1'
-    assert any([c.name == 'purple point' for c in model_information.categories])
+    assert any(c.name == 'purple point' for c in model_information.categories)
diff --git a/learning_loop_node/detector/tests/test_outbox.py b/learning_loop_node/detector/tests/test_outbox.py
index 9db7dd09..adf56744 100644
--- a/learning_loop_node/detector/tests/test_outbox.py
+++ b/learning_loop_node/detector/tests/test_outbox.py
@@ -9,6 +9,8 @@
 from learning_loop_node.detector.detector_node import DetectorNode
 from learning_loop_node.detector.outbox import Outbox
 
+# pylint: disable=redefined-outer-name
+
 
 @pytest.fixture()
 def test_outbox():
diff --git a/learning_loop_node/globals.py b/learning_loop_node/globals.py
index eee9511a..336df3fa 100644
--- a/learning_loop_node/globals.py
+++ b/learning_loop_node/globals.py
@@ -1,8 +1,8 @@
 
 class Globals():
-    def __init__(self):
+    def __init__(self) -> None:
         self.data_folder: str = '/data'
-        self.detector_port: int = 5004  # TODO move to tests
+        self.detector_port: int = 5004  # NOTE used for tests
 
 
 GLOBALS = Globals()
diff --git a/learning_loop_node/helpers/gdrive_downloader.py b/learning_loop_node/helpers/gdrive_downloader.py
index 8e5b3120..deefed68 100755
--- a/learning_loop_node/helpers/gdrive_downloader.py
+++ b/learning_loop_node/helpers/gdrive_downloader.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import requests
+import requests  # type: ignore
 
 # https://stackoverflow.com/a/39225272/4082686
 
diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py
index 1f2e297d..aea20e60 100644
--- a/learning_loop_node/helpers/misc.py
+++ b/learning_loop_node/helpers/misc.py
@@ -55,7 +55,7 @@ def _handle_task_result(task: asyncio.Task,
         logger.exception(message, *message_args)
 
 
-def get_free_memory_mb() -> float:  # TODO check if this is used
+def get_free_memory_mb() -> float:  # NOTE used by yolov5
     pynvml.nvmlInit()
     h = pynvml.nvmlDeviceGetHandleByIndex(0)
     info = pynvml.nvmlDeviceGetMemoryInfo(h)
@@ -76,7 +76,6 @@ async def is_valid_image(filename: str, check_jpeg: bool) -> bool:
     return "OK" in out.decode()
 
 
-@staticmethod
 async def delete_corrupt_images(image_folder: str, check_jpeg: bool = False) -> None:
     logging.info('deleting corrupt images')
     n_deleted = 0
@@ -90,15 +89,7 @@ async def delete_corrupt_images(image_folder: str, check_jpeg: bool = False) ->
 
 
 def create_resource_paths(organization_name: str, project_name: str, image_ids: List[str]) -> Tuple[List[str], List[str]]:
-    # TODO: experimental:
     return [f'/{organization_name}/projects/{project_name}/images/{id}/main' for id in image_ids], image_ids
-    # if not image_ids:
-    #     return [], []
-    # url_ids: List[Tuple(str, str)] = [(f'/{organization_name}/projects/{project_name}/images/{id}/main', id)
-    #                                   for id in image_ids]
-    # urls, ids = list(map(list, zip(*url_ids)))
-
-    # return urls, ids
 
 
 def create_image_folder(project_folder: str) -> str:
@@ -141,17 +132,17 @@ async def wrapper_ensure_socket_response(*args, **kwargs):
 
             if isinstance(value, str):
                 return asdict(SocketResponse.for_success(value))
-            elif isinstance(value, bool):
+            if isinstance(value, bool):
                 return asdict(SocketResponse.from_bool(value))
-            elif isinstance(value, SocketResponse):
+            if isinstance(value, SocketResponse):
                 return value
-            elif (args[0] in ['connect', 'disconnect', 'connect_error']):
+            if (args[0] in ['connect', 'disconnect', 'connect_error']):
                 return value
-            elif value is None:
+            if value is None:
                 return None
-            else:
-                raise Exception(
-                    f"Return type for sio must be str, bool, SocketResponse or None', but was {type(value)}'")
+
+            raise Exception(
+                f"Return type for sio must be str, bool, SocketResponse or None', but was {type(value)}'")
         except Exception as e:
             logging.exception(f'An error occured for {args[0]}')
 
@@ -161,6 +152,8 @@ async def wrapper_ensure_socket_response(*args, **kwargs):
 
 
 def is_valid_uuid4(val):
+    if not val:
+        return False
     try:
         _ = UUID(str(val)).version
         return True
@@ -189,7 +182,6 @@ def activate_asyncio_warnings() -> None:
         logging.exception('could not activate asyncio warnings. Exception:')
 
 
-@staticmethod
 def images_for_ids(image_ids, image_folder) -> List[str]:
     logging.info(f'### Going to get images for {len(image_ids)} images ids')
     start = perf_counter()
@@ -200,7 +192,6 @@ def images_for_ids(image_ids, image_folder) -> List[str]:
     return images
 
 
-@staticmethod
 def generate_training(project_folder: str, context: Context) -> Training:
     training_uuid = str(uuid4())
     return Training(
@@ -212,7 +203,6 @@ def generate_training(project_folder: str, context: Context) -> Training:
     )
 
 
-@staticmethod
 def delete_all_training_folders(project_folder: str):
     if not os.path.exists(f'{project_folder}/trainings'):
         return
@@ -220,7 +210,6 @@ def delete_all_training_folders(project_folder: str):
         shutil.rmtree(f'{project_folder}/trainings/{uuid}', ignore_errors=True)
 
 
-@staticmethod
 def create_training_folder(project_folder: str, trainings_id: str) -> str:
     training_folder = f'{project_folder}/trainings/{trainings_id}'
     os.makedirs(training_folder, exist_ok=True)
diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index 75c57189..901532fd 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -24,6 +24,7 @@ def __init__(self) -> None:
         self.project: str = environment_reader.project()  # used by mock_detector
         self.base_url: str = f'http{"s" if "learning-loop.ai" in host else ""}://' + host
         self.async_client: httpx.AsyncClient = httpx.AsyncClient(base_url=self.base_url, timeout=Timeout(60.0))
+        self.async_client.cookies.clear()
 
         logging.info(f'Loop interface initialized with base_url: {self.base_url} / user: {self.username}')
 
@@ -80,8 +81,23 @@ async def put(self, path, files: Optional[List[str]] = None, requires_login=True
         if files is None:
             return await self.async_client.put(api_prefix+path, **kwargs)
 
-        file_list = [('files', open(f, 'rb')) for f in files]  # TODO: does this properly close the files after upload?
-        return await self.async_client.put(api_prefix+path, files=file_list)
+        file_handles = []
+        for f in files:
+            try:
+                file_handles.append(open(f, 'rb'))
+            except FileNotFoundError:
+                for fh in file_handles:
+                    fh.close()  # Ensure all files are closed
+                return httpx.Response(404, content=b'File not found')
+
+        try:
+            file_list = [('files', fh) for fh in file_handles]  # Use file handles
+            response = await self.async_client.put(api_prefix+path, files=file_list)
+        finally:
+            for fh in file_handles:
+                fh.close()  # Ensure all files are closed
+
+        return response
 
     async def post(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response:
         if requires_login:
@@ -92,14 +108,3 @@ async def delete(self, path, requires_login=True, api_prefix='/api', **kwargs) -
         if requires_login:
             await self.ensure_login()
         return await self.async_client.delete(api_prefix+path, **kwargs)
-
-    # --------------------------------- unused?! --------------------------------- #TODO remove?
-
-    # def get_data(self, path):
-    #     return asyncio.get_event_loop().run_until_complete(self._get_data_async(path))
-
-    # async def _get_data_async(self, path) -> bytes:
-    #     response = await self.get(f'{self.project_path}{path}')
-    #     if response.status_code != 200:
-    #         raise LoopCommunicationException('bad response: ' + str(response))
-    #     return response.content
diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py
index 38742fa4..9418123e 100644
--- a/learning_loop_node/node.py
+++ b/learning_loop_node/node.py
@@ -62,7 +62,7 @@ def sio_client(self) -> AsyncClient:
 
     # --------------------------------------------------- APPLICATION LIFECYCLE ---------------------------------------------------
     @asynccontextmanager
-    async def lifespan(self, app: FastAPI):
+    async def lifespan(self, app: FastAPI):  # pylint: disable=unused-argument
         try:
             await self._on_startup()
             self.repeat_task = asyncio.create_task(self.repeat_loop())
@@ -78,7 +78,7 @@ async def lifespan(self, app: FastAPI):
 
     async def _on_startup(self):
         self.log.info('received "startup" lifecycle-event')
-        activate_asyncio_warnings()
+        # activate_asyncio_warnings()
         if self.needs_login:
             await self.loop_communicator.backend_ready()
             self.log.info('ensuring login')
diff --git a/learning_loop_node/py.typed b/learning_loop_node/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py
index b661c818..1842f71e 100644
--- a/learning_loop_node/tests/test_executor.py
+++ b/learning_loop_node/tests/test_executor.py
@@ -21,26 +21,28 @@ def cleanup():
     cleanup_process.communicate()
 
 
-def test_executor_lifecycle():
+@pytest.mark.asyncio
+async def test_executor_lifecycle():
     assert_process_is_running('some_executable.sh', False)
 
-    executor = Executor('/tmp/test_executor/' + str(uuid4()))
-    cmd = executor.path + '/some_executable.sh'
-    with open(cmd, 'w') as f:
-        f.write('while true; do echo "some output"; sleep 1; done')
-    os.chmod(cmd, 0o755)
+    executor = Executor('/tmp/test_executor/' + str(uuid4())+'/')
+    cmd = 'bash some_executable.sh'
+    executable_path = executor.path+'some_executable.sh'
+    with open(executable_path, 'w') as f:
+        f.write('/bin/bash -c "while true; do sleep 1; echo some output; done"')
+    os.chmod(executable_path, 0o755)
 
-    executor.start(cmd)
+    await executor.start(cmd)
 
-    assert executor.is_process_running()
+    assert executor.is_running()
     assert_process_is_running('some_executable.sh')
 
     sleep(1)
     assert 'some output' in executor.get_log()
 
-    executor.stop()
+    await executor.stop_and_wait()
 
-    assert not executor.is_process_running()
+    assert not executor.is_running()
     sleep(1)
     assert_process_is_running('some_executable.sh', False)
 
@@ -48,6 +50,7 @@ def test_executor_lifecycle():
 def assert_process_is_running(process_name, running=True):
     if running:
         for process in psutil.process_iter():
+            print(process.name(), process.cmdline())
             process_name_match = process_name in process.name()
             process_cmd_match = process_name in str(process.cmdline())
             if process_name_match or process_cmd_match:
diff --git a/learning_loop_node/tests/test_helper.py b/learning_loop_node/tests/test_helper.py
index e802c7a0..c52037ed 100644
--- a/learning_loop_node/tests/test_helper.py
+++ b/learning_loop_node/tests/test_helper.py
@@ -9,7 +9,6 @@
 from learning_loop_node.data_classes import Context
 from learning_loop_node.helpers.misc import create_image_folder, create_project_folder, create_training_folder
 from learning_loop_node.loop_communication import LoopCommunicator
-from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
 
 def get_files_in_folder(folder: str):
diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py
index c768332c..082407ad 100644
--- a/learning_loop_node/trainer/executor.py
+++ b/learning_loop_node/trainer/executor.py
@@ -1,105 +1,109 @@
-
-import ctypes
+import asyncio
 import logging
 import os
-import signal
-import subprocess
-from sys import platform
+import shlex
+from io import BufferedWriter
 from typing import List, Optional
 
-import psutil
 
+class Executor:
+    def __init__(self, base_path: str, log_name='last_training.log') -> None:
+        """An executor that runs a command in a separate async subprocess.
+        The log of the process is written to `log_name` ('last_training.log' by default) in the base_path.
+        The process is executed in the base_path directory.
+        The process should be awaited using `wait` or stopped using `stop_and_wait` to
+        avoid zombie processes and close the log file."""
 
-def create_signal_handler(sig=signal.SIGTERM):
-    if platform == "linux" or platform == "linux2":
-        # "The system will send a signal to the child once the parent exits for any reason (even sigkill)."
-        # https://stackoverflow.com/a/19448096
-        libc = ctypes.CDLL("libc.so.6")
+        self.path = base_path
+        self.log_file_path = f'{self.path}/{log_name}'
+        self.log_file: None | BufferedWriter = None
+        self._process: Optional[asyncio.subprocess.Process] = None  # pylint: disable=no-member
+        os.makedirs(self.path, exist_ok=True)
 
-        def callable_():
-            os.setsid()
-            return libc.prctl(1, sig)
+    def _get_running_process(self) -> Optional[asyncio.subprocess.Process]:  # pylint: disable=no-member
+        """Get the running process if available."""
+        if self._process is not None and self._process.returncode is None:
+            return self._process
+        return None
 
-        return callable_
-    return os.setsid
+    async def start(self, cmd: str, env: Optional[dict[str, str]] = None) -> None:
+        """Start the process with the given command and environment variables."""
 
+        full_env = os.environ.copy()
+        if env is not None:
+            full_env.update(env)
 
-class Executor:
-    def __init__(self, base_path: str) -> None:
-        self.path = base_path
-        os.makedirs(self.path, exist_ok=True)
-        self.process: Optional[subprocess.Popen[bytes]] = None
-
-    def start(self, cmd: str):
-        with open(f'{self.path}/last_training.log', 'a') as f:
-            f.write(f'\nStarting executor with command: {cmd}\n')
-        # pylint: disable=subprocess-popen-preexec-fn
-        self.process = subprocess.Popen(
-            f'cd {self.path}; {cmd} >> last_training.log 2>&1',
-            shell=True,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            executable='/bin/bash',
-            preexec_fn=create_signal_handler(),
-        )
+        logging.info(f'Starting executor with command: {cmd} in {self.path} - logging to {self.log_file_path}')
+        self.log_file = open(self.log_file_path, 'ab')
 
-    def is_process_running(self):
-        if self.process is None:
-            return False
+        self._process = await asyncio.create_subprocess_exec(
+            *shlex.split(cmd),
+            cwd=self.path,
+            stdout=self.log_file,
+            stderr=asyncio.subprocess.STDOUT,  # Merge stderr with stdout
+            env=full_env
+        )
 
-        if self.process.poll() is not None:
-            return False
+    def is_running(self) -> bool:
+        """Check if the process is still running."""
+        return self._process is not None and self._process.returncode is None
 
-        try:
-            psutil.Process(self.process.pid)
-        except psutil.NoSuchProcess:
-            # self.process.terminate() # TODO does this make sense?
-            # self.process = None
-            return False
+    def terminate(self) -> None:
+        """Terminate the process."""
 
-        return True
+        if process := self._get_running_process():
+            try:
+                process.terminate()
+                return
+            except ProcessLookupError:
+                logging.error('No process to terminate')
+        self._process = None
 
-    def get_log(self) -> str:
-        try:
-            with open(f'{self.path}/last_training.log') as f:
-                return f.read()
-        except Exception:
-            return ''
+    async def wait(self) -> Optional[int]:
+        """Wait for the process to finish. Returns the return code of the process or None if no process is running."""
 
-    def get_log_by_lines(self, since_last_start=False) -> List[str]:  # TODO do not read whole log again
-        try:
-            with open(f'{self.path}/last_training.log') as f:
-                lines = f.readlines()
-            if since_last_start:
-                lines_since_last_start = []
-                for line in reversed(lines):
-                    lines_since_last_start.append(line)
-                    if line.startswith('Starting executor with command:'):
-                        break
-                return list(reversed(lines_since_last_start))
-            return lines
-        except Exception:
-            return []
+        if not self._process:
+            logging.info('No process to wait for')
+            return None
 
-    def stop(self):
-        if self.process is None:
-            logging.info('no process running ... nothing to stop')
-            return
+        return_code = await self._process.wait()
 
-        logging.info('terminating process')
+        self.close_log()
+        self._process = None
 
-        try:
-            os.killpg(os.getpgid(self.process.pid), signal.SIGTERM)
-        except ProcessLookupError:
-            pass
+        return return_code
 
-        self.process.terminate()
-        _, _ = self.process.communicate(timeout=3)
+    async def stop_and_wait(self) -> Optional[int]:
+        """Terminate the process and wait for it to finish. Returns the return code of the process or None if no process is running."""
 
-    @property
-    def return_code(self):
-        if not self.process:
-            return None
-        if self.is_process_running():
+        if not self.is_running():
+            logging.info('No process to stop')
             return None
-        return self.process.poll()
+
+        self.terminate()
+        return await self.wait()
+
+    # -------------------------------------------------------------------------------------------- LOGGING
+
+    def get_log(self) -> str:
+        """Get the log of the process as a string."""
+        if not os.path.exists(self.log_file_path):
+            return ''
+        with open(self.log_file_path, 'r') as f:
+            return f.read()
+
+    def get_log_by_lines(self, tail: Optional[int] = None) -> List[str]:
+        """Get the log of the process as a list of lines."""
+        if not os.path.exists(self.log_file_path):
+            return []
+        with open(self.log_file_path) as f:
+            lines = f.readlines()
+        if tail is not None:
+            lines = lines[-tail:]
+        return lines
+
+    def close_log(self):
+        """Close the log file."""
+        if self.log_file is not None:
+            self.log_file.close()
+            self.log_file = None
diff --git a/learning_loop_node/trainer/io_helpers.py b/learning_loop_node/trainer/io_helpers.py
index 6ec7a5c3..4849d67a 100644
--- a/learning_loop_node/trainer/io_helpers.py
+++ b/learning_loop_node/trainer/io_helpers.py
@@ -14,6 +14,16 @@
 from ..loop_communication import LoopCommunicator
 
 
+class EnvironmentVars:
+    def __init__(self) -> None:
+        self.restart_after_training = os.environ.get(
+            'RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1']
+        self.keep_old_trainings = os.environ.get(
+            'KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1']
+        self.inference_batch_size = int(
+            os.environ.get('INFERENCE_BATCH_SIZE', '10'))
+
+
 class LastTrainingIO:
 
     def __init__(self, node_uuid: str) -> None:
@@ -137,7 +147,8 @@ async def upload_detetions(self):
         num_files = self.get_number_of_detection_files()
         print(f'num_files: {num_files}', flush=True)
         if not num_files:
-            raise Exception('no detection files found')
+            logging.error('no detection files found')
+            return
         current_json_file_index = self.load_detections_upload_file_index()
         for i in range(current_json_file_index, num_files):
             detections = self.load_detections(i)
@@ -164,9 +175,9 @@ async def _upload_detections(self, context: Context, batch_detections: List[Dete
             msg = f'could not upload detections. {str(response)}'
             logging.error(msg)
             raise Exception(msg)
+
+        logging.info('successfully uploaded detections')
+        if up_progress > len(batch_detections):
+            self.save_detection_upload_progress(0)
         else:
-            logging.info('successfully uploaded detections')
-            if up_progress > len(batch_detections):
-                self.save_detection_upload_progress(0)
-            else:
-                self.save_detection_upload_progress(up_progress)
+            self.save_detection_upload_progress(up_progress)
diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py
index a796fc4d..e2dafc26 100644
--- a/learning_loop_node/trainer/rest/backdoor_controls.py
+++ b/learning_loop_node/trainer/rest/backdoor_controls.py
@@ -5,7 +5,6 @@
 from dataclasses import asdict
 from typing import TYPE_CHECKING, Dict
 
-from dacite import from_dict
 from fastapi import APIRouter, HTTPException, Request
 
 from ...data_classes import ErrorConfiguration, NodeState
@@ -98,7 +97,7 @@ async def add_steps(request: Request):
 
     assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic'
 
-    if not trainer_logic._executor or not trainer_logic._executor.is_process_running():  # pylint: disable=protected-access
+    if not trainer_logic._executor or not trainer_logic._executor.is_running():  # pylint: disable=protected-access
         training = trainer_logic._training  # pylint: disable=protected-access
         logging.error(f'cannot add steps when not running, state: {training.training_state if training else "None"}')
         raise HTTPException(status_code=409, detail="trainer is not running")
@@ -127,9 +126,9 @@ async def kill_process(request: Request):
     trainer_node = trainer_node_from_request(request)
     trainer_logic = trainer_node.trainer_logic
     assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic'
-    if not trainer_logic._executor or not trainer_logic._executor.is_process_running():
+    if not trainer_logic._executor or not trainer_logic._executor.is_running():
         raise HTTPException(status_code=409, detail="trainer is not running")
-    trainer_logic._executor.stop()
+    await trainer_logic._executor.stop_and_wait()
 
 
 @router.post("/force_status_update")
diff --git a/learning_loop_node/trainer/rest/controls.py b/learning_loop_node/trainer/rest/controls.py
index b8fbbec8..6c92d9a8 100644
--- a/learning_loop_node/trainer/rest/controls.py
+++ b/learning_loop_node/trainer/rest/controls.py
@@ -7,6 +7,8 @@
 
 router = APIRouter()
 
+# pylint: disable=protected-access
+
 
 @router.post("/controls/detect/{organization}/{project}/{version}")
 async def operation_mode(organization: str, project: str, version: str, request: Request):
diff --git a/learning_loop_node/trainer/tests/conftest.py b/learning_loop_node/trainer/tests/conftest.py
index 75937920..aca1919c 100644
--- a/learning_loop_node/trainer/tests/conftest.py
+++ b/learning_loop_node/trainer/tests/conftest.py
@@ -10,6 +10,8 @@
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_node import TrainerNode
 
+# pylint: disable=protected-access
+
 logging.basicConfig(level=logging.INFO)
 # show ouptut from uvicorn server https://stackoverflow.com/a/66132186/364388
 log_to_stderr(logging.INFO)
@@ -24,16 +26,14 @@ async def test_initialized_trainer_node():
 
     trainer = TestingTrainerLogic()
     node = TrainerNode(name='test', trainer_logic=trainer, uuid='NOD30000-0000-0000-0000-000000000000')
-    trainer._node = node  # pylint: disable=protected-access
-    trainer.init_new_training(context=Context(organization='zauberzeug', project='demo'),
-                              details={'categories': [],
-                                       'id': '917d5c7f-403d-7e92-f95f-577f79c2273a',  # version 1.2 of demo project
-                                       'training_number': 0,
-                                       'resolution': 800,
-                                       'flip_rl': False,
-                                       'flip_ud': False})
-
-    # pylint: disable=protected-access
+    trainer._node = node
+    trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
+                               details={'categories': [],
+                                        'id': '917d5c7f-403d-7e92-f95f-577f79c2273a',  # version 1.2 of demo project
+                                        'training_number': 0,
+                                        'resolution': 800,
+                                        'flip_rl': False,
+                                        'flip_ud': False})
     await node._on_startup()
     yield node
     await node._on_shutdown()
@@ -44,19 +44,17 @@ async def test_initialized_trainer():
 
     trainer = TestingTrainerLogic()
     node = TrainerNode(name='test', trainer_logic=trainer, uuid='NODE-000-0000-0000-0000-000000000000')
-    # pylint: disable=protected-access
-    await node._on_startup()
-    trainer._node = node  # pylint: disable=protected-access
-    trainer.init_new_training(context=Context(organization='zauberzeug', project='demo'),
-                              details={'categories': [],
-                                       'id': '917d5c7f-403d-7e92-f95f-577f79c2273a',  # version 1.2 of demo project
-                                       'training_number': 0,
-                                       'resolution': 800,
-                                       'flip_rl': False,
-                                       'flip_ud': False})
 
+    await node._on_startup()
+    trainer._node = node
+    trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
+                               details={'categories': [],
+                                        'id': '917d5c7f-403d-7e92-f95f-577f79c2273a',  # version 1.2 of demo project
+                                        'training_number': 0,
+                                        'resolution': 800,
+                                        'flip_rl': False,
+                                        'flip_ud': False})
     yield trainer
-    # await node._on_shutdown()
     try:
         await node._on_shutdown()
     except Exception:
@@ -66,10 +64,3 @@ async def test_initialized_trainer():
 def is_port_in_use(port):
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         return s.connect_ex(('localhost', port)) == 0
-
-
-# @pytest.fixture(autouse=True, scope='session')
-# def initialize_active_training():
-#     from learning_loop_node.trainer import active_training_module
-#     active_training_module.init('00000000-0000-0000-0000-000000000000')
-#     yield
diff --git a/learning_loop_node/trainer/tests/state_helper.py b/learning_loop_node/trainer/tests/state_helper.py
index a5b982ec..01c9001d 100644
--- a/learning_loop_node/trainer/tests/state_helper.py
+++ b/learning_loop_node/trainer/tests/state_helper.py
@@ -7,7 +7,7 @@
 
 def create_active_training_file(trainer: TrainerLogic, **kwargs) -> None:
     update_attributes(trainer._training, **kwargs)  # pylint: disable=protected-access
-    trainer.node.last_training_io.save(training=trainer.active_training)
+    trainer.node.last_training_io.save(training=trainer.training)
 
 
 async def assert_training_state(training: Training, state: str, timeout: float, interval: float) -> None:
diff --git a/learning_loop_node/trainer/tests/states/test_state_cleanup.py b/learning_loop_node/trainer/tests/states/test_state_cleanup.py
index 3326d156..f3911a54 100644
--- a/learning_loop_node/trainer/tests/states/test_state_cleanup.py
+++ b/learning_loop_node/trainer/tests/states/test_state_cleanup.py
@@ -1,11 +1,13 @@
 from learning_loop_node.trainer.tests.state_helper import create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 
+# pylint: disable=protected-access
+
 
 async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state='ready_for_cleanup')
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
     trainer.active_training_io.save_detections(detections=[])
 
     trainer.active_training_io.save_detection_upload_progress(count=42)
@@ -16,9 +18,9 @@ async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic
     assert trainer.active_training_io.detection_upload_progress_exist() is True
     assert trainer.active_training_io.detections_upload_file_index_exists() is True
 
-    await trainer.clear_training()
+    await trainer._clear_training()
 
-    assert trainer._training is None  # pylint: disable=protected-access
+    assert trainer._training is None
     assert trainer.node.last_training_io.exists() is False
     assert trainer.active_training_io.detections_exist() is False
     assert trainer.active_training_io.detection_upload_progress_exist() is False
diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py
index fbb8e9c0..5492f8dc 100644
--- a/learning_loop_node/trainer/tests/states/test_state_detecting.py
+++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py
@@ -6,6 +6,7 @@
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
+# pylint: disable=protected-access
 error_key = 'detecting'
 
 
@@ -13,38 +14,36 @@ def trainer_has_error(trainer: TrainerLogic):
     return trainer.errors.has_error_for(error_key)
 
 
-async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic):  # TODO Flaky test
+async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic):  # NOTE was a flaky test
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state='train_model_uploaded',
-                                model_id_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a')
+                                model_uuid_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a')
     # trainer.load_active_training()
     _ = asyncio.get_running_loop().create_task(
-        trainer.perform_state('do_detections', TrainerState.Detecting,
-                              TrainerState.Detected, trainer._do_detections)
-    )
+        trainer._perform_state('do_detections', TrainerState.Detecting, TrainerState.Detected, trainer._do_detections))
 
-    await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=10, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detecting, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detected, timeout=10, interval=0.001)
 
     assert trainer_has_error(trainer) is False
-    assert trainer.active_training.training_state == TrainerState.Detected
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.Detected
+    assert trainer.node.last_training_io.load() == trainer.training
     assert trainer.active_training_io.detections_exist()
 
 
 async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded)
-    trainer.init_from_last_training()
-    trainer.active_training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242'
+    trainer._init_from_last_training()
+    trainer.training.model_uuid_for_detecting = '12345678-bobo-7e92-f95f-424242424242'
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=5, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detecting, timeout=5, interval=0.001)
     await trainer.stop()
     await asyncio.sleep(0.1)
 
-    assert trainer._training is None  # pylint: disable=protected-access
+    assert trainer._training is None
     assert trainer.active_training_io.detections_exist() is False
     assert trainer.node.last_training_io.exists() is False
 
@@ -52,25 +51,25 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer
 async def test_model_not_downloadable_error(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded,
-                                model_id_for_detecting='00000000-0000-0000-0000-000000000000')  # bad model id
-    trainer.init_from_last_training()
+                                model_uuid_for_detecting='00000000-0000-0000-0000-000000000000')  # bad model id
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, 'detecting', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, 'train_model_uploaded', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, 'detecting', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, 'train_model_uploaded', timeout=1, interval=0.001)
     await asyncio.sleep(0.1)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.TrainModelUploaded
-    assert trainer.active_training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000'
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainModelUploaded
+    assert trainer.training.model_uuid_for_detecting == '00000000-0000-0000-0000-000000000000'
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 def test_save_load_detections(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     detections = [get_dummy_detections(), get_dummy_detections()]
 
diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
index 12e9b745..282a2288 100644
--- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py
@@ -6,37 +6,39 @@
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 
+# pylint: disable=protected-access
+
 
 async def test_downloading_is_successful(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.DataDownloaded)
 
     trainer.model_format = 'mocked'
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     asyncio.get_running_loop().create_task(
-        trainer.perform_state('download_model',
-                              TrainerState.TrainModelDownloading,
-                              TrainerState.TrainModelDownloaded, trainer._download_model))
-    await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, 'train_model_downloaded', timeout=1, interval=0.001)
+        trainer._perform_state('download_model',
+                               TrainerState.TrainModelDownloading,
+                               TrainerState.TrainModelDownloaded, trainer._download_model))
+    await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, 'train_model_downloaded', timeout=1, interval=0.001)
 
-    assert trainer.active_training.training_state == TrainerState.TrainModelDownloaded
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainModelDownloaded
+    assert trainer.node.last_training_io.load() == trainer.training
 
     # file on disk
-    assert os.path.exists(f'{trainer.active_training.training_folder}/base_model.json')
-    assert os.path.exists(f'{trainer.active_training.training_folder}/file_1.txt')
-    assert os.path.exists(f'{trainer.active_training.training_folder}/file_2.txt')
+    assert os.path.exists(f'{trainer.training.training_folder}/base_model.json')
+    assert os.path.exists(f'{trainer.training.training_folder}/file_1.txt')
+    assert os.path.exists(f'{trainer.training.training_folder}/file_2.txt')
 
 
 async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state='data_downloaded')
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -48,14 +50,14 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogi
 async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.DataDownloaded,
-                                base_model_id='00000000-0000-0000-0000-000000000000')  # bad model id)
-    trainer.init_from_last_training()
+                                base_model_uuid_or_name='00000000-0000-0000-0000-000000000000')  # bad model id
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.DataDownloaded, timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.DataDownloaded, timeout=1, interval=0.001)
 
     assert trainer.errors.has_error_for('download_model')
     assert trainer._training is not None  # pylint: disable=protected-access
-    assert trainer.active_training.training_state == TrainerState.DataDownloaded
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.DataDownloaded
+    assert trainer.node.last_training_io.load() == trainer.training
diff --git a/learning_loop_node/trainer/tests/states/test_state_prepare.py b/learning_loop_node/trainer/tests/states/test_state_prepare.py
index 8c490c92..d3222f9a 100644
--- a/learning_loop_node/trainer/tests/states/test_state_prepare.py
+++ b/learning_loop_node/trainer/tests/states/test_state_prepare.py
@@ -5,6 +5,7 @@
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
+# pylint: disable=protected-access
 error_key = 'prepare'
 
 
@@ -15,22 +16,22 @@ def trainer_has_error(trainer: TrainerLogic):
 async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    await trainer.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare)
+    await trainer._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare)
     assert trainer_has_error(trainer) is False
-    assert trainer.active_training.training_state == TrainerState.DataDownloaded
-    assert trainer.active_training.data is not None
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.DataDownloaded
+    assert trainer.training.data is not None
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_abort_preparing(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -43,13 +44,13 @@ async def test_request_error(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, context=Context(
         organization='zauberzeug', project='some_bad_project'))
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=3, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.Initialized, timeout=3, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=3, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Initialized, timeout=3, interval=0.001)
 
     assert trainer_has_error(trainer)
     assert trainer._training is not None  # pylint: disable=protected-access
-    assert trainer.active_training.training_state == TrainerState.Initialized
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.Initialized
+    assert trainer.node.last_training_io.load() == trainer.training
diff --git a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
index cc145233..6a292be5 100644
--- a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
+++ b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py
@@ -10,6 +10,8 @@
 from ..state_helper import assert_training_state, create_active_training_file
 from ..testing_trainer_logic import TestingTrainerLogic
 
+# pylint: disable=protected-access
+
 error_key = 'sync_confusion_matrix'
 
 
@@ -23,14 +25,14 @@ async def test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic):
     # TODO this requires trainer to have _training
     # trainer.load_active_training()
     create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
     assert trainer_has_error(trainer) is False
-    assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_unsynced_model_available__sync_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture):
@@ -40,15 +42,15 @@ async def test_unsynced_model_available__sync_successful(test_initialized_traine
     await mock_socket_io_call(mocker, test_initialized_trainer_node, {'success': True})
     create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
 
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
     trainer.has_new_model = True
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer) is False
 #    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_unsynced_model_available__sio_not_connected(test_initialized_trainer_node: TrainerNode):
@@ -60,14 +62,14 @@ async def test_unsynced_model_available__sio_not_connected(test_initialized_trai
     assert test_initialized_trainer_node.sio_client.connected is False
     trainer.has_new_model = True
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_unsynced_model_available__request_is_not_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture):
@@ -79,14 +81,14 @@ async def test_unsynced_model_available__request_is_not_successful(test_initiali
     create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
 
     trainer.has_new_model = True
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_basic_mock(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture):
diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py
index 46a7f953..4e1d200c 100644
--- a/learning_loop_node/trainer/tests/states/test_state_train.py
+++ b/learning_loop_node/trainer/tests/states/test_state_train.py
@@ -5,47 +5,46 @@
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 
+# pylint: disable=protected-access
+
 
 async def test_successful_training(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
-    await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
+    await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.01)
     assert trainer.start_training_task is not None
-    assert trainer.start_training_task.__name__ == 'start_training'
 
-    # pylint: disable=protected-access
     assert trainer._executor is not None
-    trainer._executor.stop()  # NOTE normally a training terminates itself
-    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await trainer.stop()  # NOTE normally a training terminates itself
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.active_training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_stop_running_training(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)  # pylint: disable=protected-access
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.01)
     assert trainer.start_training_task is not None
-    assert trainer.start_training_task.__name__ == 'start_training'
 
     await trainer.stop()
-    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=2, interval=0.01)
 
-    assert trainer.active_training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrainerLogic):
@@ -53,20 +52,18 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain
 
     # NOTE e.g. when a node-computer is restarted
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
-    trainer.init_from_last_training()
-    trainer._can_resume = True  # pylint: disable=protected-access
+    trainer._init_from_last_training()
+    trainer._can_resume_flag = True
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01)  # pylint: disable=protected-access
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     assert trainer.start_training_task is not None
-    assert trainer.start_training_task.__name__ == 'resume'
 
-    # pylint: disable=protected-access
     assert trainer._executor is not None
-    trainer._executor.stop()  # NOTE normally a training terminates itself e.g
-    await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
+    await trainer._executor.stop_and_wait()  # NOTE normally a training terminates itself e.g
+    await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001)
 
-    assert trainer.active_training.training_state == TrainerState.TrainingFinished
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainingFinished
+    assert trainer.node.last_training_io.load() == trainer.training
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
index 757cf968..e2784514 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py
@@ -10,6 +10,7 @@
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
+# pylint: disable=protected-access
 error_key = 'upload_detections'
 
 
@@ -44,14 +45,14 @@ async def create_valid_detection_file(trainer: TrainerLogic, number_of_entries:
 async def test_upload_successful(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.Detected)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     await create_valid_detection_file(trainer)
     await asyncio.get_running_loop().create_task(
-        trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions))
+        trainer._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions))
 
-    assert trainer.active_training.training_state == TrainerState.ReadyForCleanup
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.ReadyForCleanup
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 @pytest.mark.asyncio
@@ -59,14 +60,14 @@ async def test_detection_upload_progress_is_stored(test_initialized_trainer: Tes
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.Detected)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     await create_valid_detection_file(trainer)
 
     assert trainer.active_training_io.load_detections_upload_file_index() == 0
     # await trainer.upload_detections()
     await asyncio.get_running_loop().create_task(
-        trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions))
+        trainer._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions))
 
     assert trainer.active_training_io.load_detection_upload_progress() == 0  # Progress is reset for every file
     assert trainer.active_training_io.load_detections_upload_file_index() == 1
@@ -77,7 +78,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.Detected)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     await create_valid_detection_file(trainer, 2, 0)
     await create_valid_detection_file(trainer, 2, 1)
@@ -91,7 +92,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test
     for i in range(skip_detections, len(detections), batch_size):
         batch_detections = detections[i:i+batch_size]
         # pylint: disable=protected-access
-        await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size)
+        await trainer.active_training_io._upload_detections(trainer.training.context, batch_detections, i + batch_size)
 
         expected_value = i + batch_size if i + batch_size < len(detections) else 0  # Progress is reset for every file
         assert trainer.active_training_io.load_detection_upload_progress() == expected_value
@@ -107,7 +108,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test
     for i in range(skip_detections, len(detections), batch_size):
         batch_detections = detections[i:i+batch_size]
         # pylint: disable=protected-access
-        await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size)
+        await trainer.active_training_io._upload_detections(trainer.training.context, batch_detections, i + batch_size)
 
         expected_value = i + batch_size if i + batch_size < len(detections) else 0  # Progress is reset for every file
         assert trainer.active_training_io.load_detection_upload_progress() == expected_value
@@ -120,44 +121,41 @@ async def test_bad_status_from_LearningLoop(test_initialized_trainer: TestingTra
 
     create_active_training_file(trainer, training_state=TrainerState.Detected, context=Context(
         organization='zauberzeug', project='some_bad_project'))
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
     trainer.active_training_io.save_detections([get_dummy_detections()])
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001)
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.Detected
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.Detected
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
-async def test_other_errors(test_initialized_trainer: TestingTrainerLogic):
+async def test_go_to_cleanup_if_no_detections_exist(test_initialized_trainer: TestingTrainerLogic):
+    """This test simulates a situation where the detection file is missing.
+    In this case, the trainer should report an error and move to the ReadyForCleanup state."""
     trainer = test_initialized_trainer
 
     # e.g. missing detection file
     create_active_training_file(trainer, training_state=TrainerState.Detected)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
-    await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
-    await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001)
-
-    assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.Detected
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    _ = asyncio.get_running_loop().create_task(trainer._run())
+    await assert_training_state(trainer.training, TrainerState.ReadyForCleanup, timeout=1, interval=0.001)
 
 
 async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.Detected)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
     await create_valid_detection_file(trainer)
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
index 9faa656f..b2bfa4c7 100644
--- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py
+++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py
@@ -7,6 +7,7 @@
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
+# pylint: disable=protected-access
 error_key = 'upload_model'
 
 
@@ -19,29 +20,29 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer
     mock_upload_model_for_training(mocker, 'new_model_id')
 
     create_active_training_file(trainer)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     train_task = asyncio.get_running_loop().create_task(
-        trainer.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, trainer._upload_model))
+        trainer._perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, trainer._upload_model))
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
     await train_task
 
     assert trainer_has_error(trainer) is False
-    assert trainer.active_training.training_state == TrainerState.TrainModelUploaded
-    assert trainer.active_training.model_id_for_detecting is not None
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.TrainModelUploaded
+    assert trainer.training.model_uuid_for_detecting is not None
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_abort_upload_model(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
 
     await trainer.stop()
     await asyncio.sleep(0.1)
@@ -57,18 +58,18 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai
     trainer = test_initialized_trainer
 
     create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
     # TODO goes to finished because of the error
-    await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001)
 
     assert trainer_has_error(trainer)
-    assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced
-    assert trainer.active_training.model_id_for_detecting is None
-    assert trainer.node.last_training_io.load() == trainer.active_training
+    assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
+    assert trainer.training.model_uuid_for_detecting is None
+    assert trainer.node.last_training_io.load() == trainer.training
 
 
 async def test_mock_loop_response_example(mocker: MockerFixture, test_initialized_trainer: TestingTrainerLogic):
@@ -77,7 +78,7 @@ async def test_mock_loop_response_example(mocker: MockerFixture, test_initialize
     mock_upload_model_for_training(mocker, 'new_model_id')
 
     create_active_training_file(trainer)
-    trainer.init_from_last_training()
+    trainer._init_from_last_training()
 
     # pylint: disable=protected-access
     result = await trainer._upload_model_return_new_model_uuid(Context(organization='zauberzeug', project='demo'))
diff --git a/learning_loop_node/trainer/tests/test_errors.py b/learning_loop_node/trainer/tests/test_errors.py
index 1ba85572..9a9c1cd8 100644
--- a/learning_loop_node/trainer/tests/test_errors.py
+++ b/learning_loop_node/trainer/tests/test_errors.py
@@ -1,40 +1,45 @@
 import asyncio
 import re
 
+import pytest
+
 from learning_loop_node.data_classes import TrainerState
 from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file
 from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic
 
+# pylint: disable=protected-access
+
 
 async def test_training_process_is_stopped_when_trainer_reports_error(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
-    trainer.init_from_last_training()
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    trainer._init_from_last_training()
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     trainer.error_msg = 'some_error'
-    await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
 
 
+@pytest.mark.skip(reason='The since_last_start flag is deprecated.')
 async def test_log_can_provide_only_data_for_current_run(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded)
-    trainer.init_from_last_training()
-    _ = asyncio.get_running_loop().create_task(trainer.run())
+    trainer._init_from_last_training()
+    _ = asyncio.get_running_loop().create_task(trainer._run())
 
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
 
     assert trainer._executor is not None
     assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) == 1
 
     trainer.error_msg = 'some_error'
-    await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001)
     trainer.error_msg = None
-    await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
+    await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001)
     await asyncio.sleep(1)
 
     assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) > 1
     # Here only the current run is provided
-    assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines(since_last_start=True)))) == 1
+    # assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines(since_last_start=True)))) == 1
diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py
index c7faeca8..50171e08 100644
--- a/learning_loop_node/trainer/tests/testing_trainer_logic.py
+++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py
@@ -1,8 +1,8 @@
 import asyncio
 import time
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional
 
-from learning_loop_node.data_classes import BasicModel, Context, Detections, ModelInformation, PretrainedModel
+from learning_loop_node.data_classes import Context, Detections, ModelInformation, PretrainedModel, TrainingStateData
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
 
@@ -11,7 +11,7 @@ class TestingTrainerLogic(TrainerLogic):
 
     def __init__(self, can_resume: bool = False) -> None:
         super().__init__('mocked')
-        self._can_resume: bool = can_resume
+        self._can_resume_flag: bool = can_resume
         self.has_new_model: bool = False
         self.error_msg: Optional[str] = None
 
@@ -25,25 +25,25 @@ def model_architecture(self) -> str:
 
     @property
     def provided_pretrained_models(self) -> List[PretrainedModel]:
-        return [
-            PretrainedModel(name='small', label='Small', description='a small model'),
-            PretrainedModel(name='medium', label='Medium', description='a medium model'),
-            PretrainedModel(name='large', label='Large', description='a large model')]
+        return [PretrainedModel(name='small', label='Small', description='a small model'),
+                PretrainedModel(name='medium', label='Medium', description='a medium model'),
+                PretrainedModel(name='large', label='Large', description='a large model')]
 
     # pylint: disable=unused-argument
-    async def start_training(self, model: str = 'model.model') -> None:
+    async def _start_training_from_base_model(self, model: str = 'model.model') -> None:
         assert self._executor is not None
-        self._executor.start('while true; do sleep 1; done')
+        await self._executor.start('/bin/bash -c "while true; do sleep 1; done"')
 
-    async def start_training_from_scratch(self, base_model_id: str) -> None:
-        await self.start_training(model=f'model_{base_model_id}.pt')
+    async def _start_training_from_scratch(self) -> None:
+        assert self.training.base_model_uuid_or_name is not None, 'base_model_uuid_or_name must be set'
+        await self._start_training_from_base_model(model=f'model_{self.training.base_model_uuid_or_name}.pt')
 
-    def get_new_best_model(self) -> Optional[BasicModel]:
+    def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
         if self.has_new_model:
-            return BasicModel(confusion_matrix={})
+            return TrainingStateData(confusion_matrix={})
         return None
 
-    def on_model_published(self, basic_model: BasicModel) -> None:
+    def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
         pass
 
     async def _prepare(self) -> None:
@@ -54,9 +54,9 @@ async def _download_model(self) -> None:
         await super()._download_model()
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
 
-    async def upload_model(self) -> None:
+    async def _upload_model(self) -> None:
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
-        await super().upload_model()
+        await super()._upload_model()
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
 
     async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
@@ -66,7 +66,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
         assert isinstance(result, str)
         return result
 
-    def get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
+    async def _get_latest_model_files(self) -> Dict[str, List[str]]:
         time.sleep(1)  # NOTE reduce flakyness in Backend tests du to wrong order of events.
         fake_weight_file = '/tmp/weightfile.weights'
         with open(fake_weight_file, 'wb') as f:
@@ -77,18 +77,18 @@ def get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
             f.write('zweiundvierzig')
         return {'mocked': [fake_weight_file, more_data_file], 'mocked_2': [fake_weight_file, more_data_file]}
 
-    def can_resume(self) -> bool:
-        return self._can_resume
+    def _can_resume(self) -> bool:
+        return self._can_resume_flag
 
-    async def resume(self) -> None:
-        return await self.start_training()
+    async def _resume(self) -> None:
+        return await self._start_training_from_base_model()
 
     async def _detect(self, model_information: ModelInformation, images:  List[str], model_folder: str) -> List[Detections]:
         detections: List[Detections] = []
         return detections
 
-    async def clear_training_data(self, training_folder: str) -> None:
+    async def _clear_training_data(self, training_folder: str) -> None:
         return
 
-    def get_executor_error_from_log(self) -> Optional[str]:
+    def _get_executor_error_from_log(self) -> Optional[str]:
         return self.error_msg
diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py
index 82fd8aad..ea32b6dc 100644
--- a/learning_loop_node/trainer/trainer_logic.py
+++ b/learning_loop_node/trainer/trainer_logic.py
@@ -5,7 +5,7 @@
 import shutil
 from abc import abstractmethod
 from datetime import datetime
-from typing import Coroutine, Dict, List, Optional
+from typing import Coroutine, List, Optional
 
 from dacite import from_dict
 
@@ -18,45 +18,47 @@
 class TrainerLogic(TrainerLogicGeneric):
 
     def __init__(self, model_format: str) -> None:
-        super().__init__(model_format)
-        self.model_format: str = model_format
-        # NOTE: String to be used in the file path for the model on the server:
-        # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
+        """This class is the base class for all trainers that use an executor to run training processes.
+        The executor is used to run the training process in a separate process."""
 
+        super().__init__(model_format)
+        self._detection_progress: Optional[float] = None
         self._executor: Optional[Executor] = None
         self.start_training_task: Optional[Coroutine] = None
+        self.inference_batch_size = 10
+
+    # ---------------------------------------- IMPLEMENTED ABSTRACT PROPERTIES ----------------------------------------
+
+    @property
+    def detection_progress(self) -> Optional[float]:
+        return self._detection_progress
+
+    # ---------------------------------------- PROPERTIES ----------------------------------------
 
     @property
     def executor(self) -> Executor:
         assert self._executor is not None, 'executor must be set, call `run_training` first'
         return self._executor
 
-    @property
-    def hyperparameters(self) -> Optional[Dict]:
-        if self._training and self._training.data and self._training.data.hyperparameter:
-            information = {}
-            information['resolution'] = self._training.data.hyperparameter.resolution
-            information['flipRl'] = self._training.data.hyperparameter.flip_rl
-            information['flipUd'] = self._training.data.hyperparameter.flip_ud
-            return information
-        return None
+    # ---------------------------------------- IMPLEMENTED ABSTRACT METHODS ----------------------------------------
 
     async def _train(self) -> None:
         previous_state = TrainerState.TrainModelDownloaded
         error_key = 'run_training'
-        self._executor = Executor(self.active_training.training_folder)
-        self.active_training.training_state = TrainerState.TrainingRunning
+        self._executor = Executor(self.training.training_folder)
+        self.training.training_state = TrainerState.TrainingRunning
 
         try:
             await self._start_training()
-
             last_sync_time = datetime.now()
+
             while True:
-                if not self.executor.is_process_running():
+                await asyncio.sleep(0.1)
+                if not self.executor.is_running():
                     break
                 if (datetime.now() - last_sync_time).total_seconds() > 5:
                     last_sync_time = datetime.now()
-                    if self.get_executor_error_from_log():
+                    if self._get_executor_error_from_log():
                         break
                     self.errors.reset(error_key)
                     try:
@@ -65,65 +67,49 @@ async def _train(self) -> None:
                         logging.warning('CancelledError in run_training')
                         raise
                     except Exception:
-                        pass
-                else:
-                    await asyncio.sleep(0.1)
+                        logging.error('Error in sync_confusion_matrix (this error is ignored)')
 
-            error = self.get_executor_error_from_log()
-            if error:
+            if error := self._get_executor_error_from_log():
                 raise TrainingError(cause=error)
-            # TODO check if this works:
+
+            # NOTE: This is problematic, because the return code is not 0 when the executor was stopped, e.g. via self.stop()
             # if self.executor.return_code != 0:
-            #     self.errors.set(error_key, f'Executor return code was {self.executor.return_code}')
-            #     raise TrainingError(cause=f'Executor return code was {self.executor.return_code}')
+            #     raise TrainingError(cause=f'Executor returned with error code: {self.executor.return_code}')
 
         except TrainingError:
-            logging.exception('Error in TrainingProcess')
-            if self.executor.is_process_running():
-                self.executor.stop()
-            self.active_training.training_state = previous_state
+            logging.exception('Exception in trainer_logic._train')
+            await self.executor.stop_and_wait()
+            self.training.training_state = previous_state
             raise
 
-    async def _start_training(self):
-        self.start_training_task = None  # NOTE: this is used i.e. by tests
-        if self.can_resume():
-            self.start_training_task = self.resume()
-        else:
-            base_model_id = self.active_training.base_model_id
-            if not is_valid_uuid4(base_model_id):  # TODO this check was done earlier!
-                assert isinstance(base_model_id, str)
-                # TODO this could be removed here and accessed via self.training.base_model_id
-                self.start_training_task = self.start_training_from_scratch(base_model_id)
-            else:
-                self.start_training_task = self.start_training()
-        await self.start_training_task
-
     async def _do_detections(self) -> None:
-        context = self.active_training.context
-        model_id = self.active_training.model_id_for_detecting
-        assert model_id, 'model_id must be set'
+        context = self.training.context
+        model_id = self.training.model_uuid_for_detecting
+        if not model_id:
+            logging.error('model_id is not set! Cannot do detections.')
+            return
         tmp_folder = f'/tmp/model_for_auto_detections_{model_id}_{self.model_format}'
 
         shutil.rmtree(tmp_folder, ignore_errors=True)
         os.makedirs(tmp_folder)
         logging.info(f'downloading detection model to {tmp_folder}')
 
-        await self.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format)
+        await self.node.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format)
         with open(f'{tmp_folder}/model.json', 'r') as f:
             model_information = from_dict(data_class=ModelInformation, data=json.load(f))
 
         project_folder = create_project_folder(context)
         image_folder = create_image_folder(project_folder)
-        self.data_exchanger.set_context(context)
+        self.node.data_exchanger.set_context(context)
         image_ids = []
         for state, p in zip(['inbox', 'annotate', 'review', 'complete'], [0.1, 0.2, 0.3, 0.4]):
-            self.detection_progress = p
+            self._detection_progress = p
             logging.info(f'fetching image ids of {state}')
-            new_ids = await self.data_exchanger.fetch_image_uuids(query_params=f'state={state}')
+            new_ids = await self.node.data_exchanger.fetch_image_uuids(query_params=f'state={state}')
             image_ids += new_ids
             logging.info(f'downloading {len(new_ids)} images')
-            await self.data_exchanger.download_images(new_ids, image_folder)
-        self.detection_progress = 0.42
+            await self.node.data_exchanger.download_images(new_ids, image_folder)
+        self._detection_progress = 0.42
         # await delete_corrupt_images(image_folder)
 
         images = await asyncio.get_event_loop().run_in_executor(None, images_for_ids, image_ids, image_folder)
@@ -131,19 +117,36 @@ async def _do_detections(self) -> None:
             self.active_training_io.save_detections([], 0)
         num_images = len(images)
 
-        batch_size = 200
-        for idx, i in enumerate(range(0, num_images, batch_size)):
-            self.detection_progress = 0.5 + (i/num_images)*0.5
-            batch_images = images[i:i+batch_size]
+        for idx, i in enumerate(range(0, num_images, self.inference_batch_size)):
+            self._detection_progress = 0.5 + (i/num_images)*0.5
+            batch_images = images[i:i+self.inference_batch_size]
             batch_detections = await self._detect(model_information, batch_images, tmp_folder)
             self.active_training_io.save_detections(batch_detections, idx)
 
+    # ---------------------------------------- METHODS ----------------------------------------
+
+    async def _start_training(self):
+        self.start_training_task = None  # NOTE: this is used i.e. by tests
+        if self._can_resume():
+            self.start_training_task = self._resume()
+        else:
+            base_model_uuid_or_name = self.training.base_model_uuid_or_name
+            if not is_valid_uuid4(base_model_uuid_or_name):
+                self.start_training_task = self._start_training_from_scratch()
+            else:
+                self.start_training_task = self._start_training_from_base_model()
+        await self.start_training_task
+
+    # ---------------------------------------- OVERWRITTEN METHODS ----------------------------------------
+
     async def stop(self) -> None:
         """If executor is running, stop it. Else cancel training task."""
+        print('===============> stop received in trainer_logic.', flush=True)
+
         if not self.training_active:
             return
-        if self._executor and self._executor.is_process_running():
-            self.executor.stop()
+        if self._executor and self._executor.is_running():
+            await self.executor.stop_and_wait()
         elif self.training_task:
             logging.info('cancelling training task')
             if self.training_task.cancel():
@@ -152,33 +155,31 @@ async def stop(self) -> None:
                 except asyncio.CancelledError:
                     pass
                 logging.info('cancelled training task')
-                self.may_restart()
-
-    def get_log(self) -> str:
-        return self.executor.get_log()
+                self._may_restart()
 
     # ---------------------------------------- ABSTRACT METHODS ----------------------------------------
 
     @abstractmethod
-    async def start_training(self) -> None:
-        '''Should be used to start a training.'''
+    async def _start_training_from_base_model(self) -> None:
+        '''Should be used to start a training on the executor, e.g. self.executor.start(cmd).'''
 
     @abstractmethod
-    async def start_training_from_scratch(self, base_model_id: str) -> None:
-        '''Should be used to start a training from scratch.
-        base_model_id is the id of a pretrained model provided by self.provided_pretrained_models.'''
+    async def _start_training_from_scratch(self) -> None:
+        '''Should be used to start a training from scratch on the executor, e.g. self.executor.start(cmd).
+        NOTE: the name of the pretrained model (one of self.provided_pretrained_models)
+        is accessible via self.training.base_model_uuid_or_name.'''
 
     @abstractmethod
-    def can_resume(self) -> bool:
+    def _can_resume(self) -> bool:
         '''Override this method to return True if the trainer can resume training.'''
 
     @abstractmethod
-    async def resume(self) -> None:
+    async def _resume(self) -> None:
         '''Is called when self.can_resume() returns True.
         One may resume the training on a previously trained model stored by self.on_model_published(basic_model).'''
 
     @abstractmethod
-    def get_executor_error_from_log(self) -> Optional[str]:
+    def _get_executor_error_from_log(self) -> Optional[str]:
         '''Should be used to provide error informations to the Learning Loop by extracting data from self.executor.get_log().'''
 
     @abstractmethod
diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py
deleted file mode 100644
index 64349e3d..00000000
--- a/learning_loop_node/trainer/trainer_logic_abstraction.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import os
-import time
-from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Dict, List, Optional
-
-from socketio import AsyncClient
-
-from ..data_classes import Context, Errors, PretrainedModel, TrainerState, Training, TrainingData
-from ..data_exchanger import DataExchanger
-from ..loop_communication import LoopCommunicator
-from .io_helpers import ActiveTrainingIO, LastTrainingIO
-
-if TYPE_CHECKING:
-    from .trainer_node import TrainerNode
-
-
-class TrainerLogicAbstraction(ABC):
-
-    def __init__(self, model_format: str):
-
-        # NOTE: String to be used in the file path for the model on the server:
-        # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
-        self.model_format: str = model_format
-
-        self._node: Optional['TrainerNode'] = None  # type: ignore
-        self._last_training_io: Optional[LastTrainingIO] = None  # type: ignore
-        self.errors = Errors()
-
-        self._training: Optional[Training] = None
-        self._active_training_io: Optional[ActiveTrainingIO] = None
-
-        self.restart_after_training = os.environ.get('RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1']
-        self.keep_old_trainings = os.environ.get('KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1']
-        self.inference_batch_size = int(os.environ.get('INFERENCE_BATCH_SIZE', '10'))
-
-    @property
-    def node(self) -> 'TrainerNode':
-        assert self._node is not None, 'node should be set by TrainerNode before initialization'
-        return self._node
-
-    @property
-    def last_training_io(self) -> LastTrainingIO:
-        assert self._last_training_io is not None, 'last_training_io should be set by TrainerNode before initialization'
-        return self._last_training_io
-
-    @property
-    def data_exchanger(self) -> DataExchanger:
-        return self.node.data_exchanger
-
-    @property
-    def loop_communicator(self) -> LoopCommunicator:
-        return self.node.loop_communicator
-
-    @property
-    def node_uuid(self) -> str:
-        return self.node.uuid
-
-    @property
-    def sio_client(self) -> AsyncClient:
-        return self.node.sio_client
-
-    @property
-    def active_training_io(self) -> ActiveTrainingIO:
-        assert self._active_training_io is not None, 'active_training_io must be set, call `init` first'
-        return self._active_training_io
-
-    @property
-    def training_active(self) -> bool:
-        """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'"""
-        return self._training is not None and self._active_training_io is not None
-
-    @property
-    def state(self) -> str:
-        if (not self.training_active) or (self.active_training.training_state is None):
-            return TrainerState.Idle.value
-        else:
-            return self.active_training.training_state
-
-    @property
-    def active_training(self) -> Training:
-        assert self._training is not None, 'training must be initialized, call `init` first'
-        return self._training
-
-    @property
-    def training_uptime(self) -> Optional[float]:
-        if self.training_active:
-            return time.time() - self.active_training.start_time
-        return None
-
-    @property
-    def training_data(self) -> Optional[TrainingData]:
-        if self.training_active and self.active_training.data:
-            return self.active_training.data
-        return None
-
-    @property
-    def training_context(self) -> Optional[Context]:
-        if self.training_active:
-            return self.active_training.context
-        return None
-
-    # --- ABSTRACT PROPERTIES
-    # --------- implemented in TrainerLogicGeneric
-
-    @property
-    @abstractmethod
-    def general_progress(self) -> Optional[float]:
-        """Returns the general progress of the training per state or None if idle"""
-
-    # --------- implemented in TrainerLogic(with Executor)
-    @property
-    @abstractmethod
-    def hyperparameters(self) -> Optional[Dict]:
-        """Returns the currently used hyperparameters if available"""
-
-    # --------- not implemented in any abstract class
-    @property
-    @abstractmethod
-    def model_architecture(self) -> Optional[str]:
-        """Returns the architecture name of the model if available"""
-
-    @property
-    @abstractmethod
-    def provided_pretrained_models(self) -> List[PretrainedModel]:
-        """Returns the list of provided pretrained models"""
-
-    # --- ABSTRACT METHODS -----
-    # --------- implemented in TrainerLogicGeneric ---
-
-    @abstractmethod
-    async def on_shutdown(self):
-        """Called when the trainer is shut down"""
-
-    @abstractmethod
-    async def begin_training(self, organization: str, project: str, details: dict):
-        """Starts the training process"""
-
-    @abstractmethod
-    async def try_continue_run_if_incomplete(self) -> bool:
-        """Start training continuation if possible, returns True if continuation started"""
-
-    # --- implemented in TrainerLogic(with Executor) ---
-
-    @abstractmethod
-    async def stop(self):
-        """Stops the training process"""
diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py
index 7221e6ec..f790bbd9 100644
--- a/learning_loop_node/trainer/trainer_logic_generic.py
+++ b/learning_loop_node/trainer/trainer_logic_generic.py
@@ -3,37 +3,130 @@
 import logging
 import shutil
 import sys
-from abc import abstractmethod
+import time
+from abc import ABC, abstractmethod
 from dataclasses import asdict
-from typing import Callable, Coroutine, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Callable, Coroutine, Dict, List, Optional
 
-from dacite import from_dict
 from fastapi.encoders import jsonable_encoder
 
-from ..data_classes import BasicModel, Category, Context, Hyperparameter, TrainerState, TrainingData, TrainingOut
+from ..data_classes import (Context, Errors, Hyperparameter, PretrainedModel, TrainerState, Training, TrainingData,
+                            TrainingOut, TrainingStateData)
 from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4
 from .downloader import TrainingsDownloader
-from .io_helpers import ActiveTrainingIO
-from .trainer_logic_abstraction import TrainerLogicAbstraction
+from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO
 
+if TYPE_CHECKING:
+    from .trainer_node import TrainerNode
 
-class TrainerLogicGeneric(TrainerLogicAbstraction):
+
+class TrainerLogicGeneric(ABC):
 
     def __init__(self, model_format: str) -> None:
-        super().__init__(model_format)
+
+        # NOTE: model_format is used in the file path for the model on the server:
+        # It acts as a key for list of files (cf. _get_latest_model_files)
+        # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file'
+        self.model_format: str = model_format
+        self.errors = Errors()
+
         self.training_task: Optional[asyncio.Task] = None
-        self.detection_progress = 0.0
         self.shutdown_event: asyncio.Event = asyncio.Event()
 
+        self._node: Optional['TrainerNode'] = None  # type: ignore
+        self._last_training_io: Optional[LastTrainingIO] = None  # type: ignore
+
+        self._training: Optional[Training] = None
+        self._active_training_io: Optional[ActiveTrainingIO] = None
+        self._environment_vars = EnvironmentVars()
+
+    # ---------------------------------------- PROPERTIES TO AVOID CHECKING FOR NONE ----------------------------------------
+
+    @property
+    def node(self) -> 'TrainerNode':
+        assert self._node is not None, 'node should be set by TrainerNode before initialization'
+        return self._node
+
+    @property
+    def last_training_io(self) -> LastTrainingIO:
+        assert self._last_training_io is not None, 'last_training_io should be set by TrainerNode before initialization'
+        return self._last_training_io
+
+    @property
+    def active_training_io(self) -> ActiveTrainingIO:
+        assert self._active_training_io is not None, 'active_training_io must be set, call `init` first'
+        return self._active_training_io
+
+    @property
+    def training(self) -> Training:
+        assert self._training is not None, 'training must be initialized, call `init` first'
+        return self._training
+
+    @property
+    def hyperparameter(self) -> Hyperparameter:
+        assert self.training_data is not None, 'Training should have data'
+        assert self.training_data.hyperparameter is not None, 'Training.data should have hyperparameter'
+        return self.training_data.hyperparameter
+
+    # ---------------------------------------- PROPERTIES ----------------------------------------
+
+    @property
+    def training_data(self) -> Optional[TrainingData]:
+        if self.training_active and self.training.data:
+            return self.training.data
+        return None
+
+    @property
+    def training_context(self) -> Optional[Context]:
+        if self.training_active:
+            return self.training.context
+        return None
+
+    @property
+    def training_active(self) -> bool:
+        """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'.
+        """
+        return self._training is not None and self._active_training_io is not None
+
+    @property
+    def state(self) -> str:
+        """Returns the current state of the training. Used solely by the node in send_status().
+        """
+        if (not self.training_active) or (self.training.training_state is None):
+            return TrainerState.Idle.value
+        return self.training.training_state
+
+    @property
+    def training_uptime(self) -> Optional[float]:
+        """Lifetime of current Training object. Start time is set during initialization of Training object.
+        """
+        if self.training_active:
+            return time.time() - self.training.start_time
+        return None
+
+    @property
+    def hyperparameters_for_state_sync(self) -> Optional[Dict]:
+        """Used in sync_confusion_matrix and send_status to provide information about the training configuration.
+        """
+        if self._training and self._training.data and self._training.data.hyperparameter:
+            information = {}
+            information['resolution'] = self._training.data.hyperparameter.resolution
+            information['flipRl'] = self._training.data.hyperparameter.flip_rl
+            information['flipUd'] = self._training.data.hyperparameter.flip_ud
+            return information
+        return None
+
     @property
     def general_progress(self) -> Optional[float]:
-        """Represents the progress for different states."""
+        """Represents the progress for different states, should run from 0 to 100 for each state.
+        Note that training_progress and detection_progress need to be implemented in the specific trainer.
+        """
         if not self.training_active:
             return None
 
-        t_state = self.active_training.training_state
+        t_state = self.training.training_state
         if t_state == TrainerState.DataDownloading:
-            return self.data_exchanger.progress
+            return self.node.data_exchanger.progress
         if t_state == TrainerState.TrainingRunning:
             return self.training_progress
         if t_state == TrainerState.Detecting:
@@ -41,45 +134,85 @@ def general_progress(self) -> Optional[float]:
 
         return None
 
-    def init_new_training(self, context: Context, details: Dict) -> None:
-        """Called on `begin_training` event from the Learning Loop.
-        Note that details needs the entries 'categories' and 'training_number'"""
+    # ---------------------------------------- ABSTRACT PROPERTIES ----------------------------------------
 
-        project_folder = create_project_folder(context)
-        if not self.keep_old_trainings:
-            # NOTE: We delete all existing training folders because they are not needed anymore.
-            delete_all_training_folders(project_folder)
-        self._training = generate_training(project_folder, context)
-        self._training.data = TrainingData(categories=Category.from_list(details['categories']))
-        self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details)
-        self._training.training_number = details['training_number']
-        self._training.base_model_id = details['id']
-        self._training.training_state = TrainerState.Initialized
-        self._active_training_io = ActiveTrainingIO(
-            self._training.training_folder, self.loop_communicator, context)
-        logging.info(f'training initialized: {self._training}')
+    @property
+    @abstractmethod
+    def training_progress(self) -> Optional[float]:
+        """Represents the training progress."""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def detection_progress(self) -> Optional[float]:
+        """Represents the detection progress."""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def model_architecture(self) -> Optional[str]:
+        """Returns the architecture name of the model if available"""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def provided_pretrained_models(self) -> List[PretrainedModel]:
+        """Returns the list of provided pretrained models.
+        The names of the models will come back as model_uuid_or_name in the training details.
+        """
+        raise NotImplementedError
+
+    # ---------------------------------------- METHODS ----------------------------------------
+
+    # NOTE: Trainings are started by the Learning Loop via the begin_training event
+    # or by the trainer itself via try_continue_run_if_incomplete.
+    # The trainer will then initialize a new training object and start the training loop.
+    # Initializing a new training object will create the folder structure for the training.
+    # The training loop will then run through the states of the training.
 
     async def try_continue_run_if_incomplete(self) -> bool:
+        """Tries to continue a training if the last training was not finished.
+        """
         if not self.training_active and self.last_training_io.exists():
+            self._init_from_last_training()
             logging.info('found incomplete training, continuing now.')
-            self.init_from_last_training()
-            asyncio.get_event_loop().create_task(self.run())
+            asyncio.get_event_loop().create_task(self._run())
             return True
         return False
 
-    def init_from_last_training(self) -> None:
+    def _init_from_last_training(self) -> None:
+        """Initializes a new training object from the last training saved on disc via last_training_io.
+        """
         self._training = self.last_training_io.load()
         assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder'
         self._active_training_io = ActiveTrainingIO(
-            self._training.training_folder, self.loop_communicator, self._training.context)
+            self._training.training_folder, self.node.loop_communicator, self._training.context)
 
     async def begin_training(self, organization: str, project: str, details: Dict) -> None:
-        """Called on `begin_training` event from the Learning Loop."""
+        """Called on `begin_training` event from the Learning Loop.
+        """
+        self._init_new_training(Context(organization=organization, project=project), details)
+        asyncio.get_event_loop().create_task(self._run())
+
+    def _init_new_training(self, context: Context, details: Dict) -> None:
+        """Called on `begin_training` event from the Learning Loop.
+        Note that details needs the entries 'categories' and 'training_number',
+        but also the hyperparameter entries.
+        """
+        project_folder = create_project_folder(context)
+        if not self._environment_vars.keep_old_trainings:
+            delete_all_training_folders(project_folder)
+        self._training = generate_training(project_folder, context)
+        self._training.set_values_from_data(details)
 
-        self.init_new_training(Context(organization=organization, project=project), details)
-        asyncio.get_event_loop().create_task(self.run())
+        self._active_training_io = ActiveTrainingIO(
+            self._training.training_folder, self.node.loop_communicator, context)
+        logging.info(f'new training initialized: {self._training}')
 
-    async def run(self) -> None:
+    async def _run(self) -> None:
+        """Runs the training loop and handles its cancellation.
+        Started either via `begin_training` or `try_continue_run_if_incomplete`.
+        """
         self.errors.reset_all()
         try:
             self.training_task = asyncio.get_running_loop().create_task(self._training_loop())
@@ -87,46 +220,47 @@ async def run(self) -> None:
         except asyncio.CancelledError:
             if not self.shutdown_event.is_set():
                 logging.info('training task was cancelled but not by shutdown event')
-                self.active_training.training_state = TrainerState.ReadyForCleanup
-                self.last_training_io.save(self.active_training)
-                await self.clear_training()
+                self.training.training_state = TrainerState.ReadyForCleanup
+                self.last_training_io.save(self.training)
+                await self._clear_training()
         except Exception as e:
             logging.exception(f'Error in train: {e}')
 
     # ---------------------------------------- TRAINING STATES ----------------------------------------
 
     async def _training_loop(self) -> None:
-        """asyncio.CancelledError is catched in run"""
-
+        """Cycle through the training states until the training is finished or 
+        an asyncio.CancelledError is raised.
+        """
         assert self.training_active
 
         while self._training is not None:
-            tstate = self.active_training.training_state
-            logging.info(f'STATE LOOP: {tstate}, eerrors: {self.errors.errors}')
+            tstate = self.training.training_state
             await asyncio.sleep(0.6)  # Note: Required for pytests!
+
             if tstate == TrainerState.Initialized:  # -> DataDownloading -> DataDownloaded
-                await self.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare)
+                await self._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare)
             elif tstate == TrainerState.DataDownloaded:  # -> TrainModelDownloading -> TrainModelDownloaded
-                await self.perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model)
+                await self._perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model)
             elif tstate == TrainerState.TrainModelDownloaded:  # -> TrainingRunning -> TrainingFinished
-                await self.perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train)
+                await self._perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train)
             elif tstate == TrainerState.TrainingFinished:  # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced
-                await self.perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix)
+                await self._perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix)
             elif tstate == TrainerState.ConfusionMatrixSynced:  # -> TrainModelUploading -> TrainModelUploaded
-                await self.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model)
+                await self._perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model)
             elif tstate == TrainerState.TrainModelUploaded:  # -> Detecting -> Detected
-                await self.perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections)
+                await self._perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections)
             elif tstate == TrainerState.Detected:  # -> DetectionUploading -> ReadyForCleanup
-                await self.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions)
+                await self._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions)
             elif tstate == TrainerState.ReadyForCleanup:  # -> RESTART or TrainingFinished
-                await self.clear_training()
-                self.may_restart()
+                await self._clear_training()
+                self._may_restart()
 
-    async def perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False):
+    async def _perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False):
         await asyncio.sleep(0.1)
         logging.info(f'Performing state: {state_during}')
-        previous_state = self.active_training.training_state
-        self.active_training.training_state = state_during
+        previous_state = self.training.training_state
+        self.training.training_state = state_during
         await asyncio.sleep(0.1)
         if reset_early:
             self.errors.reset(error_key)
@@ -141,52 +275,60 @@ async def perform_state(self, error_key: str, state_during: TrainerState, state_
         except Exception as e:
             self.errors.set(error_key, str(e))
             logging.exception(f'Error in {state_during} - Exception:')
-            self.active_training.training_state = previous_state
+            self.training.training_state = previous_state
         else:
             if not reset_early:
                 self.errors.reset(error_key)
-            self.active_training.training_state = state_after
-            self.last_training_io.save(self.active_training)
+            self.training.training_state = state_after
+            self.last_training_io.save(self.training)
 
     async def _prepare(self) -> None:
-        self.data_exchanger.set_context(self.active_training.context)
-        downloader = TrainingsDownloader(self.data_exchanger)
-        image_data, skipped_image_count = await downloader.download_training_data(self.active_training.images_folder)
-        assert self.active_training.data is not None, 'training.data must be set'
-        self.active_training.data.image_data = image_data
-        self.active_training.data.skipped_image_count = skipped_image_count
+        """Downloads images to the images_folder and saves annotations to training.data.image_data.
+        """
+        self.node.data_exchanger.set_context(self.training.context)
+        downloader = TrainingsDownloader(self.node.data_exchanger)
+        image_data, skipped_image_count = await downloader.download_training_data(self.training.images_folder)
+        assert self.training.data is not None, 'training.data must be set'
+        self.training.data.image_data = image_data
+        self.training.data.skipped_image_count = skipped_image_count
 
     async def _download_model(self) -> None:
-        model_id = self.active_training.base_model_id
-        assert model_id is not None, 'model_id must be set'
-        if is_valid_uuid4(
-                self.active_training.base_model_id):  # TODO this checks if we continue a training -> make more explicit
-            logging.info('loading model from Learning Loop')
-            logging.info(f'downloading model {model_id} as {self.model_format}')
-            await self.data_exchanger.download_model(self.active_training.training_folder, self.active_training.context, model_id, self.model_format)
-            shutil.move(f'{self.active_training.training_folder}/model.json',
-                        f'{self.active_training.training_folder}/base_model.json')
-        else:
-            logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download')
-
-    async def _sync_confusion_matrix(self):
-        '''NOTE: This stage sets the errors explicitly because it may be used inside the training stage.'''
+        """If training is continued, the model is downloaded from the Learning Loop to the training_folder.
+        The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training.
+        """
+        base_model_uuid = self.training.base_model_uuid_or_name
+
+        # TODO this checks if we continue a training -> make more explicit
+        if not base_model_uuid or not is_valid_uuid4(base_model_uuid):
+            logging.info(f'skipping model download. No base model provided (in form of uuid): {base_model_uuid}')
+            return
+
+        logging.info('loading model from Learning Loop')
+        logging.info(f'downloading model {base_model_uuid} as {self.model_format}')
+        await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_uuid, self.model_format)
+        shutil.move(f'{self.training.training_folder}/model.json',
+                    f'{self.training.training_folder}/base_model.json')
+
+    async def _sync_confusion_matrix(self) -> None:
+        """Synchronizes the confusion matrix with the Learning Loop via the update_training endpoint.
+        NOTE: This stage sets the errors explicitly because it may be used inside the training stage.
+        """
         error_key = 'sync_confusion_matrix'
         try:
-            new_best_model = self.get_new_best_model()
-            if new_best_model and self.active_training.data:
-                new_training = TrainingOut(trainer_id=self.node_uuid,
+            new_best_model = self._get_new_best_training_state()
+            if new_best_model and self.training.data:
+                new_training = TrainingOut(trainer_id=self.node.uuid,
                                            confusion_matrix=new_best_model.confusion_matrix,
-                                           train_image_count=self.active_training.data.train_image_count(),
-                                           test_image_count=self.active_training.data.test_image_count(),
-                                           hyperparameters=self.hyperparameters)
+                                           train_image_count=self.training.data.train_image_count(),
+                                           test_image_count=self.training.data.test_image_count(),
+                                           hyperparameters=self.hyperparameters_for_state_sync)
                 await asyncio.sleep(0.1)  # NOTE needed for tests.
 
-                result = await self.sio_client.call('update_training', (
-                    self.active_training.context.organization, self.active_training.context.project, jsonable_encoder(new_training)))
+                result = await self.node.sio_client.call('update_training', (
+                    self.training.context.organization, self.training.context.project, jsonable_encoder(new_training)))
                 if isinstance(result,  dict) and result['success']:
                     logging.info(f'successfully updated training {asdict(new_training)}')
-                    self.on_model_published(new_best_model)
+                    self._on_metrics_published(new_best_model)
                 else:
                     raise Exception(f'Error for update_training: Response from loop was : {result}')
         except Exception as e:
@@ -195,25 +337,25 @@ async def _sync_confusion_matrix(self):
             raise
         self.errors.reset(error_key)
 
-    async def _upload_model(self) -> None | bool:
-        """Returns True if the training should be cleaned up."""
-
-        new_model_id = await self._upload_model_return_new_model_uuid(self.active_training.context)
-        if new_model_id is None:
-            self.active_training.training_state = TrainerState.ReadyForCleanup
+    async def _upload_model(self) -> None:
+        """Uploads the latest model to the Learning Loop.
+        If no model uuid is returned, the training state is set to ReadyForCleanup and the method returns early.
+        """
+        new_model_uuid = await self._upload_model_return_new_model_uuid(self.training.context)
+        if new_model_uuid is None:
+            self.training.training_state = TrainerState.ReadyForCleanup
             logging.error('could not upload model - maybe training failed.. cleaning up')
-            return True
-        logging.info(f'Successfully uploaded model and received new model id: {new_model_id}')
-        self.active_training.model_id_for_detecting = new_model_id
-        return None
+            return
+        logging.info(f'Successfully uploaded model and received new model id: {new_model_uuid}')
+        self.training.model_uuid_for_detecting = new_model_uuid
 
     async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
         """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file.
         Note that with the latest trainers the conversion to (.wts) is done by the trainer.
         The conversion from .wts to .engine is done by the detector (needs to be done on target hardware).
-        Note that trainer may train with different classes, which is why we send an initial model.json file.
-        """
-        files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files)
+        Note that trainer may train with different classes, which is why we send an initial model.json file."""
+
+        files = await self._get_latest_model_files()
         if files is None:
             return None
 
@@ -225,10 +365,10 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
 
         model_uuid = None
         for file_format in [f for f in files if f not in already_uploaded_formats]:
-            _files = files[file_format] + [self.dump_categories_to_json()]
+            _files = files[file_format] + [self._dump_categories_to_json()]
             assert len([f for f in _files if 'model.json' in f]) == 1, "model.json must be included exactly once"
 
-            model_uuid = await self.data_exchanger.upload_model_get_uuid(context, _files, self.active_training.training_number, file_format)
+            model_uuid = await self.node.data_exchanger.upload_model_get_uuid(context, _files, self.training.training_number, file_format)
             if model_uuid is None:
                 return None
 
@@ -237,91 +377,119 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona
 
         return model_uuid
 
-    def dump_categories_to_json(self) -> str:
+    def _dump_categories_to_json(self) -> str:
+        """Dumps the categories to a json file and returns the path to the file.
+        """
         content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None
         json_path = '/tmp/model.json'
         with open(json_path, 'w') as f:
             json.dump(content, f)
         return json_path
 
-    async def clear_training(self):
+    async def _clear_training(self):
+        """Clears the training data after a training has finished.
+        """
         self.active_training_io.delete_detections()
         self.active_training_io.delete_detection_upload_progress()
         self.active_training_io.delete_detections_upload_file_index()
-        await self.clear_training_data(self.active_training.training_folder)
+        await self._clear_training_data(self.training.training_folder)
         self.last_training_io.delete()
-        # self.training.training_state = TrainingState.TrainingFinished
 
         await self.node.send_status()
         self._training = None
 
     # ---------------------------------------- OTHER METHODS ----------------------------------------
 
-    def may_restart(self) -> None:
-        if self.restart_after_training:
-            logging.info('restarting')
-            sys.exit(0)
-        else:
-            logging.info('not restarting')
-
     async def on_shutdown(self) -> None:
         self.shutdown_event.set()
         await self.stop()
         await self.stop()
 
-    # ---------------------------------------- ABSTRACT PROPERTIES ----------------------------------------
-
-    @property
-    @abstractmethod
-    def training_progress(self) -> Optional[float]:
-        """Represents the training progress."""
-        raise NotImplementedError
-
+    async def stop(self):
+        """Stops the training process by canceling training task.
+        """
+        if not self.training_active:
+            return
+        if self.training_task:
+            logging.info('cancelling training task')
+            if self.training_task.cancel():
+                try:
+                    await self.training_task
+                except asyncio.CancelledError:
+                    pass
+                logging.info('cancelled training task')
+                self._may_restart()
+
+    def _may_restart(self) -> None:
+        """If the environment variable RESTART_AFTER_TRAINING is set, the trainer will restart after a training.
+        """
+        if self._environment_vars.restart_after_training:
+            logging.info('restarting')
+            sys.exit(0)
+        else:
+            logging.info('not restarting')
     # ---------------------------------------- ABSTRACT METHODS ----------------------------------------
 
     @abstractmethod
     async def _train(self) -> None:
-        '''Should be used to execute a training.
+        """Should be used to execute a training.
+        At this point, images are already downloaded to the images_folder and annotations are saved in training.data.image_data.
+        If a training is continued, the model is already downloaded.
         The model should be synchronized with the Learning Loop via self._sync_confusion_matrix() every now and then.
-        asyncio.CancelledError should be catched and re-raised.'''
+        asyncio.CancelledError should be caught and re-raised.
+        """
+        raise NotImplementedError
 
     @abstractmethod
     async def _do_detections(self) -> None:
-        '''Should be used to execute detections.
+        """Should be used to infer detections of all images and save them to drive.
         active_training_io.save_detections(...) should be used to store the detections.
-        asyncio.CancelledError should be catched and re-raised.'''
+        asyncio.CancelledError should be caught and re-raised.
+        """
+        raise NotImplementedError
 
     @abstractmethod
-    def get_new_best_model(self) -> Optional[BasicModel]:
-        '''Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe.
-        Returns None if no new model could be found. Otherwise BasicModel(confusion_matrix, meta_information).
+    def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
+        """Is called frequently by `_sync_confusion_matrix` to check if a new "best" model is available.
+        Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information).
         `confusion_matrix` contains a dict of all classes:
-            - The classes must be identified by their id, not their name.
+            - The classes must be identified by their uuid, not their name.
             - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives).
-        `meta_information` can hold any data which is helpful for self.on_model_published to store weight file etc for later upload via self.get_model_files
-        '''
+        `meta_information` can hold any data which is helpful for self._on_metrics_published to store weight file etc for later upload via self._get_latest_model_files
+        """
+        raise NotImplementedError
 
     @abstractmethod
-    def on_model_published(self, basic_model: BasicModel) -> None:
-        '''Called after a BasicModel has been successfully send to the Learning Loop.
-        The files for this model should be stored.
-        self.get_latest_model_files is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop.
-        In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model
-        '''
+    def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
+        """Called after the metrics corresponding to TrainingStateData have been successfully sent to the Learning Loop.
+        Receives the TrainingStateData object which was returned by self._get_new_best_training_state.
+        If the above function returns None, this function is not called.
+        The respective files for this model should be stored so they can be later uploaded in self._get_latest_model_files.
+        """
+        raise NotImplementedError
 
     @abstractmethod
-    def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]:
-        '''Called when the Learning Loop requests to backup the latest model for the training.
-        Should return a list of file paths which describe the model.
+    async def _get_latest_model_files(self) -> Dict[str, List[str]]:
+        """Called when the Learning Loop requests to backup the latest model for the training.
+        This function is used to __generate and gather__ all files needed for transferring the actual data from the trainer node to the Learning Loop.
+        In the simplest implementation this method just renames the weight file (e.g. stored in TrainingStateData.meta_information) into a file name like latest_published_model
+
+        The function should return a list of file paths which describe the model per format.
         These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.)
         and will be stored in the Learning Loop unter the format of this trainer.
         Note: by convention the weightfile should be named "model.<extension>" where extension is the file format of the weightfile.
         For example "model.pt" for pytorch or "model.weights" for darknet/yolo.
 
         If a trainer can also generate other formats (for example for an detector),
-        a dictionary mapping format -> list of files can be returned.'''
+        a dictionary mapping format -> list of files can be returned.
+
+        If the function returns an empty dict, something went wrong and the model upload will be skipped.
+        """
+        raise NotImplementedError
 
     @abstractmethod
-    async def clear_training_data(self, training_folder: str) -> None:
-        '''Called after a training has finished. Deletes all data that is not needed anymore after a training run. 
-        This can be old weightfiles or any additional files.'''
+    async def _clear_training_data(self, training_folder: str) -> None:
+        """Called after a training has finished. Deletes all data that is not needed anymore after a training run. 
+        This can be old weightfiles or any additional files.
+        """
+        raise NotImplementedError
diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py
index c87124c1..f69cf103 100644
--- a/learning_loop_node/trainer/trainer_node.py
+++ b/learning_loop_node/trainer/trainer_node.py
@@ -9,12 +9,12 @@
 from ..node import Node
 from .io_helpers import LastTrainingIO
 from .rest import backdoor_controls, controls
-from .trainer_logic_abstraction import TrainerLogicAbstraction
+from .trainer_logic_generic import TrainerLogicGeneric
 
 
 class TrainerNode(Node):
 
-    def __init__(self, name: str, trainer_logic: TrainerLogicAbstraction, uuid: Optional[str] = None, use_backdoor_controls: bool = False):
+    def __init__(self, name: str, trainer_logic: TrainerLogicGeneric, uuid: Optional[str] = None, use_backdoor_controls: bool = False):
         super().__init__(name, uuid, 'trainer')
         trainer_logic._node = self
         self.trainer_logic = trainer_logic
@@ -84,7 +84,7 @@ async def send_status(self):
             status.train_image_count = data.train_image_count()
             status.test_image_count = data.test_image_count()
             status.skipped_image_count = data.skipped_image_count
-            status.hyperparameters = self.trainer_logic.hyperparameters
+            status.hyperparameters = self.trainer_logic.hyperparameters_for_state_sync
             status.errors = self.trainer_logic.errors.errors
             status.context = self.trainer_logic.training_context
 
diff --git a/learning_loop_node/trainer/training_syncronizer.py b/learning_loop_node/trainer/training_syncronizer.py
deleted file mode 100644
index 97041bb9..00000000
--- a/learning_loop_node/trainer/training_syncronizer.py
+++ /dev/null
@@ -1,53 +0,0 @@
-
-import asyncio
-import logging
-from dataclasses import asdict
-from typing import TYPE_CHECKING
-
-import socketio
-from dacite import from_dict
-from fastapi.encoders import jsonable_encoder
-
-from ..data_classes import TrainingOut
-from ..data_classes.socket_response import SocketResponse
-
-if TYPE_CHECKING:
-    from .trainer_logic import TrainerLogic
-
-
-class TrainingSyncronizer:
-    def __init__(self, trainer_node_uuid: str, sio_client: socketio.AsyncClient):
-        self.trainer_node_uuid = trainer_node_uuid
-        self.sio_client = sio_client
-
-    async def sync_model(model, current_training):
-        new_training = TrainingOut(
-            trainer_id=self.trainer_node_uuid,
-            confusion_matrix=model.confusion_matrix,
-            train_image_count=current_training.data.train_image_count(),
-            test_image_count=current_training.data.test_image_count(),
-            hyperparameters=trainer.hyperparameters)
-
-        await asyncio.sleep(0.1)  # NOTE needed for tests.
-
-        result = await self.sio_client.call('update_training', (current_training.context.organization, current_training.context.project, jsonable_encoder(new_training)))
-        response = from_dict(data_class=SocketResponse, data=result)
-
-        return response
-
-
-async def try_sync_model(mo):
-    try:
-        model = trainer.get_new_model()
-    except Exception as exc:
-        logging.exception('error while getting new model')
-        raise Exception(f'Could not get new model: {str(exc)}') from exc
-    logging.debug(f'new model {model}')
-
-    if model:
-        response = await sync_model(trainer, trainer_node_uuid, sio_client, model)
-
-        if not response.success:
-            error_msg = f'Error for update_training: Response from loop was : {asdict(response)}'
-            logging.error(error_msg)
-            raise Exception(error_msg)
diff --git a/mock_detector/app_code/tests/test_detector.py b/mock_detector/app_code/tests/test_detector.py
index 3d05d99e..75816212 100644
--- a/mock_detector/app_code/tests/test_detector.py
+++ b/mock_detector/app_code/tests/test_detector.py
@@ -5,6 +5,8 @@
 from learning_loop_node.detector.detector_node import DetectorNode
 from learning_loop_node.globals import GLOBALS
 
+# pylint: disable=unused-argument
+
 
 @pytest.fixture(scope="session")
 def event_loop(request):
diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py
index e88a2de3..d293758e 100644
--- a/mock_trainer/app_code/mock_trainer_logic.py
+++ b/mock_trainer/app_code/mock_trainer_logic.py
@@ -2,11 +2,11 @@
 import asyncio
 import logging
 import time
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional
 
-from learning_loop_node.data_classes import (BasicModel, BoxDetection, CategoryType, ClassificationDetection,
-                                             Detections, ErrorConfiguration, ModelInformation, Point, PointDetection,
-                                             PretrainedModel, SegmentationDetection, Shape)
+from learning_loop_node.data_classes import (BoxDetection, CategoryType, ClassificationDetection, Detections,
+                                             ErrorConfiguration, ModelInformation, Point, PointDetection,
+                                             PretrainedModel, SegmentationDetection, Shape, TrainingStateData)
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
 from . import progress_simulator
@@ -23,28 +23,28 @@ def __init__(self, model_format: str) -> None:
         self.current_iteration = 0
         self.provide_new_model = True
 
-    def can_resume(self) -> bool:
+    def _can_resume(self) -> bool:
         return False
 
-    async def resume(self) -> None:
+    async def _resume(self) -> None:
         pass
 
-    async def start_training(self) -> None:
+    async def _start_training_from_base_model(self) -> None:
         self.current_iteration = 0
         if self.error_configuration.begin_training:
             raise Exception('Could not start training')
-        self.executor.start('while true; do sleep 1; done')
+        await self.executor.start('/bin/bash -c "while true; do sleep 1; done"')
 
-    async def start_training_from_scratch(self, base_model_id: str) -> None:
+    async def _start_training_from_scratch(self) -> None:
         self.current_iteration = 0
-        self.executor.start('while true; do sleep 1; done')
+        await self.executor.start('/bin/bash -c "while true; do sleep 1; done"')
 
-    def get_executor_error_from_log(self) -> Optional[str]:
+    def _get_executor_error_from_log(self) -> Optional[str]:
         if self.error_configuration.crash_training:
             return 'mocked crash'
         return None
 
-    def get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]:
+    async def _get_latest_model_files(self) -> Dict[str, List[str]]:
         if self.error_configuration.save_model:
             raise Exception()
 
@@ -66,37 +66,34 @@ async def _detect(self, model_information: ModelInformation, images:  List[str],
         for image in images:
             image_id = image.split('/')[-1].replace('.jpg', '')
 
-            box_detections = []
-            point_detections = []
-            segmentation_detections = []
-            classification_detections = []
-            det_entry = {
-                'image_id': image_id, 'box_detections': box_detections, 'point_detections': point_detections,
-                'segmentation_detections': segmentation_detections,
-                'classification_detections': classification_detections}
+            box_detections: List[BoxDetection] = []
+            point_detections: List[PointDetection] = []
+            segmentation_detections: List[SegmentationDetection] = []
+            classification_detections: List[ClassificationDetection] = []
+
             for c in model_information.categories:
                 if c.type == CategoryType.Box:
-                    d = BoxDetection(category_name=c.name, x=1, y=2, width=30, height=40,
-                                     model_name=model_information.version, confidence=.99, category_id=c.id)
-                    box_detections.append(d)
+                    bd = BoxDetection(category_name=c.name, x=1, y=2, width=30, height=40,
+                                      model_name=model_information.version, confidence=.99, category_id=c.id)
+                    box_detections.append(bd)
                 elif c.type == CategoryType.Point:
-                    d = PointDetection(category_name=c.name, x=100, y=200,
-                                       model_name=model_information.version, confidence=.97, category_id=c.id)
-                    point_detections.append(d)
+                    pd = PointDetection(category_name=c.name, x=100, y=200,
+                                        model_name=model_information.version, confidence=.97, category_id=c.id)
+                    point_detections.append(pd)
                 elif c.type == CategoryType.Segmentation:
-                    d = SegmentationDetection(category_name=c.name, shape=Shape(points=[Point(x=1, y=2), Point(
+                    sd = SegmentationDetection(category_name=c.name, shape=Shape(points=[Point(x=1, y=2), Point(
                         x=3, y=4)]), model_name=model_information.version, confidence=.96, category_id=c.id)
-                    segmentation_detections.append(d)
+                    segmentation_detections.append(sd)
                 elif c.type == CategoryType.Classification:
-                    d = ClassificationDetection(category_name=c.name, model_name=model_information.version,
-                                                confidence=.95, category_id=c.id)
-                    classification_detections.append(d)
+                    cd = ClassificationDetection(category_name=c.name, model_name=model_information.version,
+                                                 confidence=.95, category_id=c.id)
+                    classification_detections.append(cd)
             detections.append(Detections(box_detections=box_detections, point_detections=point_detections,
                                          segmentation_detections=segmentation_detections,
                                          classification_detections=classification_detections, image_id=image_id))
         return detections
 
-    async def clear_training_data(self, training_folder: str):
+    async def _clear_training_data(self, training_folder: str):
         pass
 
     @property
@@ -111,7 +108,7 @@ def training_progress(self) -> float:
         print(f'prog. is {self.current_iteration} / {self.max_iterations} = {self.current_iteration / self.max_iterations}')
         return self.current_iteration / self.max_iterations
 
-    def get_new_best_model(self) -> Optional[BasicModel]:
+    def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
         logging.warning('get_new_model called')
         if self.error_configuration.get_new_model:
             raise Exception('Could not get new model')
@@ -120,9 +117,9 @@ def get_new_best_model(self) -> Optional[BasicModel]:
         self.current_iteration += 1
         return progress_simulator.increment_time(self, self.latest_known_confusion_matrix)
 
-    def on_model_published(self, basic_model: BasicModel) -> None:
-        assert isinstance(basic_model.confusion_matrix, Dict)
-        self.latest_known_confusion_matrix = basic_model.confusion_matrix
+    def _on_metrics_published(self, training_state_data: TrainingStateData) -> None:
+        assert isinstance(training_state_data.confusion_matrix, Dict)
+        self.latest_known_confusion_matrix = training_state_data.confusion_matrix
 
     @property
     def model_architecture(self) -> str:
diff --git a/mock_trainer/app_code/progress_simulator.py b/mock_trainer/app_code/progress_simulator.py
index 6eaf5ced..76f8be52 100644
--- a/mock_trainer/app_code/progress_simulator.py
+++ b/mock_trainer/app_code/progress_simulator.py
@@ -1,17 +1,17 @@
 import random
 from typing import Dict, Optional
 
-from learning_loop_node.data_classes import BasicModel
+from learning_loop_node.data_classes import TrainingStateData
 from learning_loop_node.trainer.trainer_logic import TrainerLogic
 
 
-def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -> Optional[BasicModel]:
+def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -> Optional[TrainingStateData]:
     if not trainer._training or not trainer._training.data:  # pylint: disable=protected-access
         return None
 
     confusion_matrix = {}
-    assert trainer.active_training.data is not None
-    for category in trainer.active_training.data.categories:
+    assert trainer.training.data is not None
+    for category in trainer.training.data.categories:
         try:
             minimum = latest_known_confusion_matrix[category.id]['tp']
         except Exception:
@@ -23,7 +23,7 @@ def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -
             'fn': max(random.randint(10-maximum, 10-minimum), 2),
         }
 
-    new_model = BasicModel(
+    new_model = TrainingStateData(
         confusion_matrix=confusion_matrix,
     )
 
diff --git a/mock_trainer/app_code/tests/conftest.py b/mock_trainer/app_code/tests/conftest.py
index 86c62dc2..6c23ca7e 100644
--- a/mock_trainer/app_code/tests/conftest.py
+++ b/mock_trainer/app_code/tests/conftest.py
@@ -1,5 +1,4 @@
 import asyncio
-import logging
 import shutil
 
 import pytest
@@ -7,6 +6,8 @@
 from learning_loop_node.globals import GLOBALS
 from learning_loop_node.loop_communication import LoopCommunicator
 
+# pylint: disable=redefined-outer-name
+
 
 @pytest.fixture()
 async def glc():
diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py
index 42fbfe8b..a1e3b471 100644
--- a/mock_trainer/app_code/tests/test_detections.py
+++ b/mock_trainer/app_code/tests/test_detections.py
@@ -12,8 +12,10 @@
 
 from ..mock_trainer_logic import MockTrainerLogic
 
+# pylint: disable=protected-access,redefined-outer-name,unused-argument
 
-async def test_all(setup_test_project1, glc: LoopCommunicator):  # pylint: disable=unused-argument, redefined-outer-name
+
+async def test_all(setup_test_project1, glc: LoopCommunicator):
     assert_image_count(0)
     assert GLOBALS.data_folder == '/tmp/learning_loop_lib_data'
 
@@ -28,14 +30,14 @@ async def test_all(setup_test_project1, glc: LoopCommunicator):  # pylint: disab
                'resolution': 800,
                'flip_rl': False,
                'flip_ud': False}
-    trainer._node = node  # pylint: disable=protected-access
-    trainer.init_new_training(context=context, details=details)
+    trainer._node = node
+    trainer._init_new_training(context=context, details=details)
 
     project_folder = create_project_folder(context)
     training = generate_training(project_folder, context)
-    training.model_id_for_detecting = latest_model_id
-    trainer._training = training  # pylint: disable=protected-access
-    await trainer._do_detections()  # pylint: disable=protected-access
+    training.model_uuid_for_detecting = latest_model_id
+    trainer._training = training
+    await trainer._do_detections()
     detections = trainer.active_training_io.load_detections()
 
     assert_image_count(10)  # TODO This assert fails frequently on Drone
diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py
index f20797b0..a5d397f5 100644
--- a/mock_trainer/app_code/tests/test_mock_trainer.py
+++ b/mock_trainer/app_code/tests/test_mock_trainer.py
@@ -7,16 +7,19 @@
 
 from ..mock_trainer_logic import MockTrainerLogic
 
+# pylint: disable=protected-access
+# pylint: disable=unused-argument
+
 
 async def create_mock_trainer() -> MockTrainerLogic:
     mock_trainer = MockTrainerLogic(model_format='mocked')
-    mock_trainer._executor = Executor(GLOBALS.data_folder)  # pylint: disable=protected-access
+    mock_trainer._executor = Executor(GLOBALS.data_folder)
     return mock_trainer
 
 
 async def test_get_model_files(setup_test_project2):
     mock_trainer = await create_mock_trainer()
-    files = mock_trainer.get_latest_model_files()
+    files = mock_trainer._get_latest_model_files()
 
     assert isinstance(files, Dict)
 
@@ -27,7 +30,7 @@ async def test_get_model_files(setup_test_project2):
 
 async def test_get_new_model(setup_test_project2):
     mock_trainer = await create_mock_trainer()
-    await mock_trainer.start_training()
+    await mock_trainer._start_training_from_base_model()
 
     model = Model(uuid=(str(uuid4())))
     context = Context(organization="", project="")
@@ -37,6 +40,6 @@ async def test_get_new_model(setup_test_project2):
         project_folder="",
         images_folder="",
         training_folder="",)
-    mock_trainer.active_training.data = TrainingData(image_data=[], categories=[])
-    model = mock_trainer.get_new_best_model()
+    mock_trainer.training.data = TrainingData(image_data=[], categories=[])
+    model = mock_trainer._get_new_best_training_state()
     assert model is not None

From a634dfead98ad2cca8aa97e1e10c7ccc01b432b5 Mon Sep 17 00:00:00 2001
From: Niklas Neugebauer <niklas@zauberzeug.com>
Date: Thu, 11 Apr 2024 12:54:12 +0200
Subject: [PATCH 58/62] remove redundant second if statement

---
 learning_loop_node/loop_communication.py | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index 0642c3c1..a2e65124 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -81,9 +81,6 @@ async def retry_on_401(self, func: Callable[..., Awaitable[httpx.Response]], *ar
     async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/api') -> httpx.Response:
         if requires_login:
             await self.ensure_login()
-
-        # retry on 401 if required
-        if requires_login:
             return await self.retry_on_401(self._get, path, api_prefix)
         else:
             return await self._get(path, api_prefix)
@@ -97,14 +94,11 @@ async def _get(self, path: str, api_prefix: str = '/api') -> httpx.Response:
     async def put(self, path: str, files: Optional[List[str]] = None, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response:
         if requires_login:
             await self.ensure_login()
-
-        # retry on 401 if required
-        if requires_login:
             return await self.retry_on_401(self._put, path, api_prefix, **kwargs)
         else:
             return await self._put(path, files, api_prefix, **kwargs)
 
-    async def _put(self, path, files: Optional[List[str]] = None, api_prefix='/api', **kwargs) -> httpx.Response:
+    async def _put(self, path: str, files: Optional[List[str]] = None, api_prefix='/api', **kwargs) -> httpx.Response:
         if files is None:
             return await self.async_client.put(api_prefix+path, **kwargs)
 
@@ -129,9 +123,6 @@ async def _put(self, path, files: Optional[List[str]] = None, api_prefix='/api',
     async def post(self, path: str, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response:
         if requires_login:
             await self.ensure_login()
-
-        # retry on 401 if required
-        if requires_login:
             return await self.retry_on_401(self._post, path, api_prefix, **kwargs)
         else:
             return await self._post(path, api_prefix, **kwargs)
@@ -145,9 +136,6 @@ async def _post(self, path, api_prefix='/api', **kwargs) -> httpx.Response:
     async def delete(self, path: str, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response:
         if requires_login:
             await self.ensure_login()
-
-        # retry on 401 if required
-        if requires_login:
             return await self.retry_on_401(self._delete, path, api_prefix, **kwargs)
         else:
             return await self._delete(path, api_prefix, **kwargs)

From 507d59e8bcfadce3007251bf6d35b1a66675d325 Mon Sep 17 00:00:00 2001
From: Niklas Neugebauer <niklas@zauberzeug.com>
Date: Thu, 11 Apr 2024 12:56:11 +0200
Subject: [PATCH 59/62] shorten most functions to one-liners

---
 learning_loop_node/loop_communication.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index a2e65124..ed322c87 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -86,10 +86,7 @@ async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/
             return await self._get(path, api_prefix)
 
     async def _get(self, path: str, api_prefix: str = '/api') -> httpx.Response:
-
-        response = await self.async_client.get(api_prefix+path)
-
-        return response
+        return await self.async_client.get(api_prefix+path)
 
     async def put(self, path: str, files: Optional[List[str]] = None, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response:
         if requires_login:
@@ -128,10 +125,7 @@ async def post(self, path: str, requires_login: bool = True, api_prefix: str = '
             return await self._post(path, api_prefix, **kwargs)
 
     async def _post(self, path, api_prefix='/api', **kwargs) -> httpx.Response:
-
-        response = await self.async_client.post(api_prefix+path, **kwargs)
-
-        return response
+        return await self.async_client.post(api_prefix+path, **kwargs)
 
     async def delete(self, path: str, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response:
         if requires_login:
@@ -141,7 +135,4 @@ async def delete(self, path: str, requires_login: bool = True, api_prefix: str =
             return await self._delete(path, api_prefix, **kwargs)
 
     async def _delete(self, path, api_prefix='/api', **kwargs) -> httpx.Response:
-
-        response = await self.async_client.delete(api_prefix+path, **kwargs)
-
-        return response
+        return await self.async_client.delete(api_prefix+path, **kwargs)

From b547780ad39fb22e96604fbe2b71f0b2c39e00b1 Mon Sep 17 00:00:00 2001
From: Niklas Neugebauer <niklas@zauberzeug.com>
Date: Thu, 11 Apr 2024 12:58:24 +0200
Subject: [PATCH 60/62] fix missing argument when calling _put; also remove
 default arguments in private methods to make this kind of error more obvious

---
 learning_loop_node/loop_communication.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py
index ed322c87..a643fec4 100644
--- a/learning_loop_node/loop_communication.py
+++ b/learning_loop_node/loop_communication.py
@@ -85,17 +85,17 @@ async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/
         else:
             return await self._get(path, api_prefix)
 
-    async def _get(self, path: str, api_prefix: str = '/api') -> httpx.Response:
+    async def _get(self, path: str, api_prefix: str) -> httpx.Response:
         return await self.async_client.get(api_prefix+path)
 
     async def put(self, path: str, files: Optional[List[str]] = None, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response:
         if requires_login:
             await self.ensure_login()
-            return await self.retry_on_401(self._put, path, api_prefix, **kwargs)
+            return await self.retry_on_401(self._put, path, files, api_prefix, **kwargs)
         else:
             return await self._put(path, files, api_prefix, **kwargs)
 
-    async def _put(self, path: str, files: Optional[List[str]] = None, api_prefix='/api', **kwargs) -> httpx.Response:
+    async def _put(self, path: str, files: Optional[List[str]], api_prefix: str, **kwargs) -> httpx.Response:
         if files is None:
             return await self.async_client.put(api_prefix+path, **kwargs)
 
@@ -134,5 +134,5 @@ async def delete(self, path: str, requires_login: bool = True, api_prefix: str =
         else:
             return await self._delete(path, api_prefix, **kwargs)
 
-    async def _delete(self, path, api_prefix='/api', **kwargs) -> httpx.Response:
+    async def _delete(self, path, api_prefix, **kwargs) -> httpx.Response:
         return await self.async_client.delete(api_prefix+path, **kwargs)

From eaab2bccb32bf30495a6fc7131d6c7e27f3fc780 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Mon, 15 Apr 2024 15:36:36 +0200
Subject: [PATCH 61/62] try to reduce flakiness of test

---
 learning_loop_node/tests/test_executor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py
index 1842f71e..1dbae97c 100644
--- a/learning_loop_node/tests/test_executor.py
+++ b/learning_loop_node/tests/test_executor.py
@@ -37,7 +37,7 @@ async def test_executor_lifecycle():
     assert executor.is_running()
     assert_process_is_running('some_executable.sh')
 
-    sleep(1)
+    sleep(5)
     assert 'some output' in executor.get_log()
 
     await executor.stop_and_wait()

From 11d0b66eaaa3baadc1d6fdcc6330f5fd3d9ddd33 Mon Sep 17 00:00:00 2001
From: "Dr. Dennis Wittich" <denniswittich@hotmail.de>
Date: Mon, 15 Apr 2024 17:45:33 +0200
Subject: [PATCH 62/62] fix test of mock_trainer

---
 mock_trainer/app_code/tests/test_mock_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py
index a5d397f5..e2b518b0 100644
--- a/mock_trainer/app_code/tests/test_mock_trainer.py
+++ b/mock_trainer/app_code/tests/test_mock_trainer.py
@@ -19,7 +19,7 @@ async def create_mock_trainer() -> MockTrainerLogic:
 
 async def test_get_model_files(setup_test_project2):
     mock_trainer = await create_mock_trainer()
-    files = mock_trainer._get_latest_model_files()
+    files = await mock_trainer._get_latest_model_files()
 
     assert isinstance(files, Dict)