From ff4721870b5441b7e23d793ddc86040bea6f75ca Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Wed, 6 Mar 2024 15:13:18 +0100 Subject: [PATCH 01/62] Start cleanup process. Remove converter code and do major cleanup of data_exchanger. No intended API changes so far --- learning_loop_node/converter/__init__.py | 0 .../converter/converter_logic.py | 68 ------- .../converter/converter_node.py | 125 ------------ .../converter/tests/test_converter.py | 55 ----- learning_loop_node/data_exchanger.py | 189 ++++++------------ .../detector/tests/testing_detector.py | 9 +- learning_loop_node/helpers/misc.py | 36 ++++ learning_loop_node/node.py | 33 ++- learning_loop_node/tests/test_downloader.py | 11 +- learning_loop_node/trainer/downloader.py | 2 +- learning_loop_node/trainer/trainer_logic.py | 16 +- 11 files changed, 124 insertions(+), 420 deletions(-) delete mode 100644 learning_loop_node/converter/__init__.py delete mode 100644 learning_loop_node/converter/converter_logic.py delete mode 100644 learning_loop_node/converter/converter_node.py delete mode 100644 learning_loop_node/converter/tests/test_converter.py diff --git a/learning_loop_node/converter/__init__.py b/learning_loop_node/converter/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/learning_loop_node/converter/converter_logic.py b/learning_loop_node/converter/converter_logic.py deleted file mode 100644 index cef82eff..00000000 --- a/learning_loop_node/converter/converter_logic.py +++ /dev/null @@ -1,68 +0,0 @@ -import json -import os -import shutil -from abc import abstractmethod -from typing import List, Optional - -from ..data_classes import ModelInformation -from ..node import Node - - -class ConverterLogic(): - - def __init__( - self, source_format: str, target_format: str): - self.source_format = source_format - self.target_format = target_format - self._node: Optional[Node] = None - self.model_folder: Optional[str] = None - - def init(self, node: Node) -> None: - 
self._node = node - - @property - def node(self) -> Node: - if self._node is None: - raise Exception('ConverterLogic not initialized') - return self._node - - async def convert(self, model_information: ModelInformation) -> None: - project_folder = Node.create_project_folder(model_information.context) - - self.model_folder = ConverterLogic.create_model_folder(project_folder, model_information.id) - await self.node.data_exchanger.download_model(self.model_folder, - model_information.context, - model_information.id, - self.source_format) - - with open(f'{self.model_folder}/model.json', 'r') as f: - content = json.load(f) - if 'resolution' in content: - model_information.resolution = content['resolution'] - - await self._convert(model_information) - - async def upload_model(self, context, model_id: str) -> None: - files = self.get_converted_files(model_id) - await self.node.data_exchanger.upload_model(context, files, model_id, self.target_format) - - @abstractmethod - async def _convert(self, model_information: ModelInformation) -> None: - """Converts the model in self.model_folder to the target format.""" - - @abstractmethod - def get_converted_files(self, model_id) -> List[str]: - """Returns a list of files that should be uploaded to the server.""" - - @staticmethod - def create_convert_folder(project_folder: str) -> str: - image_folder = f'{project_folder}/images' - os.makedirs(image_folder, exist_ok=True) - return image_folder - - @staticmethod - def create_model_folder(project_folder: str, model_id: str) -> str: - model_folder = f'{project_folder}/{model_id}' - shutil.rmtree(model_folder, ignore_errors=True) # cleanup - os.makedirs(model_folder, exist_ok=True) - return model_folder diff --git a/learning_loop_node/converter/converter_node.py b/learning_loop_node/converter/converter_node.py deleted file mode 100644 index f23dd26e..00000000 --- a/learning_loop_node/converter/converter_node.py +++ /dev/null @@ -1,125 +0,0 @@ -import logging -from dataclasses import 
asdict -from http import HTTPStatus -from typing import List, Optional - -from dacite import from_dict -from fastapi.encoders import jsonable_encoder -from fastapi_utils.tasks import repeat_every -from socketio import AsyncClient - -from ..data_classes import Category, ModelInformation, NodeState -from ..node import Node -from .converter_logic import ConverterLogic - - -class ConverterNode(Node): - converter: ConverterLogic - skip_check_state: bool = False - bad_model_ids: List[str] = [] - - def __init__(self, name: str, converter: ConverterLogic, uuid: Optional[str] = None): - super().__init__(name, uuid) - self.converter = converter - converter.init(self) - - @self.on_event("startup") - @repeat_every(seconds=60, raise_exceptions=True, wait_first=False) - async def check_state(): - if not self.skip_check_state: - try: - await self.check_state() - except Exception: - logging.error('could not check state. Is loop reachable?') - - async def convert_model(self, model_information: ModelInformation): - if model_information.id in self.bad_model_ids: - logging.info( - f'skipping bad model model {model_information.id} for {model_information.context.organization}/{model_information.context.project}.') - return - try: - logging.info( - f'converting model {jsonable_encoder(asdict(model_information))}') - await self.converter.convert(model_information) - logging.info('uploading model ') - await self.converter.upload_model(model_information.context, model_information.id) - except Exception as e: - self.bad_model_ids.append(model_information.id) - logging.error( - f'could not convert model {model_information.id} for {model_information.context.organization}/{model_information.context.project}. 
Details: {str(e)}.') - - async def check_state(self): - logging.info(f'checking state: {self.status.state}') - - if self.status.state == NodeState.Running: - return - self.status.state = NodeState.Running - try: - await self.convert_models() - except Exception as exc: - logging.error(str(exc)) - - self.status.state = NodeState.Idle - - async def convert_models(self) -> None: - try: - response = await self.loop_communicator.get('/projects') - assert response.status_code == 200, f'Assert statuscode 200, but was {response.status_code}.' - content = response.json() - projects = content['projects'] - - for project in projects: - organization_id = project['organization_id'] - project_id = project['project_id'] - - response = await self.loop_communicator.get(f'{project["resource"]}') - if response.status_code != HTTPStatus.OK: - logging.error(f'got bad response for {response.url}: {str(response.status_code)}') - continue - - project_categories = [from_dict(data_class=Category, data=c) for c in response.json()['categories']] - - path = f'{project["resource"]}/models' - models_response = await self.loop_communicator.get(path) - assert models_response.status_code == 200 - content = models_response.json() - models = content['models'] - - for model in models: - if (model['version'] - and self.converter.source_format in model['formats'] - and self.converter.target_format not in model['formats'] - ): - # if self.converter.source_format in model['formats'] and project_id == 'drawingbot' and model['version'] == "6.0": - model_information = ModelInformation( - host=self.loop_communicator.base_url, - organization=organization_id, - project=project_id, - id=model['id'], - categories=project_categories, - version=model['version'], - ) - await self.convert_model(model_information) - except Exception: - logging.exception('could not convert models') - - async def send_status(self): - pass - - async def on_startup(self): - pass - - async def on_shutdown(self): - pass - - async def 
on_repeat(self): - pass - - def register_sio_events(self, sio_client: AsyncClient): - pass - - async def get_state(self): - return NodeState.Idle # NOTE unused for this node type - - def get_node_type(self): - return 'converter' diff --git a/learning_loop_node/converter/tests/test_converter.py b/learning_loop_node/converter/tests/test_converter.py deleted file mode 100644 index 7328806f..00000000 --- a/learning_loop_node/converter/tests/test_converter.py +++ /dev/null @@ -1,55 +0,0 @@ -import logging -from typing import List - -import pytest - -from learning_loop_node.converter.converter_logic import ConverterLogic -from learning_loop_node.converter.converter_node import ConverterNode -from learning_loop_node.data_classes import ModelInformation -from learning_loop_node.loop_communication import LoopCommunicator -from learning_loop_node.tests import test_helper - - -class TestConverter(ConverterLogic): - __test__ = False # hint for pytest - - def __init__(self, source_format: str, target_format: str, models: List[ModelInformation]): - super().__init__(source_format, target_format) - self.models = models - - async def _convert(self, model_information: ModelInformation) -> None: - self.models.append(model_information) - - def get_converted_files(self, model_id) -> List[str]: - return [] # test: test_meta_information fails because model cannot be uploaded - - -@pytest.mark.asyncio -@pytest.fixture() -async def setup_converter_test_project(glc: LoopCommunicator): - await glc.delete("/zauberzeug/projects/pytest_conv?keep_images=true") - project_configuration = { - 'project_name': 'pytest_conv', 'box_categories': 1, 'point_categories': 1, 'inbox': 0, 'annotate': 0, 'review': 0, - 'complete': 0, 'image_style': 'plain', 'thumbs': False, 'trainings': 1} - r = await glc.post("/zauberzeug/projects/generator", json=project_configuration) - assert r.status_code == 200 - yield - await glc.delete("/zauberzeug/projects/pytest?keep_images=true") - - -# pylint: 
disable=redefined-outer-name, unused-argument -@pytest.mark.asyncio -async def test_meta_information(setup_converter_test_project): - model_id = await test_helper.get_latest_model_id(project='pytest_conv') - - converter = TestConverter(source_format='mocked', target_format='test', models=[]) - node = ConverterNode(name='test', converter=converter) - await node.convert_models() - - pytest_project_model = [m for m in converter.models if m.id == model_id][0] - - categories = pytest_project_model.categories - assert len(categories) == 2 - category_types = [category.type for category in categories] - assert 'box' in category_types - assert 'point' in category_types diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index 23f19976..361f66cf 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -2,23 +2,18 @@ import logging import os import shutil -import time import zipfile from glob import glob from http import HTTPStatus from io import BytesIO -from time import perf_counter from typing import Dict, List, Optional import aiofiles -from tqdm.asyncio import tqdm from .data_classes import Context -from .helpers.misc import create_resource_paths, create_task +from .helpers.misc import create_resource_paths, create_task, is_valid_image from .loop_communication import LoopCommunicator -check_jpeg = shutil.which('jpeginfo') is not None - class DownloadError(Exception): @@ -30,120 +25,81 @@ def __init__(self, cause: str, *args: object) -> None: class DataExchanger(): def __init__(self, context: Optional[Context], loop_communicator: LoopCommunicator): - self.context = context + self.set_context(context) + self.progress = 0.0 self.loop_communicator = loop_communicator + + self.check_jpeg = shutil.which('jpeginfo') is not None + if self.check_jpeg: + logging.info('Detected command line tool "jpeginfo". Images will be checked for validity') + else: + logging.error('Missing command line tool "jpeginfo". 
We cannot check for validity of images.') + + def set_context(self, context: Optional[Context]) -> None: + self._context = context self.progress = 0.0 - def set_context(self, context: Context): - self.context = context + @property + def context(self) -> Context: + assert self._context, 'DataExchanger: Context was not set yet.. call set_context() first.' + return self._context - async def fetch_image_ids(self, query_params: Optional[str] = '') -> List[str]: - if self.context is None: - logging.warning('context was not set yet') - return [] + # ---------------------------- END OF INIT ---------------------------- + + async def fetch_image_uuids(self, query_params: Optional[str] = '') -> List[str]: + """Fetch image uuids from the learning loop data endpoint.""" + logging.info(f'Fetching image uuids for {self.context.organization}/{self.context.project}..') response = await self.loop_communicator.get(f'/{self.context.organization}/projects/{self.context.project}/data?{query_params}') assert response.status_code == 200, response return (response.json())['image_ids'] - async def download_images_data(self, ids: List[str]) -> List[Dict]: - '''Download image annotations etc.''' - if self.context is None: - logging.warning('context was not set yet') - return [] - - return await self._download_images_data(self.context.organization, self.context.project, ids) - - async def download_images(self, image_ids: List[str], image_folder: str) -> None: - '''Download images. 
Will skip existing images''' - if self.context is None: - logging.warning('context was not set yet') - return - - new_image_ids = await asyncio.get_event_loop().run_in_executor(None, DataExchanger.filter_existing_images, image_ids, image_folder) - paths, ids = create_resource_paths(self.context.organization, self.context.project, new_image_ids) - await self._download_images(paths, ids, image_folder) - - @staticmethod - async def delete_corrupt_images(image_folder: str) -> None: - logging.info('deleting corrupt images') - n_deleted = 0 - for image in glob(f'{image_folder}/*.jpg'): - if not await DataExchanger.is_valid_image(image): - logging.debug(f' deleting image {image}') - os.remove(image) - n_deleted += 1 - - logging.info(f'deleted {n_deleted} images') - - @staticmethod - def filter_existing_images(all_image_ids, image_folder) -> List[str]: - logging.info(f'### Going to filter {len(all_image_ids)} images ids') - start = perf_counter() - ids = [os.path.splitext(os.path.basename(image))[0] - for image in glob(f'{image_folder}/*.jpg')] - logging.info(f'found {len(ids)} images on disc') - result = [id for id in all_image_ids if id not in ids] - end = perf_counter() - logging.info(f'calculated {len(result)} new image ids, which took {end-start:0.2f} seconds') - return result - - def jepeg_check_info(self): - if check_jpeg: - logging.info('Detected command line tool "jpeginfo". Images will be checked for validity') - else: - logging.error('Missing command line tool "jpeginfo". We can not check for validity of images.') + async def download_images_data(self, image_uuids: List[str], chunk_size: int = 100) -> List[Dict]: + """Download image annotations, tags, set and other information for the given image uuids.""" + logging.info(f'Fetching annotations, tags, sets, etc. 
for {len(image_uuids)} images..') - async def _download_images_data(self, organization: str, project: str, image_ids: List[str], chunk_size: int = 100) -> List[Dict]: - logging.info('fetching annotations and other image data') - num_image_ids = len(image_ids) - self.jepeg_check_info() - images_data = [] + num_image_ids = len(image_uuids) if num_image_ids == 0: logging.info('got empty list. No images were downloaded') - return images_data - starttime = time.time() + return [] + progress_factor = 0.5 / num_image_ids # 50% of progress is for downloading data - for i in tqdm(range(0, num_image_ids, chunk_size), position=0, leave=True): + images_data: List[Dict] = [] + for i in range(0, num_image_ids, chunk_size): self.progress = i * progress_factor - chunk_ids = image_ids[i:i+chunk_size] - response = await self.loop_communicator.get(f'/{organization}/projects/{project}/images?ids={",".join(chunk_ids)}') + chunk_ids = image_uuids[i:i+chunk_size] + response = await self.loop_communicator.get(f'/{self.context.organization}/projects/{self.context.project}/images?ids={",".join(chunk_ids)}') if response.status_code != 200: - logging.error( - f'Error during downloading list of images. Statuscode is {response.status_code}') + logging.error(f'Error {response.status_code} during downloading image data. Continue with next batch..') continue images_data += response.json()['images'] - total_time = round(time.time() - starttime, 1) - if images_data: - per100 = total_time / len(images_data) * 100 - logging.debug(f'[+] Performance: {total_time} sec total. Per 100 : {per100:.1f} sec') - else: - logging.debug(f'[+] Performance: {total_time} sec total.') + return images_data - async def _download_images(self, paths: List[str], image_ids: List[str], image_folder: str, chunk_size: int = 10) -> None: - num_image_ids = len(image_ids) - if num_image_ids == 0: - logging.debug('got empty list. 
No images were downloaded') + async def download_images(self, image_uuids: List[str], image_folder: str, chunk_size: int = 10) -> None: + """Downloads images (actual image data). Will skip existing images""" + logging.info(f'Downloading {len(image_uuids)} images (actual image data).. skipping existing images.') + if not image_uuids: return - logging.info('fetching image files') - starttime = time.time() + + existing_uuids = {os.path.splitext(os.path.basename(image))[0] for image in glob(f'{image_folder}/*.jpg')} + new_image_uuids = [id for id in image_uuids if id not in existing_uuids] + + paths, ids = create_resource_paths(self.context.organization, self.context.project, new_image_uuids) + num_image_ids = len(image_uuids) os.makedirs(image_folder, exist_ok=True) progress_factor = 0.5 / num_image_ids # second 50% of progress is for downloading images - for i in tqdm(range(0, num_image_ids, chunk_size), position=0, leave=True): + for i in range(0, num_image_ids, chunk_size): self.progress = 0.5 + i * progress_factor chunk_paths = paths[i:i+chunk_size] - chunk_ids = image_ids[i:i+chunk_size] + chunk_ids = image_uuids[i:i+chunk_size] tasks = [] for j, chunk_j in enumerate(chunk_paths): - tasks.append(create_task(self.download_one_image(chunk_j, chunk_ids[j], image_folder))) + tasks.append(create_task(self._download_one_image(chunk_j, chunk_ids[j], image_folder))) await asyncio.gather(*tasks) - total_time = round(time.time() - starttime, 1) - per100 = total_time / (i + len(tasks)) * 100 - logging.debug(f'[+] Performance (image files): {total_time} sec total. 
Per 100 : {per100:.1f}') - async def download_one_image(self, path: str, image_id: str, image_folder: str) -> None: + async def _download_one_image(self, path: str, image_id: str, image_folder: str) -> None: response = await self.loop_communicator.get(path) if response.status_code != HTTPStatus.OK: logging.error(f'bad status code {response.status_code} for {path}') @@ -151,41 +107,25 @@ async def download_one_image(self, path: str, image_id: str, image_folder: str) filename = f'{image_folder}/{image_id}.jpg' async with aiofiles.open(filename, 'wb') as f: await f.write(response.content) - if not await self.is_valid_image(filename): + if not await is_valid_image(filename, self.check_jpeg): os.remove(filename) - @staticmethod - async def is_valid_image(filename: str) -> bool: - if not os.path.isfile(filename) or os.path.getsize(filename) == 0: - return False - if not check_jpeg: - return True - - info = await asyncio.create_subprocess_shell( - f'jpeginfo -c {filename}', - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE) - out, _ = await info.communicate() - return "OK" in out.decode() - async def download_model(self, target_folder: str, context: Context, model_id: str, model_format: str) -> List[str]: + """Downloads a model and returns the paths of the downloaded files.""" + logging.info(f'Downloading model {model_id} to {target_folder}..') + path = f'/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' response = await self.loop_communicator.get(path, requires_login=False) if response.status_code != 200: content = response.json() - logging.error( - f'could not download {self.loop_communicator.base_url}/{path}: {response.status_code}, content: {content}') + logging.error(f'could not download loop/{path}: {response.status_code}, content: {content}') raise DownloadError(content['detail']) try: provided_filename = response.headers.get( "Content-Disposition").split("filename=")[1].strip('"') content = response.content 
except: - logging.error(f'Error during downloading model {path}:') - try: - logging.exception(response.json()) - except Exception: - pass + logging.exception(f'Error during downloading model {path}:') raise # unzip and place downloaded model @@ -194,29 +134,20 @@ async def download_model(self, target_folder: str, context: Context, model_id: s with zipfile.ZipFile(BytesIO(content), 'r') as zip_: zip_.extractall(tmp_path) - logging.info(f'---- downloaded model {model_id} to {tmp_path}.') - created_files = [] - files = glob(f'{tmp_path}/**/*', recursive=True) - for file in files: + for file in glob(f'{tmp_path}/**/*', recursive=True): new_file = shutil.move(file, target_folder) - logging.info(f'moved model file {os.path.basename(file)} to {new_file}.') created_files.append(new_file) - return created_files - async def upload_model(self, context: Context, files: List[str], model_id: str, mformat: str) -> None: - response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/models/{model_id}/{mformat}/file', files=files) - if response.status_code != 200: - msg = f'---- could not upload model with id {model_id} and format {mformat}. Details: {response.text}' - raise Exception(msg) - logging.info(f'---- uploaded model with id {model_id} and format {mformat}.') + logging.info(f'---- downloaded model {model_id}/{model_format} to {tmp_path}. Moved to {target_folder}.') + return created_files - async def upload_model_for_training(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]: - """Returns the new model uuid to use for detection.""" + async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]: + """Used by the trainers. 
Function returns the new model uuid to use for detection.""" response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/trainings/{training_number}/models/latest/{mformat}/file', files=files) if response.status_code != 200: - msg = f'---- could not upload model for training {training_number} and format {mformat}. Details: {response.text}' - logging.error(msg) + logging.error( + f'---- could not upload model for training {training_number} and format {mformat}. Details: {response.text}') response.raise_for_status() return None else: diff --git a/learning_loop_node/detector/tests/testing_detector.py b/learning_loop_node/detector/tests/testing_detector.py index ed710824..95dd1300 100644 --- a/learning_loop_node/detector/tests/testing_detector.py +++ b/learning_loop_node/detector/tests/testing_detector.py @@ -4,7 +4,7 @@ from learning_loop_node import DetectorLogic from learning_loop_node.conftest import get_dummy_detections -from learning_loop_node.data_classes import Category, Detections, ModelInformation +from learning_loop_node.data_classes import Detections class TestingDetectorLogic(DetectorLogic): @@ -20,10 +20,3 @@ def init(self) -> None: def evaluate(self, image: np.ndarray) -> Detections: logging.info('evaluating') return self.det_to_return - - # return Detections( - # box_detections=[BoxDetection(category_name='some_category_name', x=1, y=2, height=3, width=4, - # model_name='some_model', confidence=.42, category_id='some_id')], - # point_detections=[PointDetection(category_name='some_category_name_2', x=10, y=12, - # model_name='some_model', confidence=.42, category_id='some_id')] - # ) diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py index 3eda99c5..b7e7d18f 100644 --- a/learning_loop_node/helpers/misc.py +++ b/learning_loop_node/helpers/misc.py @@ -4,7 +4,9 @@ import logging import os from dataclasses import asdict +from glob import glob from typing import Any, Coroutine, List, 
Optional, Tuple, TypeVar +from uuid import UUID import pynvml @@ -56,6 +58,32 @@ def get_free_memory_mb() -> float: # TODO check if this is used return free +async def is_valid_image(filename: str, check_jpeg: bool) -> bool: + if not os.path.isfile(filename) or os.path.getsize(filename) == 0: + return False + if not check_jpeg: + return True + + info = await asyncio.create_subprocess_shell(f'jpeginfo -c {filename}', + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE) + out, _ = await info.communicate() + return "OK" in out.decode() + + +@staticmethod +async def delete_corrupt_images(image_folder: str, check_jpeg: bool = False) -> None: + logging.info('deleting corrupt images') + n_deleted = 0 + for image in glob(f'{image_folder}/*.jpg'): + if not await is_valid_image(image, check_jpeg): + logging.debug(f' deleting image {image}') + os.remove(image) + n_deleted += 1 + + logging.info(f'deleted {n_deleted} images') + + def create_resource_paths(organization_name: str, project_name: str, image_ids: List[str]) -> Tuple[List[str], List[str]]: # TODO: experimental: return [f'/{organization_name}/projects/{project_name}/images/{id}/main' for id in image_ids], image_ids @@ -107,3 +135,11 @@ async def wrapper_ensure_socket_response(*args, **kwargs): return asdict(SocketResponse.for_failure(str(e))) return wrapper_ensure_socket_response + + +def is_valid_uuid4(val): + try: + _ = UUID(str(val)).version + return True + except ValueError: + return False diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py index ffce72f7..00e0313b 100644 --- a/learning_loop_node/node.py +++ b/learning_loop_node/node.py @@ -24,34 +24,38 @@ class Node(FastAPI): - def __init__(self, name: str, uuid: Optional[str] = None): + def __init__(self, name: str, uuid: Optional[str] = None, needs_login: bool = True): """Base class for all nodes. A node is a process that communicates with the zauberzeug learning loop. Args: name (str): The name of the node. 
This name is used to generate a uuid. uuid (Optional[str]): The uuid of the node. If None, a uuid is generated based on the name and stored in f'{GLOBALS.data_folder}/uuids.json'. - From the second run, the uuid is recovered based on the name of the node. Defaults to None. + From the second run, the uuid is recovered based on the name of the node. + needs_login (bool): If True, the node will try to login to the learning loop. """ super().__init__() log_conf.init() + self.name = name + self.uuid = uuid or self.read_or_create_uuid(self.name) + self.needs_login = needs_login + self.log = logging.getLogger() self.loop_communicator = LoopCommunicator() self.data_exchanger = DataExchanger(None, self.loop_communicator) - host = environment_reader.host(default='learning-loop.ai') - self.ws_url = f'ws{"s" if "learning-loop.ai" in host else ""}://' + host + loop_url = environment_reader.host(default='learning-loop.ai') + self.websocket_url = f'ws{"s" if "learning-loop.ai" in loop_url else ""}://' + loop_url - self.name = name - self.uuid = self.read_or_create_uuid(self.name) if uuid is None else uuid self.startup_time = datetime.now() self._sio_client: Optional[AsyncClient] = None self.status = NodeStatus(id=self.uuid, name=self.name) - # NOTE this is can be set to False for Nodes which do not need to authenticate with the backend (like the DetectorNode) - self.needs_login = True - self._setup_sio_headers() + + self.sio_headers = {'organization': self.loop_communicator.organization, + 'project': self.loop_communicator.project, + 'nodeType': self.get_node_type()} self._register_lifecycle_events() @property @@ -82,11 +86,6 @@ def read_or_create_uuid(self, identifier: str) -> str: json.dump(uuids, f) return uuid - def _setup_sio_headers(self) -> None: - self.sio_headers = {'organization': self.loop_communicator.organization, - 'project': self.loop_communicator.project, - 'nodeType': self.get_node_type()} - # --------------------------------------------------- APPLICATION 
LIFECYCLE --------------------------------------------------- def _register_lifecycle_events(self): @@ -176,14 +175,14 @@ async def connect_sio(self): except Exception: pass - self.log.info(f'(re)connecting to Learning Loop at {self.ws_url}') + self.log.info(f'(re)connecting to Learning Loop at {self.websocket_url}') try: - await self.sio_client.connect(f"{self.ws_url}", headers=self.sio_headers, socketio_path="/ws/socket.io") + await self.sio_client.connect(f"{self.websocket_url}", headers=self.sio_headers, socketio_path="/ws/socket.io") self.log.info('connected to Learning Loop') except socketio.exceptions.ConnectionError: # type: ignore self.log.warning('connection error') except Exception: - self.log.exception(f'error while connecting to "{self.ws_url}". Exception:') + self.log.exception(f'error while connecting to "{self.websocket_url}". Exception:') async def _update_send_state(self, state: NodeState): self.status.state = state diff --git a/learning_loop_node/tests/test_downloader.py b/learning_loop_node/tests/test_downloader.py index bf2e10e8..7b2143d1 100644 --- a/learning_loop_node/tests/test_downloader.py +++ b/learning_loop_node/tests/test_downloader.py @@ -1,3 +1,4 @@ +from ..helpers.misc import delete_corrupt_images import os import shutil @@ -33,26 +34,26 @@ async def test_download_model(data_exchanger: DataExchanger): # pylint: disable=redefined-outer-name async def test_fetching_image_ids(data_exchanger: DataExchanger): - ids = await data_exchanger.fetch_image_ids() + ids = await data_exchanger.fetch_image_uuids() assert len(ids) == 3 async def test_download_images(data_exchanger: DataExchanger): _, image_folder, _ = test_helper.create_needed_folders() - image_ids = await data_exchanger.fetch_image_ids() + image_ids = await data_exchanger.fetch_image_uuids() await data_exchanger.download_images(image_ids, image_folder) files = test_helper.get_files_in_folder(GLOBALS.data_folder) assert len(files) == 3 async def 
test_download_training_data(data_exchanger: DataExchanger): - image_ids = await data_exchanger.fetch_image_ids() + image_ids = await data_exchanger.fetch_image_uuids() image_data = await data_exchanger.download_images_data(image_ids) assert len(image_data) == 3 async def test_removal_of_corrupted_images(data_exchanger: DataExchanger): - image_ids = await data_exchanger.fetch_image_ids() + image_ids = await data_exchanger.fetch_image_uuids() shutil.rmtree('/tmp/img_folder', ignore_errors=True) os.makedirs('/tmp/img_folder', exist_ok=True) @@ -65,7 +66,7 @@ async def test_removal_of_corrupted_images(data_exchanger: DataExchanger): with open('/tmp/img_folder/c1.jpg', 'w') as f: f.write('I am no image') - await data_exchanger.delete_corrupt_images('/tmp/img_folder') + await delete_corrupt_images('/tmp/img_folder', True) assert len(os.listdir('/tmp/img_folder')) == num_images if check_jpeg else num_images - 1 shutil.rmtree('/tmp/img_folder', ignore_errors=True) diff --git a/learning_loop_node/trainer/downloader.py b/learning_loop_node/trainer/downloader.py index 94cd0516..7deb59cf 100644 --- a/learning_loop_node/trainer/downloader.py +++ b/learning_loop_node/trainer/downloader.py @@ -12,7 +12,7 @@ def __init__(self, data_exchanger: DataExchanger, data_query_params: Optional[st self.data_exchanger = data_exchanger async def download_training_data(self, image_folder: str) -> Tuple[List[Dict], int]: - image_ids = await self.data_exchanger.fetch_image_ids(query_params=self.data_query_params) + image_ids = await self.data_exchanger.fetch_image_uuids(query_params=self.data_query_params) image_data, skipped_image_count = await self.download_images_and_annotations(image_ids, image_folder) return (image_data, skipped_image_count) diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 1b11b4e3..06eac0aa 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -19,7 +19,7 @@ 
from ..data_classes import (BasicModel, Category, Context, Detections, Errors, Hyperparameter, ModelInformation, PretrainedModel, Training, TrainingData, TrainingError, TrainingState) -from ..helpers.misc import create_image_folder +from ..helpers.misc import create_image_folder, delete_corrupt_images, is_valid_uuid4 from ..node import Node from . import training_syncronizer from .downloader import TrainingsDownloader @@ -30,14 +30,6 @@ from .trainer_node import TrainerNode -def is_valid_uuid4(val): - try: - _ = UUID(str(val)).version - return True - except ValueError: - return False - - class TrainerLogic(): def __init__(self, model_format: str) -> None: @@ -371,7 +363,7 @@ async def _upload_model_return_new_id(self, context: Context) -> Optional[str]: # model.json was mandatory in previous versions. Now its forbidden to provide an own model.json file. assert not any(f for f in _files if 'model.json' in f), "Upload 'model.json' not allowed (added automatically)." _files.append(model_json_path) - new_id = await self.node.data_exchanger.upload_model_for_training(context, _files, self.training.training_number, file_format) + new_id = await self.node.data_exchanger.upload_model_get_uuid(context, _files, self.training.training_number, file_format) if new_id is None: return None @@ -420,12 +412,12 @@ async def _do_detections(self) -> None: for state, p in zip(['inbox', 'annotate', 'review', 'complete'], [0.1, 0.2, 0.3, 0.4]): self.detection_progress = p logging.info(f'fetching image ids of {state}') - new_ids = await self.node.data_exchanger.fetch_image_ids(query_params=f'state={state}') + new_ids = await self.node.data_exchanger.fetch_image_uuids(query_params=f'state={state}') image_ids += new_ids logging.info(f'downloading {len(new_ids)} images') await self.node.data_exchanger.download_images(new_ids, image_folder) self.detection_progress = 0.42 - await self.node.data_exchanger.delete_corrupt_images(image_folder) + # await delete_corrupt_images(image_folder) images = 
await asyncio.get_event_loop().run_in_executor(None, TrainerLogic.images_for_ids, image_ids, image_folder) num_images = len(images) From edb451077f0c135ee5e5a64ebcc204ac8f58ccf4 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Wed, 6 Mar 2024 15:36:22 +0100 Subject: [PATCH 02/62] proceed with cleanup, refactoring. Use lifespan contextmanager --- learning_loop_node/data_exchanger.py | 14 +++- learning_loop_node/detector/detector_node.py | 2 +- learning_loop_node/helpers/misc.py | 21 ++++++ learning_loop_node/loop_communication.py | 9 ++- learning_loop_node/node.py | 70 ++++++++------------ 5 files changed, 66 insertions(+), 50 deletions(-) diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index 361f66cf..6bb30e6d 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -8,7 +8,7 @@ from io import BytesIO from typing import Dict, List, Optional -import aiofiles +import aiofiles # type: ignore from .data_classes import Context from .helpers.misc import create_resource_paths, create_task, is_valid_image @@ -25,6 +25,15 @@ def __init__(self, cause: str, *args: object) -> None: class DataExchanger(): def __init__(self, context: Optional[Context], loop_communicator: LoopCommunicator): + """Exchanges data with the learning loop via the loop_communicator (rest api). + + Args: + context (Optional[Context]): The context of the node. This is the organization and project name. + loop_communicator (LoopCommunicator): The loop_communicator to use for communication with the learning loop. + + Note: + The context can be set later with the set_context method. 
+ """ self.set_context(context) self.progress = 0.0 self.loop_communicator = loop_communicator @@ -85,7 +94,7 @@ async def download_images(self, image_uuids: List[str], image_folder: str, chunk existing_uuids = {os.path.splitext(os.path.basename(image))[0] for image in glob(f'{image_folder}/*.jpg')} new_image_uuids = [id for id in image_uuids if id not in existing_uuids] - paths, ids = create_resource_paths(self.context.organization, self.context.project, new_image_uuids) + paths, _ = create_resource_paths(self.context.organization, self.context.project, new_image_uuids) num_image_ids = len(image_uuids) os.makedirs(image_folder, exist_ok=True) @@ -128,7 +137,6 @@ async def download_model(self, target_folder: str, context: Context, model_id: s logging.exception(f'Error during downloading model {path}:') raise - # unzip and place downloaded model tmp_path = f'/tmp/{os.path.splitext(provided_filename)[0]}' shutil.rmtree(tmp_path, ignore_errors=True) with zipfile.ZipFile(BytesIO(content), 'r') as zip_: diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py index 785a10fe..7a19142b 100644 --- a/learning_loop_node/detector/detector_node.py +++ b/learning_loop_node/detector/detector_node.py @@ -256,7 +256,7 @@ async def send_status(self) -> Union[str, Literal[False]]: name=self.name, state=self.status.state, errors=self.status.errors, - uptime=int((datetime.now() - self.startup_time).total_seconds()), + uptime=int((datetime.now() - self.startup_datetime).total_seconds()), operation_mode=self.operation_mode, current_model=current_model, target_model=self.target_model, diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py index b7e7d18f..81cfc284 100644 --- a/learning_loop_node/helpers/misc.py +++ b/learning_loop_node/helpers/misc.py @@ -1,4 +1,6 @@ """original copied from https://quantlane.com/blog/ensure-asyncio-task-exceptions-get-logged/""" +import json +from uuid import uuid4 import 
asyncio import functools import logging @@ -11,6 +13,7 @@ import pynvml from ..data_classes import SocketResponse +from ..globals import GLOBALS T = TypeVar('T') @@ -102,6 +105,24 @@ def create_image_folder(project_folder: str) -> str: return image_folder +def read_or_create_uuid(identifier: str) -> str: + identifier = identifier.lower().replace(' ', '_') + uuids = {} + os.makedirs(GLOBALS.data_folder, exist_ok=True) + file_path = f'{GLOBALS.data_folder}/uuids.json' + if os.path.exists(file_path): + with open(file_path, 'r') as f: + uuids = json.load(f) + + uuid = uuids.get(identifier, None) + if not uuid: + uuid = str(uuid4()) + uuids[identifier] = uuid + with open(file_path, 'w') as f: + json.dump(uuids, f) + return uuid + + def ensure_socket_response(func): """Decorator to ensure that the return value of a socket.io event handler is a SocketResponse. diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index d4b3dadf..9ba7519b 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -27,9 +27,8 @@ def __init__(self) -> None: logging.info(f'Loop interface initialized with base_url: {self.base_url} / user: {self.username}') - # @property - # def project_path(self): # TODO: remove? 
- # return f'/{self.organization}/projects/{self.project}' + def websocket_url(self) -> str: + return f'ws{"s" if "learning-loop.ai" in self.host else ""}://' + self.host async def ensure_login(self) -> None: """aiohttp client session needs to be created on the event loop""" @@ -75,12 +74,12 @@ async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/ await self.ensure_login() return await self.async_client.get(api_prefix+path) - async def put(self, path, files: Optional[List[str]]=None, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response: + async def put(self, path, files: Optional[List[str]] = None, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response: if requires_login: await self.ensure_login() if files is None: return await self.async_client.put(api_prefix+path, **kwargs) - + file_list = [('files', open(f, 'rb')) for f in files] # TODO: does this properly close the files after upload? return await self.async_client.put(api_prefix+path, files=file_list) diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py index 00e0313b..ba9fe464 100644 --- a/learning_loop_node/node.py +++ b/learning_loop_node/node.py @@ -1,12 +1,11 @@ import asyncio -import json import logging import os import sys from abc import abstractmethod +from contextlib import asynccontextmanager from datetime import datetime from typing import Optional -from uuid import uuid4 import aiohttp import socketio @@ -17,8 +16,8 @@ from .data_classes import Context, NodeState, NodeStatus from .data_exchanger import DataExchanger from .globals import GLOBALS -from .helpers import environment_reader, log_conf -from .helpers.misc import ensure_socket_response +from .helpers import log_conf +from .helpers.misc import ensure_socket_response, read_or_create_uuid from .loop_communication import LoopCommunicator @@ -35,28 +34,29 @@ def __init__(self, name: str, uuid: Optional[str] = None, needs_login: bool = Tr needs_login (bool): If True, the node 
will try to login to the learning loop. """ - super().__init__() + super().__init__(lifespan=self.lifespan) log_conf.init() self.name = name - self.uuid = uuid or self.read_or_create_uuid(self.name) + self.uuid = uuid or read_or_create_uuid(self.name) self.needs_login = needs_login self.log = logging.getLogger() self.loop_communicator = LoopCommunicator() + self.websocket_url = self.loop_communicator.websocket_url() self.data_exchanger = DataExchanger(None, self.loop_communicator) - loop_url = environment_reader.host(default='learning-loop.ai') - self.websocket_url = f'ws{"s" if "learning-loop.ai" in loop_url else ""}://' + loop_url - - self.startup_time = datetime.now() + self.startup_datetime = datetime.now() self._sio_client: Optional[AsyncClient] = None self.status = NodeStatus(id=self.uuid, name=self.name) self.sio_headers = {'organization': self.loop_communicator.organization, 'project': self.loop_communicator.project, 'nodeType': self.get_node_type()} - self._register_lifecycle_events() + + @repeat_every(seconds=5, raise_exceptions=False, wait_first=False) + async def ensure_connected() -> None: + await self._on_repeat() @property def sio_client(self) -> AsyncClient: @@ -67,40 +67,27 @@ def sio_client(self) -> AsyncClient: def sio_is_initialized(self) -> bool: return self._sio_client is not None - # --------------------------------------------------- INIT --------------------------------------------------- - - def read_or_create_uuid(self, identifier: str) -> str: - identifier = identifier.lower().replace(' ', '_') - uuids = {} - os.makedirs(GLOBALS.data_folder, exist_ok=True) - file_path = f'{GLOBALS.data_folder}/uuids.json' - if os.path.exists(file_path): - with open(file_path, 'r') as f: - uuids = json.load(f) - - uuid = uuids.get(identifier, None) - if not uuid: - uuid = str(uuid4()) - uuids[identifier] = uuid - with open(file_path, 'w') as f: - json.dump(uuids, f) - return uuid - # --------------------------------------------------- APPLICATION 
LIFECYCLE --------------------------------------------------- - def _register_lifecycle_events(self): - @self.on_event("startup") - async def startup(): - await self._on_startup() + @asynccontextmanager + async def lifespan(self, app: FastAPI): + await self.on_startup() + yield + await self.on_shutdown() - @self.on_event("shutdown") # NOTE only used for developent ?! - async def shutdown(): - await self._on_shutdown() + # def _register_lifecycle_events(self): + # @self.on_event("startup") + # async def startup(): + # await self._on_startup() - @self.on_event("startup") - @repeat_every(seconds=5, raise_exceptions=False, wait_first=False) - async def ensure_connected() -> None: - await self._on_repeat() + # @self.on_event("shutdown") # NOTE only used for developent ?! + # async def shutdown(): + # await self._on_shutdown() + + # @self.on_event("startup") + # @repeat_every(seconds=5, raise_exceptions=False, wait_first=False) + # async def ensure_connected() -> None: + # await self._on_repeat() async def _on_startup(self): self.log.info('received "startup" lifecycle-event') @@ -122,6 +109,7 @@ async def _on_shutdown(self): self.log.info('successfully disconnected from loop.') await self.on_shutdown() + @repeat_every(seconds=5, raise_exceptions=False, wait_first=False) async def _on_repeat(self): while not self.sio_is_initialized(): self.log.info('Waiting for sio client to be initialized') From fda0bdbaa4705ff0ae375aa852855feac33b9b38 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 6 Mar 2024 15:38:18 +0100 Subject: [PATCH 03/62] remove mock converter --- mock_converter.dockerfile | 7 --- mock_converter/app_code/__init__.py | 0 mock_converter/app_code/backdoor_controls.py | 55 ------------------- .../app_code/mock_converter_logic.py | 18 ------ mock_converter/app_code/restart/restart.py | 5 -- mock_converter/app_code/tests/.gitkeep | 0 mock_converter/app_code/tests/test_dummy.py | 2 - mock_converter/main.py | 24 -------- mock_converter/pytest.ini | 8 --- mock_converter/start.sh | 3 - 10 files changed, 122 deletions(-) delete mode 100644 mock_converter.dockerfile delete mode 100644 mock_converter/app_code/__init__.py delete mode 100644 mock_converter/app_code/backdoor_controls.py delete mode 100644 mock_converter/app_code/mock_converter_logic.py delete mode 100644 mock_converter/app_code/restart/restart.py delete mode 100644 mock_converter/app_code/tests/.gitkeep delete mode 100644 mock_converter/app_code/tests/test_dummy.py delete mode 100644 mock_converter/main.py delete mode 100644 mock_converter/pytest.ini delete mode 100755 mock_converter/start.sh diff --git a/mock_converter.dockerfile b/mock_converter.dockerfile deleted file mode 100644 index 42c883c8..00000000 --- a/mock_converter.dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM base_node:latest - -COPY ./mock_converter/ /app -ENV PYTHONPATH "${PYTHONPATH}:/app:/usr/local/lib/python3.11/site-packages:/learning_loop_node/learning_loop_node" -ENV TZ=Europe/Amsterdam - -EXPOSE 80 diff --git a/mock_converter/app_code/__init__.py b/mock_converter/app_code/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/mock_converter/app_code/backdoor_controls.py b/mock_converter/app_code/backdoor_controls.py deleted file mode 100644 index 6472d4b2..00000000 --- a/mock_converter/app_code/backdoor_controls.py +++ /dev/null @@ -1,55 +0,0 @@ -"""These restful endpoints are only to be used for testing purposes and are not part of the 'offical' trainer 
behavior.""" - -import asyncio -import logging - -from fastapi import APIRouter, HTTPException, Request - -from learning_loop_node.data_classes import NodeState - -router = APIRouter() - - -@router.put("/socketio") -async def put_socketio(request: Request): - ''' - Example Usage - - curl -X PUT -d "on" http://localhost:8005/socketio - ''' - state = str(await request.body(), 'utf-8') - if state == 'off': - if request.app.status.state != NodeState.Offline: - logging.info('turning socketio off') - asyncio.create_task(request.app.sio.disconnect()) - if state == 'on': - if request.app.status.state == NodeState.Offline: - logging.info('turning socketio on') - asyncio.create_task(request.app.connect()) - - -@router.put("/check_state") -async def put_check_state(request: Request): - value = str(await request.body(), 'utf-8') - print(f'turning automatically check_state {value}', flush=True) - - if value == 'off': - request.app.skip_check_state = True - for _ in range(5): - if request.app.status.state != NodeState.Idle: - await asyncio.sleep(0.5) - else: - break - if request.app.status.state != NodeState.Idle: - raise HTTPException(status_code=409, detail="Could not skip auto checking. 
State is still not idle") - - if value == 'on': - request.app.skip_check_state = False - - -@router.post("/step") -async def add_steps(request: Request): - if request.app.status.state == NodeState.Running: - raise HTTPException(status_code=409, detail="converter is already running") - - await request.app.check_state() diff --git a/mock_converter/app_code/mock_converter_logic.py b/mock_converter/app_code/mock_converter_logic.py deleted file mode 100644 index 7fc68579..00000000 --- a/mock_converter/app_code/mock_converter_logic.py +++ /dev/null @@ -1,18 +0,0 @@ - -import asyncio -from typing import List - -from learning_loop_node.converter.converter_logic import ConverterLogic -from learning_loop_node.data_classes import ModelInformation - - -class MockConverterLogic(ConverterLogic): - - async def _convert(self, model_information: ModelInformation) -> None: - await asyncio.sleep(1) - - def get_converted_files(self, model_id: str) -> List[str]: - fake_converted_file = '/tmp/converted_weightfile.converted' - with open(fake_converted_file, 'wb') as f: - f.write(b'\x42') - return [fake_converted_file] diff --git a/mock_converter/app_code/restart/restart.py b/mock_converter/app_code/restart/restart.py deleted file mode 100644 index f7203baa..00000000 --- a/mock_converter/app_code/restart/restart.py +++ /dev/null @@ -1,5 +0,0 @@ -# add 'reload_dirs=['./app_code/restart'] to uvicorn call in main.py -# save this file to trigger uvicorn restart - - -# TODO raus nehmen diff --git a/mock_converter/app_code/tests/.gitkeep b/mock_converter/app_code/tests/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/mock_converter/app_code/tests/test_dummy.py b/mock_converter/app_code/tests/test_dummy.py deleted file mode 100644 index 1f00624b..00000000 --- a/mock_converter/app_code/tests/test_dummy.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_always_succeed_to_ensure_ci_of_loop_will_not_fail(): - assert True diff --git a/mock_converter/main.py b/mock_converter/main.py 
deleted file mode 100644 index b8bdb907..00000000 --- a/mock_converter/main.py +++ /dev/null @@ -1,24 +0,0 @@ -import logging -import os - -import uvicorn -from app_code import backdoor_controls -from app_code.mock_converter_logic import MockConverterLogic - -from learning_loop_node.converter.converter_node import ConverterNode - -logging.basicConfig(level=logging.DEBUG) - -mock_converter = MockConverterLogic(source_format='mocked', target_format='mocked_converted') -node = ConverterNode(uuid='85ef1a58-308d-4c80-8931-43d1f752f4f3', name='mocked converter', converter=mock_converter) -node.skip_check_state = True # do not check states auotmatically for this mock - -# setting up backdoor_controls -node.include_router(backdoor_controls.router, prefix="") - - -if __name__ == "__main__": - reload_dirs = ['./app_code/restart'] if os.environ.get('MANUAL_RESTART', None) \ - else ['./app_code', './learning-loop-node', '/usr/local/lib/python3.11/site-packages/learning_loop_node'] - uvicorn.run("main:node", host="0.0.0.0", port=80, lifespan='on', - reload=True, use_colors=True, reload_dirs=reload_dirs) diff --git a/mock_converter/pytest.ini b/mock_converter/pytest.ini deleted file mode 100644 index 0d20a612..00000000 --- a/mock_converter/pytest.ini +++ /dev/null @@ -1,8 +0,0 @@ -[pytest] -# NOTE: changing default location of pytest_cache because the uvicorn file watcher somehow triggered to many reloads -cache_dir = /tmp/pytest_cache -python_files = test_*.py -asyncio_mode = auto - -testpaths = tests - \ No newline at end of file diff --git a/mock_converter/start.sh b/mock_converter/start.sh deleted file mode 100755 index 125eee97..00000000 --- a/mock_converter/start.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -uvicorn main:node --host 0.0.0.0 --port 80 --reload --lifespan on --reload-dir /app \ No newline at end of file From 4925fe36d32b269cdb3ce57db11c50e4db000b2b Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 6 Mar 2024 15:39:53 +0100 Subject: [PATCH 04/62] remove converter from __init__ --- learning_loop_node/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/learning_loop_node/__init__.py b/learning_loop_node/__init__.py index b8f0f5cd..5f4433bc 100644 --- a/learning_loop_node/__init__.py +++ b/learning_loop_node/__init__.py @@ -2,7 +2,6 @@ import os import sys -from .converter.converter_node import ConverterNode # from . import log_conf from .detector.detector_logic import DetectorLogic from .detector.detector_node import DetectorNode From f238acbcc723932f1825affd68ce347401ae929c Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Wed, 6 Mar 2024 16:57:52 +0100 Subject: [PATCH 05/62] cahnge deprecated way of app lifecycle handling --- learning_loop_node/detector/detector_node.py | 2 + learning_loop_node/detector/rest/about.py | 1 + .../tests/test_client_communication.py | 3 +- learning_loop_node/node.py | 49 ++++++++++--------- learning_loop_node/tests/test_downloader.py | 6 +-- .../tests/states/test_state_upload_model.py | 2 +- 6 files changed, 35 insertions(+), 28 deletions(-) diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py index 7a19142b..e2c532ed 100644 --- a/learning_loop_node/detector/detector_node.py +++ b/learning_loop_node/detector/detector_node.py @@ -170,6 +170,8 @@ async def _upload(sid, data: Dict) -> Optional[Dict]: def _connect(sid, environ, auth) -> None: self.connected_clients.append(sid) + print('>>>>>>>>>>>>>>>>>>>>>>> setting up sio server', flush=True) + self.sio_server = SocketManager(app=self) self.sio_server.on('detect', _detect) self.sio_server.on('info', _info) diff --git a/learning_loop_node/detector/rest/about.py b/learning_loop_node/detector/rest/about.py index c464b999..9f1e407e 100644 --- a/learning_loop_node/detector/rest/about.py +++ b/learning_loop_node/detector/rest/about.py @@ -16,6 +16,7 @@ async def get_about(request: Request): 
curl http://localhost/about ''' app: 'DetectorNode' = request.app + return { 'operation_mode': app.operation_mode.value, 'state': app.status.state, diff --git a/learning_loop_node/detector/tests/test_client_communication.py b/learning_loop_node/detector/tests/test_client_communication.py index be3d2d4b..16f0fa6b 100644 --- a/learning_loop_node/detector/tests/test_client_communication.py +++ b/learning_loop_node/detector/tests/test_client_communication.py @@ -5,7 +5,7 @@ import requests from learning_loop_node import DetectorNode -from learning_loop_node.data_classes import Category, ModelInformation +from learning_loop_node.data_classes import ModelInformation from learning_loop_node.detector.tests.conftest import get_outbox_files from learning_loop_node.globals import GLOBALS @@ -94,6 +94,7 @@ async def test_about_endpoint(test_detector_node: DetectorNode): assert response.status_code == 200 response_dict = json.loads(response.content) + assert response_dict['model_info'] model_information = ModelInformation.from_dict(response_dict['model_info']) assert response_dict['operation_mode'] == 'idle' diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py index ba9fe464..bbf7bd77 100644 --- a/learning_loop_node/node.py +++ b/learning_loop_node/node.py @@ -5,12 +5,11 @@ from abc import abstractmethod from contextlib import asynccontextmanager from datetime import datetime -from typing import Optional +from typing import Any, Optional import aiohttp import socketio from fastapi import FastAPI -from fastapi_utils.tasks import repeat_every from socketio import AsyncClient from .data_classes import Context, NodeState, NodeStatus @@ -54,9 +53,7 @@ def __init__(self, name: str, uuid: Optional[str] = None, needs_login: bool = Tr 'project': self.loop_communicator.project, 'nodeType': self.get_node_type()} - @repeat_every(seconds=5, raise_exceptions=False, wait_first=False) - async def ensure_connected() -> None: - await self._on_repeat() + self.repeat_task: Any = 
None @property def sio_client(self) -> AsyncClient: @@ -68,26 +65,21 @@ def sio_is_initialized(self) -> bool: return self._sio_client is not None # --------------------------------------------------- APPLICATION LIFECYCLE --------------------------------------------------- - @asynccontextmanager async def lifespan(self, app: FastAPI): - await self.on_startup() - yield - await self.on_shutdown() - - # def _register_lifecycle_events(self): - # @self.on_event("startup") - # async def startup(): - # await self._on_startup() - - # @self.on_event("shutdown") # NOTE only used for developent ?! - # async def shutdown(): - # await self._on_shutdown() - # @self.on_event("startup") - # @repeat_every(seconds=5, raise_exceptions=False, wait_first=False) - # async def ensure_connected() -> None: - # await self._on_repeat() + try: + self.repeat_task = asyncio.create_task(self.repeat_loop()) + await self._on_startup() + yield + finally: + await self._on_shutdown() + if self.repeat_task is not None: + self.repeat_task.cancel() + try: + await self.repeat_task + except asyncio.CancelledError: + pass async def _on_startup(self): self.log.info('received "startup" lifecycle-event') @@ -109,8 +101,19 @@ async def _on_shutdown(self): self.log.info('successfully disconnected from loop.') await self.on_shutdown() - @repeat_every(seconds=5, raise_exceptions=False, wait_first=False) + async def repeat_loop(self) -> None: + while True: + try: + await self._on_repeat() + except asyncio.CancelledError: + return + except Exception as e: + self.log.exception(f'error in repeat loop: {e}') + await asyncio.sleep(5) + async def _on_repeat(self): + print('received "repeat" lifecycle-event', flush=True) + logging.info('received "repeat" lifecycle-event') while not self.sio_is_initialized(): self.log.info('Waiting for sio client to be initialized') await asyncio.sleep(1) diff --git a/learning_loop_node/tests/test_downloader.py b/learning_loop_node/tests/test_downloader.py index 7b2143d1..43ee4c6f 100644 
--- a/learning_loop_node/tests/test_downloader.py +++ b/learning_loop_node/tests/test_downloader.py @@ -1,11 +1,11 @@ -from ..helpers.misc import delete_corrupt_images import os import shutil from learning_loop_node.data_classes import Context -from learning_loop_node.data_exchanger import DataExchanger, check_jpeg +from learning_loop_node.data_exchanger import DataExchanger from learning_loop_node.globals import GLOBALS +from ..helpers.misc import delete_corrupt_images from . import test_helper @@ -68,5 +68,5 @@ async def test_removal_of_corrupted_images(data_exchanger: DataExchanger): await delete_corrupt_images('/tmp/img_folder', True) - assert len(os.listdir('/tmp/img_folder')) == num_images if check_jpeg else num_images - 1 + assert len(os.listdir('/tmp/img_folder')) == num_images if data_exchanger.check_jpeg else num_images - 1 shutil.rmtree('/tmp/img_folder', ignore_errors=True) diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py index 05eaa8ed..41a5a4a8 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py @@ -84,4 +84,4 @@ async def test_mock_loop_response_example(mocker: MockerFixture, test_initialize def mock_upload_model_for_training(mocker, return_value): - mocker.patch('learning_loop_node.data_exchanger.DataExchanger.upload_model_for_training', return_value=return_value) + mocker.patch('learning_loop_node.data_exchanger.DataExchanger.upload_model_get_uuid', return_value=return_value) From 421f529640f553ef388e27760b73292334a7b1b1 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 6 Mar 2024 17:54:19 +0100 Subject: [PATCH 06/62] remove unused states from node base class --- .../annotation/annotator_node.py | 9 +- learning_loop_node/data_classes/general.py | 2 +- learning_loop_node/detector/detector_node.py | 4 +- learning_loop_node/helpers/misc.py | 69 +++++++++++++- learning_loop_node/loop_communication.py | 2 +- learning_loop_node/node.py | 95 ++++--------------- learning_loop_node/tests/test_helper.py | 4 +- learning_loop_node/trainer/trainer_logic.py | 14 +-- learning_loop_node/trainer/trainer_node.py | 5 - .../app_code/tests/test_detections.py | 4 +- 10 files changed, 98 insertions(+), 110 deletions(-) diff --git a/learning_loop_node/annotation/annotator_node.py b/learning_loop_node/annotation/annotator_node.py index b1781b73..474f28e6 100644 --- a/learning_loop_node/annotation/annotator_node.py +++ b/learning_loop_node/annotation/annotator_node.py @@ -8,7 +8,7 @@ from ..data_classes import AnnotationNodeStatus, Context, NodeState, UserInput from ..data_classes.socket_response import SocketResponse from ..data_exchanger import DataExchanger -from ..helpers.misc import create_image_folder +from ..helpers.misc import create_image_folder, create_project_folder from ..node import Node from .annotator_logic import AnnotatorLogic @@ -50,8 +50,6 @@ async def _handle_user_input(self, user_input_dict: Dict) -> str: raise if tool_result.annotation: - if not self.sio_is_initialized(): - raise Exception('Socket client waas not initialized') await self.sio_client.call('update_segmentation_annotation', (user_input.data.context.organization, user_input.data.context.project, jsonable_encoder(asdict(tool_result.annotation))), timeout=30) @@ -85,15 +83,12 @@ async def send_status(self): self.log.error(f'Error for updating: Response from loop was : {asdict(response)}') async def download_image(self, context: Context, uuid: str): - project_folder = Node.create_project_folder(context) + project_folder = 
create_project_folder(context) images_folder = create_image_folder(project_folder) downloader = DataExchanger(context=context, loop_communicator=self.loop_communicator) await downloader.download_images([uuid], images_folder) - async def get_state(self): - return NodeState.Online - def get_node_type(self): return 'annotation_node' diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py index 8404ab22..9d5c893e 100644 --- a/learning_loop_node/data_classes/general.py +++ b/learning_loop_node/data_classes/general.py @@ -121,7 +121,7 @@ class NodeState(str, Enum): class NodeStatus(): id: str name: str - state: Optional[NodeState] = NodeState.Offline + state: Optional[NodeState] = NodeState.Online uptime: Optional[int] = 0 errors: Dict = field(default_factory=dict) capabilities: List[str] = field(default_factory=list) diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py index e2c532ed..00271f64 100644 --- a/learning_loop_node/detector/detector_node.py +++ b/learning_loop_node/detector/detector_node.py @@ -274,13 +274,11 @@ async def send_status(self) -> Union[str, Literal[False]]: return False assert socket_response.payload is not None + # TODO This is weird because target_model_version is stored in self and target_model_id is returned self.target_model = socket_response.payload['target_model_version'] self.log.info(f'After sending status. 
Target_model is {self.target_model}') return socket_response.payload['target_model_id'] - async def get_state(self): - return NodeState.Online # NOTE At the moment only trainer-nodes use a meaningful state - async def set_operation_mode(self, mode: OperationMode): self.operation_mode = mode await self.send_status() diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py index 81cfc284..1f2e297d 100644 --- a/learning_loop_node/helpers/misc.py +++ b/learning_loop_node/helpers/misc.py @@ -1,18 +1,20 @@ """original copied from https://quantlane.com/blog/ensure-asyncio-task-exceptions-get-logged/""" -import json -from uuid import uuid4 import asyncio import functools +import json import logging import os +import shutil +import sys from dataclasses import asdict from glob import glob +from time import perf_counter from typing import Any, Coroutine, List, Optional, Tuple, TypeVar -from uuid import UUID +from uuid import UUID, uuid4 import pynvml -from ..data_classes import SocketResponse +from ..data_classes import Context, SocketResponse, Training from ..globals import GLOBALS T = TypeVar('T') @@ -164,3 +166,62 @@ def is_valid_uuid4(val): return True except ValueError: return False + + +def create_project_folder(context: Context) -> str: + project_folder = f'{GLOBALS.data_folder}/{context.organization}/{context.project}' + os.makedirs(project_folder, exist_ok=True) + return project_folder + + +def activate_asyncio_warnings() -> None: + '''Produce warnings for coroutines which take too long on the main loop and hence clog the event loop''' + try: + if sys.version_info.major >= 3 and sys.version_info.minor >= 7: # most + loop = asyncio.get_running_loop() + else: + loop = asyncio.get_event_loop() + + loop.set_debug(True) + loop.slow_callback_duration = 0.2 + logging.info('activated asyncio warnings') + except Exception: + logging.exception('could not activate asyncio warnings. 
Exception:') + + +@staticmethod +def images_for_ids(image_ids, image_folder) -> List[str]: + logging.info(f'### Going to get images for {len(image_ids)} images ids') + start = perf_counter() + images = [img for img in glob(f'{image_folder}/**/*.*', recursive=True) + if os.path.splitext(os.path.basename(img))[0] in image_ids] + end = perf_counter() + logging.info(f'found {len(images)} images for {len(image_ids)} image ids, which took {end-start:0.2f} seconds') + return images + + +@staticmethod +def generate_training(project_folder: str, context: Context) -> Training: + training_uuid = str(uuid4()) + return Training( + id=training_uuid, + context=context, + project_folder=project_folder, + images_folder=create_image_folder(project_folder), + training_folder=create_training_folder(project_folder, training_uuid) + ) + + +@staticmethod +def delete_all_training_folders(project_folder: str): + if not os.path.exists(f'{project_folder}/trainings'): + return + for uuid in os.listdir(f'{project_folder}/trainings'): + shutil.rmtree(f'{project_folder}/trainings/{uuid}', ignore_errors=True) + + +@staticmethod +def create_training_folder(project_folder: str, trainings_id: str) -> str: + training_folder = f'{project_folder}/trainings/{trainings_id}' + os.makedirs(training_folder, exist_ok=True) + return training_folder diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index 9ba7519b..75c57189 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -50,7 +50,7 @@ async def logout(self) -> None: logging.info(f'Logout failed with response: {response}') raise LoopCommunicationException('Logout failed with response: ' + str(response)) - async def get_cookies(self) -> Cookies: + def get_cookies(self) -> Cookies: return self.async_client.cookies async def shutdown(self): diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py index bbf7bd77..5e7adf33 100644 --- 
a/learning_loop_node/node.py +++ b/learning_loop_node/node.py @@ -1,6 +1,5 @@ import asyncio import logging -import os import sys from abc import abstractmethod from contextlib import asynccontextmanager @@ -12,11 +11,10 @@ from fastapi import FastAPI from socketio import AsyncClient -from .data_classes import Context, NodeState, NodeStatus +from .data_classes import NodeStatus from .data_exchanger import DataExchanger -from .globals import GLOBALS from .helpers import log_conf -from .helpers.misc import ensure_socket_response, read_or_create_uuid +from .helpers.misc import activate_asyncio_warnings, ensure_socket_response, read_or_create_uuid from .loop_communication import LoopCommunicator @@ -61,16 +59,12 @@ def sio_client(self) -> AsyncClient: raise Exception('sio_client not yet initialized') return self._sio_client - def sio_is_initialized(self) -> bool: - return self._sio_client is not None - # --------------------------------------------------- APPLICATION LIFECYCLE --------------------------------------------------- @asynccontextmanager async def lifespan(self, app: FastAPI): - try: - self.repeat_task = asyncio.create_task(self.repeat_loop()) await self._on_startup() + self.repeat_task = asyncio.create_task(self.repeat_loop()) yield finally: await self._on_shutdown() @@ -83,7 +77,7 @@ async def lifespan(self, app: FastAPI): async def _on_startup(self): self.log.info('received "startup" lifecycle-event') - Node._activate_asyncio_warnings() + activate_asyncio_warnings() if self.needs_login: await self.loop_communicator.backend_ready() self.log.info('ensuring login') @@ -102,6 +96,7 @@ async def _on_shutdown(self): await self.on_shutdown() async def repeat_loop(self) -> None: + """NOTE: with the lifespan approach, we cannot use @repeat_every anymore :(""" while True: try: await self._on_repeat() @@ -112,11 +107,6 @@ async def repeat_loop(self) -> None: await asyncio.sleep(5) async def _on_repeat(self): - print('received "repeat" lifecycle-event', flush=True) - 
logging.info('received "repeat" lifecycle-event') - while not self.sio_is_initialized(): - self.log.info('Waiting for sio client to be initialized') - await asyncio.sleep(1) if not self.sio_client.connected: self.log.info('Reconnecting to loop via sio') await self.connect_sio() @@ -128,8 +118,11 @@ async def _on_repeat(self): # --------------------------------------------------- SOCKET.IO --------------------------------------------------- async def create_sio_client(self): - cookies = await self.loop_communicator.get_cookies() - self._sio_client = AsyncClient(request_timeout=20, http_session=aiohttp.ClientSession(cookies=cookies)) + """Create a socket.io client that communicates with the learning loop and register the events. + Note: The method is called in startup and soft restart of detector, so the _sio_client should always be available.""" + + self._sio_client = AsyncClient(request_timeout=20, + http_session=aiohttp.ClientSession(cookies=self.loop_communicator.get_cookies())) # pylint: disable=protected-access self.sio_client._trigger_event = ensure_socket_response(self.sio_client._trigger_event) @@ -137,30 +130,19 @@ async def create_sio_client(self): @self._sio_client.event async def connect(): self.log.info('received "connect" via sio from loop.') - self.status = NodeStatus(id=self.uuid, name=self.name) - state = await self.get_state() - try: - await self._update_send_state(state) - except: - self.log.exception('Error sending state. 
Exception:') - raise @self._sio_client.event async def disconnect(): self.log.info('received "disconnect" via sio from loop.') - await self._update_send_state(NodeState.Offline) @self._sio_client.event async def restart(): - self.log.info('received "restart" via sio from loop.') - self.restart() + self.log.info('received "restart" via sio from loop -> restarting node.') + sys.exit(0) self.register_sio_events(self._sio_client) async def connect_sio(self): - if not self.sio_is_initialized(): - self.log.warning('sio client not yet initialized') - return try: await self.sio_client.disconnect() except Exception: @@ -175,30 +157,11 @@ async def connect_sio(self): except Exception: self.log.exception(f'error while connecting to "{self.websocket_url}". Exception:') - async def _update_send_state(self, state: NodeState): - self.status.state = state - if self.status.state != NodeState.Offline: - await self.send_status() - # --------------------------------------------------- ABSTRACT METHODS --------------------------------------------------- - @abstractmethod - def register_sio_events(self, sio_client: AsyncClient): - """Register socket.io events for the communication with the learning loop. - The events: connect and disconnect are already registered and should not be overwritten.""" - - @abstractmethod - async def send_status(self): - """Send the current status to the learning loop. - Note that currently this method is also used to react to the response of the learning loop.""" - - @abstractmethod - async def get_state(self) -> NodeState: - """Return the current state of the node.""" - @abstractmethod def get_node_type(self): - pass + """Return the type of the node. 
This is used to register the node at the learning loop.""" @abstractmethod async def on_startup(self): @@ -211,32 +174,8 @@ async def on_shutdown(self): @abstractmethod async def on_repeat(self): """This method is called every 10 seconds.""" - # --------------------------------------------------- SHARED FUNCTIONS --------------------------------------------------- - - def restart(self): - """Restart the node.""" - self.log.info('restarting node') - sys.exit(0) - - # --------------------------------------------------- HELPER --------------------------------------------------- - @staticmethod - def create_project_folder(context: Context) -> str: - project_folder = f'{GLOBALS.data_folder}/{context.organization}/{context.project}' - os.makedirs(project_folder, exist_ok=True) - return project_folder - - @staticmethod - def _activate_asyncio_warnings() -> None: - '''Produce warnings for coroutines which take too long on the main loop and hence clog the event loop''' - try: - if sys.version_info.major >= 3 and sys.version_info.minor >= 7: # most - loop = asyncio.get_running_loop() - else: - loop = asyncio.get_event_loop() - - loop.set_debug(True) - loop.slow_callback_duration = 0.2 - logging.info('activated asyncio warnings') - except Exception: - logging.exception('could not activate asyncio warnings. Exception:') + @abstractmethod + def register_sio_events(self, sio_client: AsyncClient): + """Register (additional) socket.io events for the communication with the learning loop. 
+ The events: connect, disconnect and restart are already registered and should not be overwritten.""" diff --git a/learning_loop_node/tests/test_helper.py b/learning_loop_node/tests/test_helper.py index 88a94af2..1f485506 100644 --- a/learning_loop_node/tests/test_helper.py +++ b/learning_loop_node/tests/test_helper.py @@ -7,7 +7,7 @@ from typing import Callable from learning_loop_node.data_classes import Context -from learning_loop_node.helpers.misc import create_image_folder +from learning_loop_node.helpers.misc import create_image_folder, create_project_folder from learning_loop_node.loop_communication import LoopCommunicator from learning_loop_node.node import Node from learning_loop_node.trainer.trainer_logic import TrainerLogic @@ -65,7 +65,7 @@ def _update_attribute_dict(obj: dict, **kwargs) -> None: def create_needed_folders(training_uuid: str = 'some_uuid'): # pylint: disable=unused-argument - project_folder = Node.create_project_folder( + project_folder = create_project_folder( Context(organization='zauberzeug', project='pytest')) image_folder = create_image_folder(project_folder) training_folder = TrainerLogic.create_training_folder(project_folder, training_uuid) diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 06eac0aa..2cff2d6f 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -3,6 +3,7 @@ import logging import os import shutil +import sys import time from abc import abstractmethod from dataclasses import asdict @@ -10,7 +11,7 @@ from glob import glob from time import perf_counter from typing import TYPE_CHECKING, Coroutine, Dict, List, Optional, Union -from uuid import UUID, uuid4 +from uuid import uuid4 import socketio from dacite import from_dict @@ -19,7 +20,7 @@ from ..data_classes import (BasicModel, Category, Context, Detections, Errors, Hyperparameter, ModelInformation, PretrainedModel, Training, TrainingData, TrainingError, 
TrainingState) -from ..helpers.misc import create_image_folder, delete_corrupt_images, is_valid_uuid4 +from ..helpers.misc import create_image_folder, create_project_folder, generate_training, is_valid_uuid4 from ..node import Node from . import training_syncronizer from .downloader import TrainingsDownloader @@ -80,11 +81,11 @@ def init_new_training(self, context: Context, details: Dict) -> None: Note that details needs the entries 'categories' and 'training_number'""" try: - project_folder = Node.create_project_folder(context) + project_folder = create_project_folder(context) if not self.keep_old_trainings: # NOTE: We delete all existing training folders because they are not needed anymore. TrainerLogic.delete_all_training_folders(project_folder) - self._training = TrainerLogic.generate_training(project_folder, context) + self._training = generate_training(project_folder, context) self._training.data = TrainingData(categories=Category.from_list(details['categories'])) self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details) self._training.training_number = details['training_number'] @@ -405,7 +406,7 @@ async def _do_detections(self) -> None: content = json.load(f) model_information = from_dict(data_class=ModelInformation, data=content) - project_folder = Node.create_project_folder(context) + project_folder = create_project_folder(context) image_folder = create_image_folder(project_folder) self.node.data_exchanger.set_context(context) image_ids = [] @@ -528,8 +529,7 @@ def get_log(self) -> str: def may_restart(self) -> None: if self.restart_after_training: logging.info('restarting') - assert self._node is not None - self._node.restart() + sys.exit(0) else: logging.info('not restarting') diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py index d2ae3249..d26831a2 100644 --- a/learning_loop_node/trainer/trainer_node.py +++ b/learning_loop_node/trainer/trainer_node.py @@ -126,11 +126,6 @@ 
async def continue_run_if_incomplete(self) -> bool: return True return False - async def get_state(self): - if self.trainer_logic._executor is not None and self.trainer_logic._executor.is_process_running(): # pylint: disable=protected-access - return NodeState.Running - return NodeState.Idle - def get_node_type(self): return 'trainer' diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py index 47781d3d..df6e1292 100644 --- a/mock_trainer/app_code/tests/test_detections.py +++ b/mock_trainer/app_code/tests/test_detections.py @@ -5,8 +5,8 @@ from learning_loop_node.data_classes import Category, Context from learning_loop_node.globals import GLOBALS +from learning_loop_node.helpers.misc import create_project_folder from learning_loop_node.loop_communication import LoopCommunicator -from learning_loop_node.node import Node from learning_loop_node.tests import test_helper from learning_loop_node.trainer.trainer_logic import TrainerLogic from learning_loop_node.trainer.trainer_node import TrainerNode @@ -32,7 +32,7 @@ async def test_all(setup_test_project1, glc: LoopCommunicator): # pylint: disab trainer._node = node # pylint: disable=protected-access trainer.init_new_training(context=context, details=details) - project_folder = Node.create_project_folder(context) + project_folder = create_project_folder(context) training = TrainerLogic.generate_training(project_folder, context) training.model_id_for_detecting = latest_model_id trainer._training = training # pylint: disable=protected-access From f38e8468cac2b9ef2290937cd0b05cf087f67df6 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 6 Mar 2024 18:12:41 +0100 Subject: [PATCH 07/62] Simplify declaration of node_type --- .../annotation/annotator_node.py | 5 +- learning_loop_node/data_classes/__init__.py | 16 ++--- learning_loop_node/data_classes/training.py | 7 +- learning_loop_node/detector/detector_node.py | 8 +-- learning_loop_node/node.py | 9 +-- .../tests/states/test_state_detecting.py | 4 +- .../trainer/tests/test_trainer_states.py | 7 +- learning_loop_node/trainer/trainer_logic.py | 67 ++++++++++--------- learning_loop_node/trainer/trainer_node.py | 61 +++++++---------- 9 files changed, 82 insertions(+), 102 deletions(-) diff --git a/learning_loop_node/annotation/annotator_node.py b/learning_loop_node/annotation/annotator_node.py index 474f28e6..d12bcc0f 100644 --- a/learning_loop_node/annotation/annotator_node.py +++ b/learning_loop_node/annotation/annotator_node.py @@ -18,7 +18,7 @@ class AnnotatorNode(Node): def __init__(self, name: str, annotator_logic: AnnotatorLogic, uuid: Optional[str] = None): - super().__init__(name, uuid) + super().__init__(name, uuid, 'annotation_node') self.tool = annotator_logic self.histories: Dict = {} annotator_logic.init(self) @@ -89,9 +89,6 @@ async def download_image(self, context: Context, uuid: str): downloader = DataExchanger(context=context, loop_communicator=self.loop_communicator) await downloader.download_images([uuid], images_folder) - def get_node_type(self): - return 'annotation_node' - async def on_startup(self): pass diff --git a/learning_loop_node/data_classes/__init__.py b/learning_loop_node/data_classes/__init__.py index bc2980cd..0e0a10e9 100644 --- a/learning_loop_node/data_classes/__init__.py +++ b/learning_loop_node/data_classes/__init__.py @@ -1,12 +1,8 @@ -from .annotations import (AnnotationData, AnnotationEventType, - SegmentationAnnotation, ToolOutput, UserInput) -from .detections import (BoxDetection, ClassificationDetection, Detections, - Observation, Point, PointDetection, +from .annotations import 
AnnotationData, AnnotationEventType, SegmentationAnnotation, ToolOutput, UserInput +from .detections import (BoxDetection, ClassificationDetection, Detections, Observation, Point, PointDetection, SegmentationDetection, Shape) -from .general import (AnnotationNodeStatus, Category, CategoryType, Context, - DetectionStatus, ErrorConfiguration, ModelInformation, - NodeState, NodeStatus) +from .general import (AnnotationNodeStatus, Category, CategoryType, Context, DetectionStatus, ErrorConfiguration, + ModelInformation, NodeState, NodeStatus) from .socket_response import SocketResponse -from .training import (BasicModel, Errors, Hyperparameter, Model, - PretrainedModel, Training, TrainingData, TrainingError, - TrainingOut, TrainingState, TrainingStatus) +from .training import (BasicModel, Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData, + TrainingError, TrainingOut, TrainingStatus) diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index 49432925..b78190ef 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -41,7 +41,8 @@ class PretrainedModel(): description: str -class TrainingState(str, Enum): +class TrainerState(str, Enum): + Idle = 'idle' Initialized = 'initialized' Preparing = 'preparing' DataDownloading = 'data_downloading' @@ -64,7 +65,7 @@ class TrainingState(str, Enum): class TrainingStatus(): id: str # TODO this must not be changed, but tests wont detect it -> update tests! 
name: str - state: Union[Optional[TrainingState], str] + state: Union[Optional[TrainerState], str] errors: Optional[Dict] uptime: Optional[float] progress: Optional[float] @@ -98,7 +99,7 @@ class Training(): base_model_id: Optional[str] = None data: Optional[TrainingData] = None training_number: Optional[int] = None - training_state: Optional[Union[TrainingState, str]] = None + training_state: Optional[Union[TrainerState, str]] = None model_id_for_detecting: Optional[str] = None hyperparameters: Optional[Dict] = None diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py index 00271f64..18b8ab6c 100644 --- a/learning_loop_node/detector/detector_node.py +++ b/learning_loop_node/detector/detector_node.py @@ -14,7 +14,7 @@ from fastapi_socketio import SocketManager from socketio import AsyncClient -from ..data_classes import Category, Context, Detections, DetectionStatus, ModelInformation, NodeState, Shape +from ..data_classes import Category, Context, Detections, DetectionStatus, ModelInformation, Shape from ..data_classes.socket_response import SocketResponse from ..data_exchanger import DataExchanger, DownloadError from ..globals import GLOBALS @@ -34,9 +34,8 @@ class DetectorNode(Node): def __init__(self, name: str, detector: DetectorLogic, uuid: Optional[str] = None, use_backdoor_controls: bool = False) -> None: - super().__init__(name, uuid) + super().__init__(name, uuid, 'detector', False) self.detector_logic = detector - self.needs_login = False self.organization = environment_reader.organization() self.project = environment_reader.project() assert self.organization and self.project, 'Detector node needs an organization and an project' @@ -353,9 +352,6 @@ def find_category_id_by_name(categories: List[Category], category_name: str): classification_detection.category_id = category_id return detections - def get_node_type(self): - return 'detector' - def register_sio_events(self, sio_client: AsyncClient): pass 
diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py index 5e7adf33..85a81fd6 100644 --- a/learning_loop_node/node.py +++ b/learning_loop_node/node.py @@ -20,8 +20,9 @@ class Node(FastAPI): - def __init__(self, name: str, uuid: Optional[str] = None, needs_login: bool = True): + def __init__(self, name: str, uuid: Optional[str] = None, node_type: str = 'node', needs_login: bool = True): """Base class for all nodes. A node is a process that communicates with the zauberzeug learning loop. + This class provides the basic functionality to connect to the learning loop via socket.io and to exchange data. Args: name (str): The name of the node. This name is used to generate a uuid. @@ -49,7 +50,7 @@ def __init__(self, name: str, uuid: Optional[str] = None, needs_login: bool = Tr self.sio_headers = {'organization': self.loop_communicator.organization, 'project': self.loop_communicator.project, - 'nodeType': self.get_node_type()} + 'nodeType': node_type} self.repeat_task: Any = None @@ -159,10 +160,6 @@ async def connect_sio(self): # --------------------------------------------------- ABSTRACT METHODS --------------------------------------------------- - @abstractmethod - def get_node_type(self): - """Return the type of the node. 
This is used to register the node at the learning loop.""" - @abstractmethod async def on_startup(self): """This method is called when the node is started.""" diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py index d571a665..a0ad04d7 100644 --- a/learning_loop_node/trainer/tests/states/test_state_detecting.py +++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py @@ -1,7 +1,7 @@ import asyncio from learning_loop_node.conftest import get_dummy_detections -from learning_loop_node.data_classes import TrainingState +from learning_loop_node.data_classes import TrainerState from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic @@ -31,7 +31,7 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state=TrainingState.TrainModelUploaded) + create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded) trainer.init_from_last_training() trainer.training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242' diff --git a/learning_loop_node/trainer/tests/test_trainer_states.py b/learning_loop_node/trainer/tests/test_trainer_states.py index c6e449b7..74e630d1 100644 --- a/learning_loop_node/trainer/tests/test_trainer_states.py +++ b/learning_loop_node/trainer/tests/test_trainer_states.py @@ -1,10 +1,9 @@ from uuid import uuid4 -from learning_loop_node.data_classes import Context, Training, TrainingState +from learning_loop_node.data_classes import Context, TrainerState, Training from learning_loop_node.trainer.io_helpers import LastTrainingIO 
-from learning_loop_node.trainer.tests.testing_trainer_logic import \ - TestingTrainerLogic +from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_node import TrainerNode @@ -27,7 +26,7 @@ def test_fixture_trainer_node(test_initialized_trainer_node): def test_save_load_training(): training = create_training() last_training_io = LastTrainingIO('00000000-0000-0000-0000-000000000000') - training.training_state = TrainingState.Preparing + training.training_state = TrainerState.Preparing last_training_io.save(training) training = last_training_io.load() diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 2cff2d6f..dedaa9e6 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -19,9 +19,8 @@ from tqdm import tqdm from ..data_classes import (BasicModel, Category, Context, Detections, Errors, Hyperparameter, ModelInformation, - PretrainedModel, Training, TrainingData, TrainingError, TrainingState) + PretrainedModel, TrainerState, Training, TrainingData, TrainingError) from ..helpers.misc import create_image_folder, create_project_folder, generate_training, is_valid_uuid4 -from ..node import Node from . 
import training_syncronizer from .downloader import TrainingsDownloader from .executor import Executor @@ -76,6 +75,14 @@ def is_initialized(self) -> bool: """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'""" return self._training is not None and self._active_training_io is not None and self._node is not None + @property + def state(self) -> str: + if (not self.is_initialized) or (self.training.training_state is None): + return TrainerState.Idle.value + else: + state = self.training.training_state + return state.value if isinstance(state, TrainerState) else state + def init_new_training(self, context: Context, details: Dict) -> None: """Called on `begin_training` event from the Learning Loop. Note that details needs the entries 'categories' and 'training_number'""" @@ -90,7 +97,7 @@ def init_new_training(self, context: Context, details: Dict) -> None: self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details) self._training.training_number = details['training_number'] self._training.base_model_id = details['id'] - self._training.training_state = TrainingState.Initialized + self._training.training_state = TrainerState.Initialized self._active_training_io = ActiveTrainingIO(self._training.training_folder) logging.info(f'init training: {self._training}') except Exception: @@ -112,7 +119,7 @@ async def run(self) -> None: except asyncio.CancelledError: if not self.shutdown_event.is_set(): logging.info('training task was cancelled but not by shutdown event') - self.training.training_state = TrainingState.ReadyForCleanup + self.training.training_state = TrainerState.ReadyForCleanup self.node.last_training_io.save(self.training) await self.clear_training() @@ -134,27 +141,27 @@ async def _run(self) -> None: tstate = self.training.training_state logging.info(f'STATE LOOP: {tstate}') await asyncio.sleep(0.6) # Note: Required for pytests! 
- if tstate == TrainingState.Initialized: # -> DataDownloading -> DataDownloaded + if tstate == TrainerState.Initialized: # -> DataDownloading -> DataDownloaded await self.prepare() - elif tstate == TrainingState.DataDownloaded: # -> TrainModelDownloading -> TrainModelDownloaded + elif tstate == TrainerState.DataDownloaded: # -> TrainModelDownloading -> TrainModelDownloaded await self.download_model() - elif tstate == TrainingState.TrainModelDownloaded: # -> TrainingRunning -> TrainingFinished + elif tstate == TrainerState.TrainModelDownloaded: # -> TrainingRunning -> TrainingFinished await self.train() - elif tstate == TrainingState.TrainingFinished: # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced + elif tstate == TrainerState.TrainingFinished: # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced await self.ensure_confusion_matrix_synced() - elif tstate == TrainingState.ConfusionMatrixSynced: # -> TrainModelUploading -> TrainModelUploaded + elif tstate == TrainerState.ConfusionMatrixSynced: # -> TrainModelUploading -> TrainModelUploaded await self.upload_model() - elif tstate == TrainingState.TrainModelUploaded: # -> Detecting -> Detected + elif tstate == TrainerState.TrainModelUploaded: # -> Detecting -> Detected await self.do_detections() - elif tstate == TrainingState.Detected: # -> DetectionUploading -> ReadyForCleanup + elif tstate == TrainerState.Detected: # -> DetectionUploading -> ReadyForCleanup await self.upload_detections() - elif tstate == TrainingState.ReadyForCleanup: # -> RESTART or TrainingFinished + elif tstate == TrainerState.ReadyForCleanup: # -> RESTART or TrainingFinished await self.clear_training() self.may_restart() async def prepare(self) -> None: previous_state = self.training.training_state - self.training.training_state = TrainingState.DataDownloading + self.training.training_state = TrainerState.DataDownloading error_key = 'prepare' try: await self._prepare() @@ -167,7 +174,7 @@ async def prepare(self) -> None: 
self.errors.set(error_key, str(e)) else: self.errors.reset(error_key) - self.training.training_state = TrainingState.DataDownloaded + self.training.training_state = TrainerState.DataDownloaded self.node.last_training_io.save(self.training) async def _prepare(self) -> None: @@ -181,7 +188,7 @@ async def _prepare(self) -> None: async def download_model(self) -> None: logging.info('Downloading model') previous_state = self.training.training_state - self.training.training_state = TrainingState.TrainModelDownloading + self.training.training_state = TrainerState.TrainModelDownloading error_key = 'download_model' try: await self._download_model() @@ -195,7 +202,7 @@ async def download_model(self) -> None: else: self.errors.reset(error_key) logging.info('download_model_task finished') - self.training.training_state = TrainingState.TrainModelDownloaded + self.training.training_state = TrainerState.TrainModelDownloaded self.node.last_training_io.save(self.training) async def _download_model(self) -> None: @@ -218,7 +225,7 @@ async def train(self) -> None: self.errors.reset(error_key) previous_state = self.training.training_state self._executor = Executor(self.training.training_folder) - self.training.training_state = TrainingState.TrainingRunning + self.training.training_state = TrainerState.TrainingRunning try: await self._start_training() @@ -263,7 +270,7 @@ async def train(self) -> None: self.training.training_state = previous_state logging.exception('Error in run_training') else: - self.training.training_state = TrainingState.TrainingFinished + self.training.training_state = TrainerState.TrainingFinished self.node.last_training_io.save(self.training) async def _start_training(self): @@ -283,7 +290,7 @@ async def _start_training(self): async def ensure_confusion_matrix_synced(self): logging.info('Ensure syncing confusion matrix') previous_state = self.training.training_state - self.training.training_state = TrainingState.ConfusionMatrixSyncing + 
self.training.training_state = TrainerState.ConfusionMatrixSyncing try: await self.sync_confusion_matrix() except asyncio.CancelledError: @@ -293,7 +300,7 @@ async def ensure_confusion_matrix_synced(self): logging.exception('Error in ensure_confusion_matrix_synced') self.training.training_state = previous_state else: - self.training.training_state = TrainingState.ConfusionMatrixSynced + self.training.training_state = TrainerState.ConfusionMatrixSynced self.node.last_training_io.save(self.training) async def sync_confusion_matrix(self): @@ -315,11 +322,11 @@ async def sync_confusion_matrix(self): async def upload_model(self) -> None: error_key = 'upload_model' previous_state = self.training.training_state - self.training.training_state = TrainingState.TrainModelUploading + self.training.training_state = TrainerState.TrainModelUploading try: new_model_id = await self._upload_model_return_new_id(self.training.context) if new_model_id is None: - self.training.training_state = TrainingState.ReadyForCleanup + self.training.training_state = TrainerState.ReadyForCleanup logging.error('could not upload model - maybe training failed.. 
cleaning up') return assert new_model_id is not None, 'uploaded_model must be set' @@ -335,7 +342,7 @@ async def upload_model(self) -> None: # self.training.training_state = TrainingState.ReadyForCleanup else: self.errors.reset(error_key) - self.training.training_state = TrainingState.TrainModelUploaded + self.training.training_state = TrainerState.TrainModelUploaded self.node.last_training_io.save(self.training) async def _upload_model_return_new_id(self, context: Context) -> Optional[str]: @@ -377,7 +384,7 @@ async def do_detections(self): error_key = 'detecting' previous_state = self.training.training_state try: - self.training.training_state = TrainingState.Detecting + self.training.training_state = TrainerState.Detecting await self._do_detections() except asyncio.CancelledError: logging.warning('CancelledError in do_detections') @@ -388,7 +395,7 @@ async def do_detections(self): self.training.training_state = previous_state else: self.errors.reset(error_key) - self.training.training_state = TrainingState.Detected + self.training.training_state = TrainerState.Detected self.node.last_training_io.save(self.training) async def _do_detections(self) -> None: @@ -439,7 +446,7 @@ async def _do_detections(self) -> None: async def upload_detections(self): error_key = 'upload_detections' previous_state = self.training.training_state - self.training.training_state = TrainingState.DetectionUploading + self.training.training_state = TrainerState.DetectionUploading await asyncio.sleep(0.1) # NOTE needed for tests try: json_files = self.active_training_io.get_detection_file_names() @@ -460,7 +467,7 @@ async def upload_detections(self): self.training.training_state = previous_state else: self.errors.reset(error_key) - self.training.training_state = TrainingState.ReadyForCleanup + self.training.training_state = TrainerState.ReadyForCleanup self.node.last_training_io.save(self.training) async def _upload_detections_batched(self, context: Context, detections: List[Detections]): 
@@ -540,11 +547,11 @@ def general_progress(self) -> Optional[float]: return None t_state = self.training.training_state - if t_state == TrainingState.DataDownloading: + if t_state == TrainerState.DataDownloading: return self.node.data_exchanger.progress - if t_state == TrainingState.TrainingRunning: + if t_state == TrainerState.TrainingRunning: return self.training_progress - if t_state == TrainingState.Detecting: + if t_state == TrainerState.Detecting: return self.detection_progress return None diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py index d26831a2..219a48ca 100644 --- a/learning_loop_node/trainer/trainer_node.py +++ b/learning_loop_node/trainer/trainer_node.py @@ -7,7 +7,7 @@ from fastapi.encoders import jsonable_encoder from socketio import AsyncClient -from ..data_classes import Context, NodeState, TrainingState, TrainingStatus +from ..data_classes import Context, NodeState, TrainerState, TrainingStatus from ..data_classes.socket_response import SocketResponse from ..node import Node from .io_helpers import LastTrainingIO @@ -18,7 +18,7 @@ class TrainerNode(Node): def __init__(self, name: str, trainer_logic: TrainerLogic, uuid: Optional[str] = None, use_backdoor_controls: bool = False): - super().__init__(name, uuid) + super().__init__(name, uuid, 'trainer') trainer_logic._node = self # pylint: disable=protected-access self.trainer_logic = trainer_logic self.last_training_io = LastTrainingIO(self.uuid) @@ -65,7 +65,6 @@ def register_sio_events(self, sio_client: AsyncClient): @sio_client.event async def begin_training(organization: str, project: str, details: Dict): - assert self._sio_client is not None self.log.info('received begin_training from server') self.trainer_logic.init_new_training(Context(organization=organization, project=project), details) asyncio.get_event_loop().create_task(self.trainer_logic.run()) @@ -81,20 +80,13 @@ async def stop_training(): return True async def send_status(self): - 
if self._sio_client is None or not self._sio_client.connected: + if not self.sio_client.connected: self.log.warning('cannot send status - not connected to the Learning Loop') return - if not self.trainer_logic.is_initialized: - state_for_learning_loop = str(NodeState.Idle.value) - else: - assert self.trainer_logic.training.training_state is not None - state_for_learning_loop = TrainerNode.state_for_learning_loop( - self.trainer_logic.training.training_state) - status = TrainingStatus(id=self.uuid, name=self.name, - state=state_for_learning_loop, + state=self.trainer_logic.state, errors={}, uptime=self.training_uptime, progress=self.progress) @@ -111,12 +103,10 @@ async def send_status(self): status.context = self.trainer_logic.training.context self.log.info(f'sending status: {status.short_str()}') - result = await self._sio_client.call('update_trainer', jsonable_encoder(asdict(status)), timeout=30) + result = await self.sio_client.call('update_trainer', jsonable_encoder(asdict(status)), timeout=30) assert isinstance(result, Dict) - response = from_dict(data_class=SocketResponse, data=result) - - if not response.success: - self.log.error(f'Error when sending status update: Response from loop was:\n {asdict(response)}') + if not result['success']: + self.log.error(f'Error when sending status update: Response from loop was:\n {result}') async def continue_run_if_incomplete(self) -> bool: if not self.trainer_logic.is_initialized and self.last_training_io.exists(): @@ -126,43 +116,40 @@ async def continue_run_if_incomplete(self) -> bool: return True return False - def get_node_type(self): - return 'trainer' - # --------------------------------------------------- HELPER --------------------------------------------------- @staticmethod - def state_for_learning_loop(trainer_state: Union[TrainingState, str]) -> str: - if trainer_state == TrainingState.Initialized: + def state_for_learning_loop(trainer_state: Union[TrainerState, str]) -> str: + if trainer_state == 
TrainerState.Initialized: return 'Training is initialized' - if trainer_state == TrainingState.DataDownloading: + if trainer_state == TrainerState.DataDownloading: return 'Downloading data' - if trainer_state == TrainingState.DataDownloaded: + if trainer_state == TrainerState.DataDownloaded: return 'Data downloaded' - if trainer_state == TrainingState.TrainModelDownloading: + if trainer_state == TrainerState.TrainModelDownloading: return 'Downloading model' - if trainer_state == TrainingState.TrainModelDownloaded: + if trainer_state == TrainerState.TrainModelDownloaded: return 'Model downloaded' - if trainer_state == TrainingState.TrainingRunning: + if trainer_state == TrainerState.TrainingRunning: return NodeState.Running - if trainer_state == TrainingState.TrainingFinished: + if trainer_state == TrainerState.TrainingFinished: return 'Training finished' - if trainer_state == TrainingState.Detecting: + if trainer_state == TrainerState.Detecting: return NodeState.Detecting - if trainer_state == TrainingState.ConfusionMatrixSyncing: + if trainer_state == TrainerState.ConfusionMatrixSyncing: return 'Syncing confusion matrix' - if trainer_state == TrainingState.ConfusionMatrixSynced: + if trainer_state == TrainerState.ConfusionMatrixSynced: return 'Confusion matrix synced' - if trainer_state == TrainingState.TrainModelUploading: + if trainer_state == TrainerState.TrainModelUploading: return 'Uploading trained model' - if trainer_state == TrainingState.TrainModelUploaded: + if trainer_state == TrainerState.TrainModelUploaded: return 'Trained model uploaded' - if trainer_state == TrainingState.Detecting: + if trainer_state == TrainerState.Detecting: return 'calculating detections' - if trainer_state == TrainingState.Detected: + if trainer_state == TrainerState.Detected: return 'Detections calculated' - if trainer_state == TrainingState.DetectionUploading: + if trainer_state == TrainerState.DetectionUploading: return 'Uploading detections' - if trainer_state == 
TrainingState.ReadyForCleanup: + if trainer_state == TrainerState.ReadyForCleanup: return 'Cleaning training' return 'unknown state' From 00482753112ea41f5ed4112bd27e3b90b9216c8b Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Thu, 7 Mar 2024 13:26:51 +0100 Subject: [PATCH 08/62] Further cleanup and make annotation node send status once. --- .../annotation/annotator_node.py | 17 ++++++-- learning_loop_node/node.py | 3 +- learning_loop_node/trainer/trainer_node.py | 42 +------------------ mock_annotator/app_code/restart/restart.py | 2 + mock_annotator/start.sh | 2 +- 5 files changed, 19 insertions(+), 47 deletions(-) create mode 100644 mock_annotator/app_code/restart/restart.py diff --git a/learning_loop_node/annotation/annotator_node.py b/learning_loop_node/annotation/annotator_node.py index d12bcc0f..94848506 100644 --- a/learning_loop_node/annotation/annotator_node.py +++ b/learning_loop_node/annotation/annotator_node.py @@ -22,6 +22,7 @@ def __init__(self, name: str, annotator_logic: AnnotatorLogic, uuid: Optional[st self.tool = annotator_logic self.histories: Dict = {} annotator_logic.init(self) + self.status_sent = False def register_sio_events(self, sio_client: AsyncClient): @@ -65,6 +66,9 @@ def get_history(self, frontend_id: str) -> Dict: return self.histories.setdefault(frontend_id, self.tool.create_empty_history()) async def send_status(self): + if self.status_sent: + return + status = AnnotationNodeStatus( id=self.uuid, name=self.name, @@ -73,14 +77,19 @@ async def send_status(self): ) self.log.info(f'Sending status {status}') - if self._sio_client is None: - raise Exception('No socket client') - result = await self._sio_client.call('update_annotation_node', jsonable_encoder(asdict(status)), timeout=10) + try: + result = await self.sio_client.call('update_annotation_node', jsonable_encoder(asdict(status)), timeout=10) + except Exception as e: + self.log.error(f'Error for updating: {str(e)}') + return + assert isinstance(result, Dict) response = 
from_dict(data_class=SocketResponse, data=result) if not response.success: self.log.error(f'Error for updating: Response from loop was : {asdict(response)}') + else: + self.status_sent = True async def download_image(self, context: Context, uuid: str): project_folder = create_project_folder(context) @@ -96,4 +105,4 @@ async def on_shutdown(self): pass async def on_repeat(self): - pass + await self.send_status() diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py index 85a81fd6..38742fa4 100644 --- a/learning_loop_node/node.py +++ b/learning_loop_node/node.py @@ -162,7 +162,8 @@ async def connect_sio(self): @abstractmethod async def on_startup(self): - """This method is called when the node is started.""" + """This method is called when the node is started. + Note: In this method the sio connection is not yet established!""" @abstractmethod async def on_shutdown(self): diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py index 219a48ca..ae8f2527 100644 --- a/learning_loop_node/trainer/trainer_node.py +++ b/learning_loop_node/trainer/trainer_node.py @@ -3,12 +3,10 @@ from dataclasses import asdict from typing import Dict, Optional, Union -from dacite import from_dict from fastapi.encoders import jsonable_encoder from socketio import AsyncClient -from ..data_classes import Context, NodeState, TrainerState, TrainingStatus -from ..data_classes.socket_response import SocketResponse +from ..data_classes import Context, TrainingStatus from ..node import Node from .io_helpers import LastTrainingIO from .rest import backdoor_controls, controls @@ -115,41 +113,3 @@ async def continue_run_if_incomplete(self) -> bool: asyncio.get_event_loop().create_task(self.trainer_logic.run()) return True return False - - # --------------------------------------------------- HELPER --------------------------------------------------- - - @staticmethod - def state_for_learning_loop(trainer_state: Union[TrainerState, str]) -> str: 
- if trainer_state == TrainerState.Initialized: - return 'Training is initialized' - if trainer_state == TrainerState.DataDownloading: - return 'Downloading data' - if trainer_state == TrainerState.DataDownloaded: - return 'Data downloaded' - if trainer_state == TrainerState.TrainModelDownloading: - return 'Downloading model' - if trainer_state == TrainerState.TrainModelDownloaded: - return 'Model downloaded' - if trainer_state == TrainerState.TrainingRunning: - return NodeState.Running - if trainer_state == TrainerState.TrainingFinished: - return 'Training finished' - if trainer_state == TrainerState.Detecting: - return NodeState.Detecting - if trainer_state == TrainerState.ConfusionMatrixSyncing: - return 'Syncing confusion matrix' - if trainer_state == TrainerState.ConfusionMatrixSynced: - return 'Confusion matrix synced' - if trainer_state == TrainerState.TrainModelUploading: - return 'Uploading trained model' - if trainer_state == TrainerState.TrainModelUploaded: - return 'Trained model uploaded' - if trainer_state == TrainerState.Detecting: - return 'calculating detections' - if trainer_state == TrainerState.Detected: - return 'Detections calculated' - if trainer_state == TrainerState.DetectionUploading: - return 'Uploading detections' - if trainer_state == TrainerState.ReadyForCleanup: - return 'Cleaning training' - return 'unknown state' diff --git a/mock_annotator/app_code/restart/restart.py b/mock_annotator/app_code/restart/restart.py new file mode 100644 index 00000000..915175ed --- /dev/null +++ b/mock_annotator/app_code/restart/restart.py @@ -0,0 +1,2 @@ +# add 'reload_dirs=['./app_code/restart'] to uvicorn call in main.py +# save this file to trigger uvicorn restart diff --git a/mock_annotator/start.sh b/mock_annotator/start.sh index e6d3aaac..7814999d 100755 --- a/mock_annotator/start.sh +++ b/mock_annotator/start.sh @@ -5,5 +5,5 @@ if [[ $1 = "debug" ]]; then elif [[ $1 = "profile" ]]; then kernprof -l /app/main.py else - python3 /app/main.py + 
uvicorn main:node --host 0.0.0.0 --port 80 --reload --lifespan on --reload-dir /app/app_code/restart fi \ No newline at end of file From 17eb503bbfb3017982f9fb134d37ac4242b8b3d2 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Thu, 7 Mar 2024 15:24:54 +0100 Subject: [PATCH 09/62] change value of trainer state to old version, so old trainers still work in the loop --- learning_loop_node/data_classes/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index b78190ef..dd9fb624 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -49,7 +49,7 @@ class TrainerState(str, Enum): DataDownloaded = 'data_downloaded' TrainModelDownloading = 'train_model_downloading' TrainModelDownloaded = 'train_model_downloaded' - TrainingRunning = 'training_running' + TrainingRunning = 'running' TrainingFinished = 'training_finished' ConfusionMatrixSyncing = 'confusion_matrix_syncing' ConfusionMatrixSynced = 'confusion_matrix_synced' From 68a0eb56120df6ebf127b9f53573fb68a6f0e332 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Thu, 7 Mar 2024 16:48:44 +0100 Subject: [PATCH 10/62] Refactoring: use enums --- learning_loop_node/data_classes/training.py | 4 +- .../tests/states/test_state_detecting.py | 6 +- .../states/test_state_download_train_model.py | 11 +-- .../tests/states/test_state_prepare.py | 12 +-- .../test_state_sync_confusion_matrix.py | 25 ++++--- .../trainer/tests/states/test_state_train.py | 25 ++++--- .../states/test_state_upload_detections.py | 29 +++---- .../tests/states/test_state_upload_model.py | 18 ++--- .../trainer/tests/test_trainer_states.py | 2 +- learning_loop_node/trainer/trainer_logic.py | 61 +++++++++------ .../trainer/trainer_logic_abstraction.py | 75 +++++++++++++++++++ learning_loop_node/trainer/trainer_node.py | 57 +++++--------- 12 files changed, 200 insertions(+), 125 deletions(-) create mode 100644 learning_loop_node/trainer/trainer_logic_abstraction.py diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index dd9fb624..9a41928f 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -65,7 +65,7 @@ class TrainerState(str, Enum): class TrainingStatus(): id: str # TODO this must not be changed, but tests wont detect it -> update tests! 
name: str - state: Union[Optional[TrainerState], str] + state: Optional[TrainerState] errors: Optional[Dict] uptime: Optional[float] progress: Optional[float] @@ -99,7 +99,7 @@ class Training(): base_model_id: Optional[str] = None data: Optional[TrainingData] = None training_number: Optional[int] = None - training_state: Optional[Union[TrainerState, str]] = None + training_state: Optional[TrainerState] = None model_id_for_detecting: Optional[str] = None hyperparameters: Optional[Dict] = None diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py index a0ad04d7..5d7583fe 100644 --- a/learning_loop_node/trainer/tests/states/test_state_detecting.py +++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py @@ -24,7 +24,7 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi await assert_training_state(trainer.training, 'detected', timeout=10, interval=0.001) assert trainer_has_error(trainer) is False - assert trainer.training.training_state == 'detected' + assert trainer.training.training_state == TrainerState.Detected assert trainer.node.last_training_io.load() == trainer.training assert trainer.active_training_io.detections_exist() @@ -48,7 +48,7 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer async def test_model_not_downloadable_error(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='train_model_uploaded', + create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded, model_id_for_detecting='00000000-0000-0000-0000-000000000000') # bad model id trainer.init_from_last_training() @@ -58,7 +58,7 @@ async def test_model_not_downloadable_error(test_initialized_trainer: TestingTra await assert_training_state(trainer.training, 'train_model_uploaded', timeout=1, interval=0.001) assert 
trainer_has_error(trainer) - assert trainer.training.training_state == 'train_model_uploaded' + assert trainer.training.training_state == TrainerState.TrainModelUploaded assert trainer.training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000' assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py index 687e5060..5785e5fa 100644 --- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py @@ -2,13 +2,14 @@ import asyncio import os +from learning_loop_node.data_classes import TrainerState from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic async def test_downloading_is_successful(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='data_downloaded') + create_active_training_file(trainer, training_state=TrainerState.DataDownloaded) trainer.model_format = 'mocked' trainer.init_from_last_training() @@ -17,7 +18,7 @@ async def test_downloading_is_successful(test_initialized_trainer: TestingTraine await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001) await assert_training_state(trainer.training, 'train_model_downloaded', timeout=1, interval=0.001) - assert trainer.training.training_state == 'train_model_downloaded' + assert trainer.training.training_state == TrainerState.TrainModelDownloaded assert trainer.node.last_training_io.load() == trainer.training # file on disk @@ -43,15 +44,15 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogi async def 
test_downloading_failed(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='data_downloaded', + create_active_training_file(trainer, training_state=TrainerState.DataDownloaded, base_model_id='00000000-0000-0000-0000-000000000000') # bad model id) trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001) - await assert_training_state(trainer.training, 'data_downloaded', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.DataDownloaded, timeout=1, interval=0.001) assert trainer.errors.has_error_for('download_model') assert trainer._training is not None # pylint: disable=protected-access - assert trainer.training.training_state == 'data_downloaded' + assert trainer.training.training_state == TrainerState.DataDownloaded assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_prepare.py b/learning_loop_node/trainer/tests/states/test_state_prepare.py index 9d2eedcc..261fbb70 100644 --- a/learning_loop_node/trainer/tests/states/test_state_prepare.py +++ b/learning_loop_node/trainer/tests/states/test_state_prepare.py @@ -1,6 +1,6 @@ import asyncio -from learning_loop_node.data_classes import Context +from learning_loop_node.data_classes import Context, TrainerState from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic @@ -19,7 +19,7 @@ async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerL await trainer.prepare() assert trainer_has_error(trainer) is False - assert trainer.training.training_state == 'data_downloaded' + 
assert trainer.training.training_state == TrainerState.DataDownloaded assert trainer.training.data is not None assert trainer.node.last_training_io.load() == trainer.training @@ -30,7 +30,7 @@ async def test_abort_preparing(test_initialized_trainer: TestingTrainerLogic): trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'data_downloading', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -46,10 +46,10 @@ async def test_request_error(test_initialized_trainer: TestingTrainerLogic): trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'data_downloading', timeout=3, interval=0.001) - await assert_training_state(trainer.training, 'initialized', timeout=3, interval=0.001) + await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=3, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Initialized, timeout=3, interval=0.001) assert trainer_has_error(trainer) assert trainer._training is not None # pylint: disable=protected-access - assert trainer.training.training_state == 'initialized' + assert trainer.training.training_state == TrainerState.Initialized assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py index b6cce7c2..51fec3ff 100644 --- a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py +++ b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py @@ -3,6 +3,7 @@ from pytest_mock import MockerFixture # pip install pytest-mock +from learning_loop_node.data_classes import TrainerState from 
learning_loop_node.trainer.trainer_logic import TrainerLogic from learning_loop_node.trainer.trainer_node import TrainerNode @@ -21,14 +22,14 @@ async def test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic): # TODO this requires trainer to have _training # trainer.load_active_training() - create_active_training_file(trainer, training_state='training_finished') + create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'confusion_matrix_synced', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) assert trainer_has_error(trainer) is False - assert trainer.training.training_state == 'confusion_matrix_synced' + assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced assert trainer.node.last_training_io.load() == trainer.training @@ -37,16 +38,16 @@ async def test_unsynced_model_available__sync_successful(test_initialized_traine assert isinstance(trainer, TestingTrainerLogic) await mock_socket_io_call(mocker, test_initialized_trainer_node, {'success': True}) - create_active_training_file(trainer, training_state='training_finished') + create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) trainer.init_from_last_training() trainer.has_new_model = True _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'confusion_matrix_synced', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) assert trainer_has_error(trainer) is False -# assert trainer.training.training_state == 'confusion_matrix_synced' +# assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced assert trainer.node.last_training_io.load() == 
trainer.training @@ -54,7 +55,7 @@ async def test_unsynced_model_available__sio_not_connected(test_initialized_trai trainer = test_initialized_trainer_node.trainer_logic assert isinstance(trainer, TestingTrainerLogic) - create_active_training_file(trainer, training_state='training_finished') + create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) assert test_initialized_trainer_node.sio_client.connected is False trainer.has_new_model = True @@ -62,10 +63,10 @@ async def test_unsynced_model_available__sio_not_connected(test_initialized_trai _ = asyncio.get_running_loop().create_task(trainer.run()) await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001) - await assert_training_state(trainer.training, 'training_finished', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.training.training_state == 'training_finished' + assert trainer.training.training_state == TrainerState.TrainingFinished assert trainer.node.last_training_io.load() == trainer.training @@ -75,16 +76,16 @@ async def test_unsynced_model_available__request_is_not_successful(test_initiali await mock_socket_io_call(mocker, test_initialized_trainer_node, {'success': False}) - create_active_training_file(trainer, training_state='training_finished') + create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) trainer.has_new_model = True _ = asyncio.get_running_loop().create_task(trainer.run()) await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001) - await assert_training_state(trainer.training, 'training_finished', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.training.training_state == 
'training_finished' + assert trainer.training.training_state == TrainerState.TrainingFinished assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py index c46294ba..9d6b31f4 100644 --- a/learning_loop_node/trainer/tests/states/test_state_train.py +++ b/learning_loop_node/trainer/tests/states/test_state_train.py @@ -1,5 +1,6 @@ import asyncio +from learning_loop_node.data_classes import TrainerState from learning_loop_node.tests.test_helper import condition from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic @@ -8,41 +9,41 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='train_model_downloaded') + create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'start_training' # pylint: disable=protected-access assert trainer._executor is not None trainer._executor.stop() # NOTE normally a training terminates itself - await assert_training_state(trainer.training, 'training_finished', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.training.training_state == 'training_finished' + assert trainer.training.training_state == TrainerState.TrainingFinished assert 
trainer.node.last_training_io.load() == trainer.training async def test_stop_running_training(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='train_model_downloaded') + create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) # pylint: disable=protected-access - await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'start_training' await trainer.stop() - await assert_training_state(trainer.training, 'training_finished', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.training.training_state == 'training_finished' + assert trainer.training.training_state == TrainerState.TrainingFinished assert trainer.node.last_training_io.load() == trainer.training @@ -50,21 +51,21 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain trainer = test_initialized_trainer # NOTE e.g. 
when a node-computer is restarted - create_active_training_file(trainer, training_state='train_model_downloaded') + create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) trainer.init_from_last_training() trainer._can_resume = True # pylint: disable=protected-access _ = asyncio.get_running_loop().create_task(trainer.run()) await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) # pylint: disable=protected-access - await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'resume' # pylint: disable=protected-access assert trainer._executor is not None trainer._executor.stop() # NOTE normally a training terminates itself e.g - await assert_training_state(trainer.training, 'training_finished', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.training.training_state == 'training_finished' + assert trainer.training.training_state == TrainerState.TrainingFinished assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py index ca6912d1..a6f69c56 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py @@ -1,3 +1,4 @@ +from learning_loop_node.data_classes import TrainerState import asyncio import pytest @@ -43,13 +44,13 @@ async def create_valid_detection_file(trainer: TrainerLogic, number_of_entries: @pytest.mark.asyncio async def test_upload_successful(test_initialized_trainer: 
TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='detected') + create_active_training_file(trainer, training_state=TrainerState.Detected) trainer.init_from_last_training() await create_valid_detection_file(trainer) await trainer.upload_detections() - assert trainer.training.training_state == 'ready_for_cleanup' + assert trainer.training.training_state == TrainerState.ReadyForCleanup assert trainer.node.last_training_io.load() == trainer.training @@ -57,7 +58,7 @@ async def test_upload_successful(test_initialized_trainer: TestingTrainerLogic): async def test_detection_upload_progress_is_stored(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='detected') + create_active_training_file(trainer, training_state=TrainerState.Detected) trainer.init_from_last_training() await create_valid_detection_file(trainer) @@ -72,7 +73,7 @@ async def test_detection_upload_progress_is_stored(test_initialized_trainer: Tes async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='detected') + create_active_training_file(trainer, training_state=TrainerState.Detected) trainer.init_from_last_training() await create_valid_detection_file(trainer, 2, 0) @@ -114,17 +115,17 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test async def test_bad_status_from_LearningLoop(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='detected', context=Context( + create_active_training_file(trainer, training_state=TrainerState.Detected, context=Context( organization='zauberzeug', project='some_bad_project')) trainer.init_from_last_training() trainer.active_training_io.save_detections([get_dummy_detections()]) _ = 
asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'detection_uploading', timeout=1, interval=0.001) - await assert_training_state(trainer.training, 'detected', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.training.training_state == 'detected' + assert trainer.training.training_state == TrainerState.Detected assert trainer.node.last_training_io.load() == trainer.training @@ -132,28 +133,28 @@ async def test_other_errors(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer # e.g. missing detection file - create_active_training_file(trainer, training_state='detected') + create_active_training_file(trainer, training_state=TrainerState.Detected) trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'detection_uploading', timeout=1, interval=0.001) - await assert_training_state(trainer.training, 'detected', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.training.training_state == 'detected' + assert trainer.training.training_state == TrainerState.Detected assert trainer.node.last_training_io.load() == trainer.training async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='detected') + create_active_training_file(trainer, training_state=TrainerState.Detected) trainer.init_from_last_training() await 
create_valid_detection_file(trainer) _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'detection_uploading', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py index 41a5a4a8..efc40010 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py @@ -2,7 +2,7 @@ from pytest_mock import MockerFixture -from learning_loop_node.data_classes import Context +from learning_loop_node.data_classes import Context, TrainerState from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic @@ -23,11 +23,11 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer train_task = asyncio.get_running_loop().create_task(trainer.upload_model()) - await assert_training_state(trainer.training, 'train_model_uploading', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) await train_task assert trainer_has_error(trainer) is False - assert trainer.training.training_state == 'train_model_uploaded' + assert trainer.training.training_state == TrainerState.TrainModelUploaded assert trainer.training.model_id_for_detecting is not None assert trainer.node.last_training_io.load() == trainer.training @@ -35,12 +35,12 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer async def test_abort_upload_model(test_initialized_trainer: 
TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='confusion_matrix_synced') + create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced) trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'train_model_uploading', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -55,17 +55,17 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai The training should be aborted and the training state should be set to confusion_matrix_synced.""" trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='confusion_matrix_synced') + create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced) trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'train_model_uploading', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) # TODO goes to finished because of the error - await assert_training_state(trainer.training, 'confusion_matrix_synced', timeout=2, interval=0.001) + await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001) assert trainer_has_error(trainer) - assert trainer.training.training_state == 'confusion_matrix_synced' + assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced assert trainer.training.model_id_for_detecting is None assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/test_trainer_states.py b/learning_loop_node/trainer/tests/test_trainer_states.py index 74e630d1..c5f2d04e 
100644 --- a/learning_loop_node/trainer/tests/test_trainer_states.py +++ b/learning_loop_node/trainer/tests/test_trainer_states.py @@ -30,4 +30,4 @@ def test_save_load_training(): last_training_io.save(training) training = last_training_io.load() - assert training.training_state == 'preparing' + assert training.training_state == TrainerState.Preparing diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index dedaa9e6..717e9c8f 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -25,12 +25,10 @@ from .downloader import TrainingsDownloader from .executor import Executor from .io_helpers import ActiveTrainingIO +from .trainer_logic_abstraction import TrainerLogicAbstraction -if TYPE_CHECKING: - from .trainer_node import TrainerNode - -class TrainerLogic(): +class TrainerLogic(TrainerLogicAbstraction): def __init__(self, model_format: str) -> None: self.model_format: str = model_format @@ -44,12 +42,15 @@ def __init__(self, model_format: str) -> None: self._training: Optional[Training] = None self._active_training_io: Optional[ActiveTrainingIO] = None - self._node: Optional[TrainerNode] = None self.restart_after_training = os.environ.get('RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1'] self.keep_old_trainings = os.environ.get('KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1'] self.inference_batch_size = int(os.environ.get('INFERENCE_BATCH_SIZE', '10')) logging.info(f'INFERENCE_BATCH_SIZE: {self.inference_batch_size}') + @property + def training_uptime(self) -> Union[float, None]: + return time.time() - self.start_time if self.start_time else None + @property def executor(self) -> Executor: assert self._executor is not None, 'executor must be set, call `run_training` first' @@ -66,22 +67,28 @@ def active_training_io(self) -> ActiveTrainingIO: return self._active_training_io @property - def node(self) -> 'TrainerNode': - assert self._node is not 
None, 'node should be set by TrainerNodes before initialization' - return self._node - - @property - def is_initialized(self) -> bool: + def training_active(self) -> bool: """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'""" return self._training is not None and self._active_training_io is not None and self._node is not None @property - def state(self) -> str: - if (not self.is_initialized) or (self.training.training_state is None): - return TrainerState.Idle.value + def state(self) -> TrainerState: + if (not self.training_active) or (self.training.training_state is None): + return TrainerState.Idle else: - state = self.training.training_state - return state.value if isinstance(state, TrainerState) else state + return self.training.training_state + + @property + def training_data(self) -> TrainingData | None: + if self.training_active and self.training.data: + return self.training.data + return None + + @property + def training_context(self) -> Context | None: + if self.training_active: + return self.training.context + return None def init_new_training(self, context: Context, details: Dict) -> None: """Called on `begin_training` event from the Learning Loop. 
@@ -108,13 +115,25 @@ def init_from_last_training(self) -> None: assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder' self._active_training_io = ActiveTrainingIO(self._training.training_folder) + async def continue_run_if_incomplete(self) -> bool: + if not self.training_active and self.node.last_training_io.exists(): + logging.info('found incomplete training, continuing now.') + self.init_from_last_training() + asyncio.get_event_loop().create_task(self.run()) + return True + return False + + async def begin_training(self, organization: str, project: str, details: Dict) -> None: + self.init_new_training(Context(organization=organization, project=project), details) + asyncio.get_event_loop().create_task(self.run()) + async def run(self) -> None: """Called on `begin_training` event from the Learning Loop.""" self.start_time = time.time() self.errors.reset_all() try: - self.training_task = asyncio.get_running_loop().create_task(self._run()) + self.training_task = asyncio.get_running_loop().create_task(self._run_training_loop()) await self.training_task # Object is used to potentially cancel the task except asyncio.CancelledError: if not self.shutdown_event.is_set(): @@ -130,10 +149,10 @@ async def run(self) -> None: # ---------------------------------------- TRAINING STATES ---------------------------------------- - async def _run(self) -> None: + async def _run_training_loop(self) -> None: """asyncio.CancelledError is catched in train""" - if not self.is_initialized: + if not self.training_active: logging.error('could not start training - trainer is not initialized') return @@ -511,7 +530,7 @@ async def clear_training(self): async def stop(self) -> None: """If executor is running, stop it. 
Else cancel training task.""" - if not self.is_initialized: + if not self.training_active: return if self._executor and self._executor.is_process_running(): self.executor.stop() @@ -543,7 +562,7 @@ def may_restart(self) -> None: @property def general_progress(self) -> Optional[float]: """Represents the progress for different states.""" - if not self.is_initialized: + if not self.training_active: return None t_state = self.training.training_state diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py new file mode 100644 index 00000000..a4d1f39b --- /dev/null +++ b/learning_loop_node/trainer/trainer_logic_abstraction.py @@ -0,0 +1,75 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, List, Optional + +from ..data_classes import Context, Errors, PretrainedModel, TrainerState, TrainingData + +if TYPE_CHECKING: + from .trainer_node import TrainerNode + + +class TrainerLogicAbstraction(ABC): + + def __init__(self): + self._node: Optional['TrainerNode'] = None # type: ignore + self.errors = Errors() + + @property + def node(self) -> 'TrainerNode': + assert self._node is not None, 'node should be set by TrainerNodes before initialization' + return self._node + + @property + @abstractmethod + def state(self) -> TrainerState: + """Returns the current state of the training logic""" + + @property + @abstractmethod + def training_uptime(self) -> float | None: + """Returns the time in seconds since the training started or None if idle""" + + @property + @abstractmethod + def general_progress(self) -> float | None: + """Returns the general progress of the training per state or None if idle""" + + @property + @abstractmethod + def provided_pretrained_models(self) -> List[PretrainedModel]: + """Returns the list of provided pretrained models""" + + @property + @abstractmethod + def model_architecture(self) -> str: + """Returns the architecture name of the model""" + + @property + 
@abstractmethod + def hyperparameters(self) -> dict | None: + """Returns the hyperparameters if available""" + + @property + @abstractmethod + def training_data(self) -> TrainingData | None: + """Returns the training data if available""" + + @property + @abstractmethod + def training_context(self) -> Context | None: + """Returns the training context if available""" + + @abstractmethod + async def begin_training(self, organization: str, project: str, details: dict): + """Starts the training process""" + + @abstractmethod + async def stop(self): + """Stops the training process""" + + @abstractmethod + async def shutdown(self): + """Stops the training process and releases resources""" + + @abstractmethod + async def continue_run_if_incomplete(self) -> bool: + """Continues the training if it is incomplete""" diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py index ae8f2527..fb191a9d 100644 --- a/learning_loop_node/trainer/trainer_node.py +++ b/learning_loop_node/trainer/trainer_node.py @@ -1,21 +1,20 @@ import asyncio -import time from dataclasses import asdict -from typing import Dict, Optional, Union +from typing import Dict, Optional from fastapi.encoders import jsonable_encoder from socketio import AsyncClient -from ..data_classes import Context, TrainingStatus +from ..data_classes import TrainingStatus from ..node import Node from .io_helpers import LastTrainingIO from .rest import backdoor_controls, controls -from .trainer_logic import TrainerLogic +from .trainer_logic_abstraction import TrainerLogicAbstraction class TrainerNode(Node): - def __init__(self, name: str, trainer_logic: TrainerLogic, uuid: Optional[str] = None, use_backdoor_controls: bool = False): + def __init__(self, name: str, trainer_logic: TrainerLogicAbstraction, uuid: Optional[str] = None, use_backdoor_controls: bool = False): super().__init__(name, uuid, 'trainer') trainer_logic._node = self # pylint: disable=protected-access self.trainer_logic 
= trainer_logic @@ -24,18 +23,7 @@ def __init__(self, name: str, trainer_logic: TrainerLogic, uuid: Optional[str] = if use_backdoor_controls: self.include_router(backdoor_controls.router, tags=["controls"]) - # --------------------------------------------------- STATUS --------------------------------------------------- - - @property - def progress(self) -> Union[float, None]: - return self.trainer_logic.general_progress if (self.trainer_logic is not None and - hasattr(self.trainer_logic, 'general_progress')) else None - - @property - def training_uptime(self) -> Union[float, None]: - return time.time() - self.trainer_logic.start_time if self.trainer_logic.start_time else None - - # ----------------------------------- LIVECYCLE: ABSTRACT NODE METHODS -------------------------- + # ----------------------------------- NODE LIVECYCLE METHODS -------------------------- async def on_startup(self): pass @@ -46,26 +34,24 @@ async def on_shutdown(self): async def on_repeat(self): try: - if await self.continue_run_if_incomplete(): + if await self.trainer_logic.continue_run_if_incomplete(): return # NOTE: we prevent sending idle status after starting a continuation await self.send_status() except Exception as e: if isinstance(e, asyncio.TimeoutError): self.log.warning('timeout when sending status to learning loop, reconnecting sio_client') - await self.sio_client.disconnect() - # NOTE: reconnect happens in node._on_repeat + await self.sio_client.disconnect() # NOTE: reconnect happens in node._on_repeat else: self.log.exception(f'could not send status state: {e}') - # ---------------------------------------------- NODE ABSTRACT METHODS --------------------------------------------------- + # ---------------------------------------------- NODE METHODS --------------------------------------------------- def register_sio_events(self, sio_client: AsyncClient): @sio_client.event async def begin_training(organization: str, project: str, details: Dict): self.log.info('received 
begin_training from server') - self.trainer_logic.init_new_training(Context(organization=organization, project=project), details) - asyncio.get_event_loop().create_task(self.trainer_logic.run()) + await self.trainer_logic.begin_training(organization, project, details) return True @sio_client.event @@ -86,30 +72,21 @@ async def send_status(self): name=self.name, state=self.trainer_logic.state, errors={}, - uptime=self.training_uptime, - progress=self.progress) + uptime=self.trainer_logic.training_uptime, + progress=self.trainer_logic.general_progress) status.pretrained_models = self.trainer_logic.provided_pretrained_models status.architecture = self.trainer_logic.model_architecture - if self.trainer_logic.is_initialized and self.trainer_logic.training.data: - status.train_image_count = self.trainer_logic.training.data.train_image_count() - status.test_image_count = self.trainer_logic.training.data.test_image_count() - status.skipped_image_count = self.trainer_logic.training.data.skipped_image_count + if data := self.trainer_logic.training_data: + status.train_image_count = data.train_image_count() + status.test_image_count = data.test_image_count() + status.skipped_image_count = data.skipped_image_count status.hyperparameters = self.trainer_logic.hyperparameters status.errors = self.trainer_logic.errors.errors - status.context = self.trainer_logic.training.context + status.context = self.trainer_logic.training_context self.log.info(f'sending status: {status.short_str()}') result = await self.sio_client.call('update_trainer', jsonable_encoder(asdict(status)), timeout=30) - assert isinstance(result, Dict) - if not result['success']: + if isinstance(result, Dict) and not result['success']: self.log.error(f'Error when sending status update: Response from loop was:\n {result}') - - async def continue_run_if_incomplete(self) -> bool: - if not self.trainer_logic.is_initialized and self.last_training_io.exists(): - self.log.info('found incomplete training, continuing now.') 
- self.trainer_logic.init_from_last_training() - asyncio.get_event_loop().create_task(self.trainer_logic.run()) - return True - return False From 3b908b31058f2166ce7018ed85a06504c3538acd Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Thu, 7 Mar 2024 16:50:37 +0100 Subject: [PATCH 11/62] introduce trainer_logic_abstraction --- learning_loop_node/trainer/trainer_logic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 717e9c8f..b08f1c87 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -10,7 +10,7 @@ from datetime import datetime from glob import glob from time import perf_counter -from typing import TYPE_CHECKING, Coroutine, Dict, List, Optional, Union +from typing import Coroutine, Dict, List, Optional, Union from uuid import uuid4 import socketio @@ -18,7 +18,7 @@ from fastapi.encoders import jsonable_encoder from tqdm import tqdm -from ..data_classes import (BasicModel, Category, Context, Detections, Errors, Hyperparameter, ModelInformation, +from ..data_classes import (BasicModel, Category, Context, Detections, Hyperparameter, ModelInformation, PretrainedModel, TrainerState, Training, TrainingData, TrainingError) from ..helpers.misc import create_image_folder, create_project_folder, generate_training, is_valid_uuid4 from . 
import training_syncronizer @@ -31,12 +31,12 @@ class TrainerLogic(TrainerLogicAbstraction): def __init__(self, model_format: str) -> None: + super().__init__() self.model_format: str = model_format self._executor: Optional[Executor] = None self.start_time: Optional[float] = None self.training_task: Optional[asyncio.Task] = None self.start_training_task: Optional[Coroutine] = None - self.errors = Errors() self.shutdown_event: asyncio.Event = asyncio.Event() self.detection_progress = 0.0 From c6eb38b127bb5d3dffb63d23ae5ca7c46ecb9174 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Thu, 7 Mar 2024 22:31:53 +0100 Subject: [PATCH 12/62] Remove lots of duplicate code in state processing - test locally green --- learning_loop_node/data_classes/training.py | 8 +- learning_loop_node/data_exchanger.py | 3 + .../tests/test_client_communication.py | 3 +- learning_loop_node/trainer/io_helpers.py | 64 ++- learning_loop_node/trainer/rest/controls.py | 2 +- .../trainer/tests/state_helper.py | 2 +- .../tests/states/test_state_detecting.py | 28 +- .../states/test_state_download_train_model.py | 29 +- .../tests/states/test_state_prepare.py | 18 +- .../test_state_sync_confusion_matrix.py | 26 +- .../trainer/tests/states/test_state_train.py | 25 +- .../states/test_state_upload_detections.py | 37 +- .../tests/states/test_state_upload_model.py | 22 +- .../trainer/tests/test_errors.py | 17 +- .../trainer/tests/testing_trainer_logic.py | 9 +- learning_loop_node/trainer/trainer_logic.py | 412 +++++------------- .../trainer/trainer_logic_abstraction.py | 136 ++++-- learning_loop_node/trainer/trainer_node.py | 8 +- .../trainer/training_syncronizer.py | 45 +- mock_trainer/app_code/progress_simulator.py | 4 +- .../app_code/tests/test_mock_trainer.py | 5 +- 21 files changed, 433 insertions(+), 470 deletions(-) diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index 9a41928f..a0601c2d 100644 --- 
a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -1,8 +1,9 @@ import sys +import time from dataclasses import dataclass, field from enum import Enum -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional # pylint: disable=no-name-in-module from .general import Category, Context @@ -65,7 +66,7 @@ class TrainerState(str, Enum): class TrainingStatus(): id: str # TODO this must not be changed, but tests wont detect it -> update tests! name: str - state: Optional[TrainerState] + state: Optional[str] errors: Optional[Dict] uptime: Optional[float] progress: Optional[float] @@ -95,11 +96,12 @@ class Training(): project_folder: str images_folder: str training_folder: str + start_time: float = field(default_factory=time.time) base_model_id: Optional[str] = None data: Optional[TrainingData] = None training_number: Optional[int] = None - training_state: Optional[TrainerState] = None + training_state: Optional[str] = None model_id_for_detecting: Optional[str] = None hyperparameters: Optional[Dict] = None diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index 6bb30e6d..ab53b243 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -21,6 +21,9 @@ def __init__(self, cause: str, *args: object) -> None: super().__init__(*args) self.cause = cause + def __str__(self) -> str: + return f'DownloadError: {self.cause}' + class DataExchanger(): diff --git a/learning_loop_node/detector/tests/test_client_communication.py b/learning_loop_node/detector/tests/test_client_communication.py index 16f0fa6b..97daf93a 100644 --- a/learning_loop_node/detector/tests/test_client_communication.py +++ b/learning_loop_node/detector/tests/test_client_communication.py @@ -2,7 +2,7 @@ import json import pytest -import requests +import requests # type: ignore from learning_loop_node import DetectorNode from learning_loop_node.data_classes import 
ModelInformation @@ -88,6 +88,7 @@ async def test_sio_upload(test_detector_node: DetectorNode, sio_client): assert len(get_outbox_files(test_detector_node.outbox)) == 2, 'There should be one image and one .json file.' +# NOTE: This test seems to be flaky. async def test_about_endpoint(test_detector_node: DetectorNode): await asyncio.sleep(1) response = requests.get(f'http://localhost:{GLOBALS.detector_port}/about', timeout=30) diff --git a/learning_loop_node/trainer/io_helpers.py b/learning_loop_node/trainer/io_helpers.py index 3755f2f2..6ec7a5c3 100644 --- a/learning_loop_node/trainer/io_helpers.py +++ b/learning_loop_node/trainer/io_helpers.py @@ -1,5 +1,6 @@ import json +import logging import os from dataclasses import asdict from pathlib import Path @@ -8,8 +9,9 @@ from dacite import from_dict from fastapi.encoders import jsonable_encoder -from ..data_classes import Detections, Training +from ..data_classes import Context, Detections, Training from ..globals import GLOBALS +from ..loop_communication import LoopCommunicator class LastTrainingIO: @@ -35,13 +37,16 @@ def exists(self) -> bool: class ActiveTrainingIO: - @staticmethod - def create_mocked_training_io() -> 'ActiveTrainingIO': - training_folder = '' - return ActiveTrainingIO(training_folder) + # @staticmethod + # def create_mocked_training_io() -> 'ActiveTrainingIO': + # training_folder = '' + # return ActiveTrainingIO(training_folder) - def __init__(self, training_folder: str): + def __init__(self, training_folder: str, loop_communicator: LoopCommunicator, context: Context) -> None: self.training_folder = training_folder + self.loop_communicator = loop_communicator + self.context = context + self.mup_path = f'{training_folder}/model_uploading_progress.txt' # string with placeholder gor index self.det_path = f'{training_folder}' + '/detections_{0}.json' @@ -63,13 +68,16 @@ def load_model_upload_progress(self) -> List[str]: # detections - def get_detection_file_names(self) -> List[Path]: + def 
_get_detection_file_names(self) -> List[Path]: files = [f for f in Path(self.training_folder).iterdir() if f.is_file() and f.name.startswith('detections_')] if not files: return [] return files + def get_number_of_detection_files(self) -> int: + return len(self._get_detection_file_names()) + # TODO: saving and uploading multiple files is not tested! def save_detections(self, detections: List[Detections], index: int = 0) -> None: with open(self.det_path.format(index), 'w') as f: @@ -81,11 +89,11 @@ def load_detections(self, index: int = 0) -> List[Detections]: return [from_dict(data_class=Detections, data=d) for d in dict_list] def delete_detections(self) -> None: - for file in self.get_detection_file_names(): + for file in self._get_detection_file_names(): os.remove(Path(self.training_folder) / file) def detections_exist(self) -> bool: - return bool(self.get_detection_file_names()) + return bool(self._get_detection_file_names()) # detections upload file index @@ -124,3 +132,41 @@ def delete_detection_upload_progress(self) -> None: def detection_upload_progress_exist(self) -> bool: return os.path.exists(self.dup_path) + + async def upload_detetions(self): + num_files = self.get_number_of_detection_files() + print(f'num_files: {num_files}', flush=True) + if not num_files: + raise Exception('no detection files found') + current_json_file_index = self.load_detections_upload_file_index() + for i in range(current_json_file_index, num_files): + detections = self.load_detections(i) + logging.info(f'uploading detections {i}/{num_files}') + await self._upload_detections_batched(self.context, detections) + self.save_detections_upload_file_index(i+1) + + async def _upload_detections_batched(self, context: Context, detections: List[Detections]): + batch_size = 10 + skip_detections = self.load_detection_upload_progress() + for i in range(skip_detections, len(detections), batch_size): + up_progress = i+batch_size + batch_detections = detections[i:up_progress] + dict_detections = 
[jsonable_encoder(asdict(detection)) for detection in batch_detections] + logging.info(f'uploading detections. File size : {len(json.dumps(dict_detections))}') + await self._upload_detections(context, batch_detections, up_progress) + skip_detections = up_progress + + async def _upload_detections(self, context: Context, batch_detections: List[Detections], up_progress: int): + detections_json = [jsonable_encoder(asdict(detections)) for detections in batch_detections] + response = await self.loop_communicator.post( + f'/{context.organization}/projects/{context.project}/detections', json=detections_json) + if response.status_code != 200: + msg = f'could not upload detections. {str(response)}' + logging.error(msg) + raise Exception(msg) + else: + logging.info('successfully uploaded detections') + if up_progress > len(batch_detections): + self.save_detection_upload_progress(0) + else: + self.save_detection_upload_progress(up_progress) diff --git a/learning_loop_node/trainer/rest/controls.py b/learning_loop_node/trainer/rest/controls.py index 17434d64..b8fbbec8 100644 --- a/learning_loop_node/trainer/rest/controls.py +++ b/learning_loop_node/trainer/rest/controls.py @@ -22,5 +22,5 @@ async def operation_mode(organization: str, project: str, version: str, request: model_id = next(m for m in models if m['version'] == version)['id'] logging.info(model_id) trainer: TrainerLogic = request.app.trainer - await trainer.do_detections() + await trainer._do_detections() return "OK" diff --git a/learning_loop_node/trainer/tests/state_helper.py b/learning_loop_node/trainer/tests/state_helper.py index 01c9001d..a5b982ec 100644 --- a/learning_loop_node/trainer/tests/state_helper.py +++ b/learning_loop_node/trainer/tests/state_helper.py @@ -7,7 +7,7 @@ def create_active_training_file(trainer: TrainerLogic, **kwargs) -> None: update_attributes(trainer._training, **kwargs) # pylint: disable=protected-access - trainer.node.last_training_io.save(training=trainer.training) + 
trainer.node.last_training_io.save(training=trainer.active_training) async def assert_training_state(training: Training, state: str, timeout: float, interval: float) -> None: diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py index 5d7583fe..d48279ee 100644 --- a/learning_loop_node/trainer/tests/states/test_state_detecting.py +++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py @@ -18,14 +18,17 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi create_active_training_file(trainer, training_state='train_model_uploaded', model_id_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a') # trainer.load_active_training() - _ = asyncio.get_running_loop().create_task(trainer.do_detections()) + _ = asyncio.get_running_loop().create_task( + trainer.perform_state('do_detections', TrainerState.Detecting, + TrainerState.Detected, trainer._do_detections) + ) - await assert_training_state(trainer.training, 'detecting', timeout=1, interval=0.001) - await assert_training_state(trainer.training, 'detected', timeout=10, interval=0.001) + await assert_training_state(trainer.active_training, 'detecting', timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, 'detected', timeout=10, interval=0.001) assert trainer_has_error(trainer) is False - assert trainer.training.training_state == TrainerState.Detected - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.Detected + assert trainer.node.last_training_io.load() == trainer.active_training assert trainer.active_training_io.detections_exist() @@ -33,11 +36,11 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded) trainer.init_from_last_training() - 
trainer.training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242' + trainer.active_training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242' _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'detecting', timeout=5, interval=0.001) + await assert_training_state(trainer.active_training, 'detecting', timeout=5, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -54,13 +57,14 @@ async def test_model_not_downloadable_error(test_initialized_trainer: TestingTra _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'detecting', timeout=1, interval=0.001) - await assert_training_state(trainer.training, 'train_model_uploaded', timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, 'detecting', timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, 'train_model_uploaded', timeout=1, interval=0.001) + await asyncio.sleep(0.1) assert trainer_has_error(trainer) - assert trainer.training.training_state == TrainerState.TrainModelUploaded - assert trainer.training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000' - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.TrainModelUploaded + assert trainer.active_training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000' + assert trainer.node.last_training_io.load() == trainer.active_training def test_save_load_detections(test_initialized_trainer: TestingTrainerLogic): diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py index 5785e5fa..12e9b745 100644 --- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py @@ -14,17 
+14,20 @@ async def test_downloading_is_successful(test_initialized_trainer: TestingTraine trainer.model_format = 'mocked' trainer.init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.download_model()) - await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001) - await assert_training_state(trainer.training, 'train_model_downloaded', timeout=1, interval=0.001) + asyncio.get_running_loop().create_task( + trainer.perform_state('download_model', + TrainerState.TrainModelDownloading, + TrainerState.TrainModelDownloaded, trainer._download_model)) + await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, 'train_model_downloaded', timeout=1, interval=0.001) - assert trainer.training.training_state == TrainerState.TrainModelDownloaded - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.TrainModelDownloaded + assert trainer.node.last_training_io.load() == trainer.active_training # file on disk - assert os.path.exists(f'{trainer.training.training_folder}/base_model.json') - assert os.path.exists(f'{trainer.training.training_folder}/file_1.txt') - assert os.path.exists(f'{trainer.training.training_folder}/file_2.txt') + assert os.path.exists(f'{trainer.active_training.training_folder}/base_model.json') + assert os.path.exists(f'{trainer.active_training.training_folder}/file_1.txt') + assert os.path.exists(f'{trainer.active_training.training_folder}/file_2.txt') async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogic): @@ -33,7 +36,7 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogi trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, 
interval=0.001) + await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -49,10 +52,10 @@ async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic) trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001) - await assert_training_state(trainer.training, TrainerState.DataDownloaded, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.DataDownloaded, timeout=1, interval=0.001) assert trainer.errors.has_error_for('download_model') assert trainer._training is not None # pylint: disable=protected-access - assert trainer.training.training_state == TrainerState.DataDownloaded - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.DataDownloaded + assert trainer.node.last_training_io.load() == trainer.active_training diff --git a/learning_loop_node/trainer/tests/states/test_state_prepare.py b/learning_loop_node/trainer/tests/states/test_state_prepare.py index 261fbb70..8c490c92 100644 --- a/learning_loop_node/trainer/tests/states/test_state_prepare.py +++ b/learning_loop_node/trainer/tests/states/test_state_prepare.py @@ -17,11 +17,11 @@ async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerL create_active_training_file(trainer) trainer.init_from_last_training() - await trainer.prepare() + await trainer.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare) assert trainer_has_error(trainer) is False - assert trainer.training.training_state == TrainerState.DataDownloaded - assert trainer.training.data is not None - assert 
trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.DataDownloaded + assert trainer.active_training.data is not None + assert trainer.node.last_training_io.load() == trainer.active_training async def test_abort_preparing(test_initialized_trainer: TestingTrainerLogic): @@ -30,7 +30,7 @@ async def test_abort_preparing(test_initialized_trainer: TestingTrainerLogic): trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -46,10 +46,10 @@ async def test_request_error(test_initialized_trainer: TestingTrainerLogic): trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=3, interval=0.001) - await assert_training_state(trainer.training, TrainerState.Initialized, timeout=3, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=3, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.Initialized, timeout=3, interval=0.001) assert trainer_has_error(trainer) assert trainer._training is not None # pylint: disable=protected-access - assert trainer.training.training_state == TrainerState.Initialized - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.Initialized + assert trainer.node.last_training_io.load() == trainer.active_training diff --git a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py index 51fec3ff..cc145233 100644 --- 
a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py +++ b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py @@ -27,10 +27,10 @@ async def test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic): _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) assert trainer_has_error(trainer) is False - assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced + assert trainer.node.last_training_io.load() == trainer.active_training async def test_unsynced_model_available__sync_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture): @@ -44,11 +44,11 @@ async def test_unsynced_model_available__sync_successful(test_initialized_traine trainer.has_new_model = True _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) assert trainer_has_error(trainer) is False # assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.node.last_training_io.load() == trainer.active_training async def test_unsynced_model_available__sio_not_connected(test_initialized_trainer_node: TrainerNode): @@ -62,12 +62,12 @@ async def test_unsynced_model_available__sio_not_connected(test_initialized_trai _ = asyncio.get_running_loop().create_task(trainer.run()) - await 
assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001) - await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.active_training async def test_unsynced_model_available__request_is_not_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture): @@ -81,12 +81,12 @@ async def test_unsynced_model_available__request_is_not_successful(test_initiali trainer.has_new_model = True _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001) - await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.active_training async def test_basic_mock(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture): diff --git 
a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py index 9d6b31f4..46a7f953 100644 --- a/learning_loop_node/trainer/tests/states/test_state_train.py +++ b/learning_loop_node/trainer/tests/states/test_state_train.py @@ -14,17 +14,18 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await asyncio.sleep(0.1) # give tests a bit time to to check for the state assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'start_training' # pylint: disable=protected-access assert trainer._executor is not None trainer._executor.stop() # NOTE normally a training terminates itself - await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.active_training async def test_stop_running_training(test_initialized_trainer: TestingTrainerLogic): @@ -36,15 +37,15 @@ async def test_stop_running_training(test_initialized_trainer: TestingTrainerLog _ = asyncio.get_running_loop().create_task(trainer.run()) await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) # pylint: disable=protected-access - await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, 
interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'start_training' await trainer.stop() - await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.active_training async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrainerLogic): @@ -58,14 +59,14 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain _ = asyncio.get_running_loop().create_task(trainer.run()) await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) # pylint: disable=protected-access - await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'resume' # pylint: disable=protected-access assert trainer._executor is not None trainer._executor.stop() # NOTE normally a training terminates itself e.g - await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.training + 
assert trainer.active_training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.active_training diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py index a6f69c56..757cf968 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py @@ -1,11 +1,10 @@ -from learning_loop_node.data_classes import TrainerState import asyncio import pytest from dacite import from_dict from learning_loop_node.conftest import get_dummy_detections -from learning_loop_node.data_classes import BoxDetection, Context, Detections +from learning_loop_node.data_classes import BoxDetection, Context, Detections, TrainerState from learning_loop_node.loop_communication import LoopCommunicator from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic @@ -48,10 +47,11 @@ async def test_upload_successful(test_initialized_trainer: TestingTrainerLogic): trainer.init_from_last_training() await create_valid_detection_file(trainer) - await trainer.upload_detections() + await asyncio.get_running_loop().create_task( + trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions)) - assert trainer.training.training_state == TrainerState.ReadyForCleanup - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.ReadyForCleanup + assert trainer.node.last_training_io.load() == trainer.active_training @pytest.mark.asyncio @@ -64,7 +64,10 @@ async def test_detection_upload_progress_is_stored(test_initialized_trainer: Tes await 
create_valid_detection_file(trainer) assert trainer.active_training_io.load_detections_upload_file_index() == 0 - await trainer.upload_detections() + # await trainer.upload_detections() + await asyncio.get_running_loop().create_task( + trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions)) + assert trainer.active_training_io.load_detection_upload_progress() == 0 # Progress is reset for every file assert trainer.active_training_io.load_detections_upload_file_index() == 1 @@ -88,7 +91,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test for i in range(skip_detections, len(detections), batch_size): batch_detections = detections[i:i+batch_size] # pylint: disable=protected-access - await trainer._upload_detections(trainer.training.context, batch_detections, i + batch_size) + await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size) expected_value = i + batch_size if i + batch_size < len(detections) else 0 # Progress is reset for every file assert trainer.active_training_io.load_detection_upload_progress() == expected_value @@ -104,7 +107,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test for i in range(skip_detections, len(detections), batch_size): batch_detections = detections[i:i+batch_size] # pylint: disable=protected-access - await trainer._upload_detections(trainer.training.context, batch_detections, i + batch_size) + await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size) expected_value = i + batch_size if i + batch_size < len(detections) else 0 # Progress is reset for every file assert trainer.active_training_io.load_detection_upload_progress() == expected_value @@ -121,12 +124,12 @@ async def test_bad_status_from_LearningLoop(test_initialized_trainer: TestingTra 
trainer.active_training_io.save_detections([get_dummy_detections()]) _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) - await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.training.training_state == TrainerState.Detected - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.Detected + assert trainer.node.last_training_io.load() == trainer.active_training async def test_other_errors(test_initialized_trainer: TestingTrainerLogic): @@ -137,12 +140,12 @@ async def test_other_errors(test_initialized_trainer: TestingTrainerLogic): trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) - await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.training.training_state == TrainerState.Detected - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.Detected + assert trainer.node.last_training_io.load() == trainer.active_training async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic): @@ -154,7 +157,7 @@ async def 
test_abort_uploading(test_initialized_trainer: TestingTrainerLogic): _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py index efc40010..21727b27 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py @@ -23,13 +23,13 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer train_task = asyncio.get_running_loop().create_task(trainer.upload_model()) - await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) await train_task assert trainer_has_error(trainer) is False - assert trainer.training.training_state == TrainerState.TrainModelUploaded - assert trainer.training.model_id_for_detecting is not None - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.TrainModelUploaded + assert trainer.active_training.model_id_for_detecting is not None + assert trainer.node.last_training_io.load() == trainer.active_training async def test_abort_upload_model(test_initialized_trainer: TestingTrainerLogic): @@ -40,7 +40,7 @@ async def test_abort_upload_model(test_initialized_trainer: TestingTrainerLogic) _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) + await 
assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -60,14 +60,14 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) # TODO goes to finished because of the error - await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001) assert trainer_has_error(trainer) - assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced - assert trainer.training.model_id_for_detecting is None - assert trainer.node.last_training_io.load() == trainer.training + assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced + assert trainer.active_training.model_id_for_detecting is None + assert trainer.node.last_training_io.load() == trainer.active_training async def test_mock_loop_response_example(mocker: MockerFixture, test_initialized_trainer: TestingTrainerLogic): @@ -79,7 +79,7 @@ async def test_mock_loop_response_example(mocker: MockerFixture, test_initialize trainer.init_from_last_training() # pylint: disable=protected-access - result = await trainer._upload_model_return_new_id(Context(organization='zauberzeug', project='demo')) + result = await trainer._upload_model_return_new_model_uuid(Context(organization='zauberzeug', project='demo')) assert result is not None diff --git a/learning_loop_node/trainer/tests/test_errors.py b/learning_loop_node/trainer/tests/test_errors.py index bb6b3d8a..1ba85572 100644 --- a/learning_loop_node/trainer/tests/test_errors.py +++ 
b/learning_loop_node/trainer/tests/test_errors.py @@ -1,35 +1,38 @@ import asyncio import re +from learning_loop_node.data_classes import TrainerState from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic async def test_training_process_is_stopped_when_trainer_reports_error(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='train_model_downloaded') + create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) trainer.error_msg = 'some_error' - await assert_training_state(trainer.training, 'train_model_downloaded', timeout=6, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) async def test_log_can_provide_only_data_for_current_run(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state='train_model_downloaded') + create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) trainer.init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await asyncio.sleep(0.1) # give tests a bit time to to check for the state + assert trainer._executor is not None assert len(re.findall('Starting executor', 
str(trainer._executor.get_log_by_lines()))) == 1 trainer.error_msg = 'some_error' - await assert_training_state(trainer.training, 'train_model_downloaded', timeout=6, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) trainer.error_msg = None - await assert_training_state(trainer.training, 'training_running', timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) await asyncio.sleep(1) assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) > 1 diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index 08589657..d6e9b78a 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -54,19 +54,14 @@ async def _download_model(self) -> None: await super()._download_model() await asyncio.sleep(0.1) # give tests a bit time to to check for the state - async def ensure_confusion_matrix_synced(self): - await asyncio.sleep(0.1) # give tests a bit time to to check for the state - await super().ensure_confusion_matrix_synced() - await asyncio.sleep(0.1) # give tests a bit time to to check for the state - async def upload_model(self) -> None: await asyncio.sleep(0.1) # give tests a bit time to to check for the state await super().upload_model() await asyncio.sleep(0.1) # give tests a bit time to to check for the state - async def _upload_model_return_new_id(self, context: Context) -> Optional[str]: + async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]: await asyncio.sleep(0.1) # give tests a bit time to to check for the state - result = await super()._upload_model_return_new_id(context) + result = await super()._upload_model_return_new_model_uuid(context) await asyncio.sleep(0.1) # give tests a bit 
time to to check for the state assert isinstance(result, str) return result diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index b08f1c87..4bfdb743 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -4,7 +4,6 @@ import os import shutil import sys -import time from abc import abstractmethod from dataclasses import asdict from datetime import datetime @@ -18,10 +17,9 @@ from fastapi.encoders import jsonable_encoder from tqdm import tqdm -from ..data_classes import (BasicModel, Category, Context, Detections, Hyperparameter, ModelInformation, - PretrainedModel, TrainerState, Training, TrainingData, TrainingError) +from ..data_classes import (BasicModel, Category, Context, Detections, Hyperparameter, ModelInformation, TrainerState, + Training, TrainingData, TrainingError, TrainingOut) from ..helpers.misc import create_image_folder, create_project_folder, generate_training, is_valid_uuid4 -from . 
import training_syncronizer from .downloader import TrainingsDownloader from .executor import Executor from .io_helpers import ActiveTrainingIO @@ -31,98 +29,54 @@ class TrainerLogic(TrainerLogicAbstraction): def __init__(self, model_format: str) -> None: - super().__init__() + super().__init__(model_format) self.model_format: str = model_format + # NOTE: String to be used in the file path for the model on the server: + # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' + self._executor: Optional[Executor] = None - self.start_time: Optional[float] = None self.training_task: Optional[asyncio.Task] = None self.start_training_task: Optional[Coroutine] = None self.shutdown_event: asyncio.Event = asyncio.Event() self.detection_progress = 0.0 - self._training: Optional[Training] = None - self._active_training_io: Optional[ActiveTrainingIO] = None - self.restart_after_training = os.environ.get('RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1'] - self.keep_old_trainings = os.environ.get('KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1'] - self.inference_batch_size = int(os.environ.get('INFERENCE_BATCH_SIZE', '10')) - logging.info(f'INFERENCE_BATCH_SIZE: {self.inference_batch_size}') - - @property - def training_uptime(self) -> Union[float, None]: - return time.time() - self.start_time if self.start_time else None - @property def executor(self) -> Executor: assert self._executor is not None, 'executor must be set, call `run_training` first' return self._executor - @property - def training(self) -> Training: - assert self._training is not None, 'training must be set, call `init` first' - return self._training - - @property - def active_training_io(self) -> ActiveTrainingIO: - assert self._active_training_io is not None, 'active_training_io must be set, call `init` first' - return self._active_training_io - - @property - def training_active(self) -> bool: - """_training and _active_training_io are set in 
'init_new_training' or 'init_from_last_training'""" - return self._training is not None and self._active_training_io is not None and self._node is not None - - @property - def state(self) -> TrainerState: - if (not self.training_active) or (self.training.training_state is None): - return TrainerState.Idle - else: - return self.training.training_state - - @property - def training_data(self) -> TrainingData | None: - if self.training_active and self.training.data: - return self.training.data - return None - - @property - def training_context(self) -> Context | None: - if self.training_active: - return self.training.context - return None - def init_new_training(self, context: Context, details: Dict) -> None: """Called on `begin_training` event from the Learning Loop. Note that details needs the entries 'categories' and 'training_number'""" - try: - project_folder = create_project_folder(context) - if not self.keep_old_trainings: - # NOTE: We delete all existing training folders because they are not needed anymore. 
- TrainerLogic.delete_all_training_folders(project_folder) - self._training = generate_training(project_folder, context) - self._training.data = TrainingData(categories=Category.from_list(details['categories'])) - self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details) - self._training.training_number = details['training_number'] - self._training.base_model_id = details['id'] - self._training.training_state = TrainerState.Initialized - self._active_training_io = ActiveTrainingIO(self._training.training_folder) - logging.info(f'init training: {self._training}') - except Exception: - logging.exception('Error in init') - - def init_from_last_training(self) -> None: - self._training = self.node.last_training_io.load() - assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder' - self._active_training_io = ActiveTrainingIO(self._training.training_folder) - - async def continue_run_if_incomplete(self) -> bool: - if not self.training_active and self.node.last_training_io.exists(): + project_folder = create_project_folder(context) + if not self.keep_old_trainings: + # NOTE: We delete all existing training folders because they are not needed anymore. 
+ TrainerLogic.delete_all_training_folders(project_folder) + self._training = generate_training(project_folder, context) + self._training.data = TrainingData(categories=Category.from_list(details['categories'])) + self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details) + self._training.training_number = details['training_number'] + self._training.base_model_id = details['id'] + self._training.training_state = TrainerState.Initialized + self._active_training_io = ActiveTrainingIO( + self._training.training_folder, self.loop_communicator, context) + logging.info(f'training initialized: {self._training}') + + async def try_continue_run_if_incomplete(self) -> bool: + if not self.training_active and self.last_training_io.exists(): logging.info('found incomplete training, continuing now.') self.init_from_last_training() asyncio.get_event_loop().create_task(self.run()) return True return False + def init_from_last_training(self) -> None: + self._training = self.last_training_io.load() + assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder' + self._active_training_io = ActiveTrainingIO( + self._training.training_folder, self.loop_communicator, self._training.context) + async def begin_training(self, organization: str, project: str, details: Dict) -> None: self.init_new_training(Context(organization=organization, project=project), details) asyncio.get_event_loop().create_task(self.run()) @@ -130,7 +84,6 @@ async def begin_training(self, organization: str, project: str, details: Dict) - async def run(self) -> None: """Called on `begin_training` event from the Learning Loop.""" - self.start_time = time.time() self.errors.reset_all() try: self.training_task = asyncio.get_running_loop().create_task(self._run_training_loop()) @@ -138,14 +91,12 @@ async def run(self) -> None: except asyncio.CancelledError: if not self.shutdown_event.is_set(): logging.info('training task was cancelled 
but not by shutdown event') - self.training.training_state = TrainerState.ReadyForCleanup - self.node.last_training_io.save(self.training) + self.active_training.training_state = TrainerState.ReadyForCleanup + self.last_training_io.save(self.active_training) await self.clear_training() except Exception as e: logging.exception(f'Error in train: {e}') - finally: - self.start_time = None # ---------------------------------------- TRAINING STATES ---------------------------------------- @@ -157,94 +108,55 @@ async def _run_training_loop(self) -> None: return while self._training is not None: - tstate = self.training.training_state - logging.info(f'STATE LOOP: {tstate}') + tstate = self.active_training.training_state + logging.info(f'STATE LOOP: {tstate}, eerrors: {self.errors.errors}') await asyncio.sleep(0.6) # Note: Required for pytests! if tstate == TrainerState.Initialized: # -> DataDownloading -> DataDownloaded - await self.prepare() + await self.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare) elif tstate == TrainerState.DataDownloaded: # -> TrainModelDownloading -> TrainModelDownloaded - await self.download_model() + await self.perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model) elif tstate == TrainerState.TrainModelDownloaded: # -> TrainingRunning -> TrainingFinished - await self.train() + await self.perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train) elif tstate == TrainerState.TrainingFinished: # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced - await self.ensure_confusion_matrix_synced() + await self.perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self.sync_confusion_matrix) + # await self.ensure_confusion_matrix_synced() elif tstate == TrainerState.ConfusionMatrixSynced: # -> TrainModelUploading -> TrainModelUploaded 
await self.upload_model() elif tstate == TrainerState.TrainModelUploaded: # -> Detecting -> Detected - await self.do_detections() + await self.perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections) elif tstate == TrainerState.Detected: # -> DetectionUploading -> ReadyForCleanup - await self.upload_detections() + await self.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions) elif tstate == TrainerState.ReadyForCleanup: # -> RESTART or TrainingFinished await self.clear_training() self.may_restart() - async def prepare(self) -> None: - previous_state = self.training.training_state - self.training.training_state = TrainerState.DataDownloading - error_key = 'prepare' - try: - await self._prepare() - except asyncio.CancelledError: - logging.warning('CancelledError in prepare') - raise - except Exception as e: - logging.exception("Unknown error in 'prepare'. Exception:") - self.training.training_state = previous_state - self.errors.set(error_key, str(e)) - else: - self.errors.reset(error_key) - self.training.training_state = TrainerState.DataDownloaded - self.node.last_training_io.save(self.training) - async def _prepare(self) -> None: - self.node.data_exchanger.set_context(self.training.context) - downloader = TrainingsDownloader(self.node.data_exchanger) - image_data, skipped_image_count = await downloader.download_training_data(self.training.images_folder) - assert self.training.data is not None, 'training.data must be set' - self.training.data.image_data = image_data - self.training.data.skipped_image_count = skipped_image_count - - async def download_model(self) -> None: - logging.info('Downloading model') - previous_state = self.training.training_state - self.training.training_state = TrainerState.TrainModelDownloading - error_key = 'download_model' - try: - await self._download_model() - except asyncio.CancelledError: - 
logging.warning('CancelledError in download_model') - raise - except Exception as e: - logging.exception('download_model failed') - self.training.training_state = previous_state - self.errors.set(error_key, str(e)) - else: - self.errors.reset(error_key) - logging.info('download_model_task finished') - self.training.training_state = TrainerState.TrainModelDownloaded - self.node.last_training_io.save(self.training) + self.data_exchanger.set_context(self.active_training.context) + downloader = TrainingsDownloader(self.data_exchanger) + image_data, skipped_image_count = await downloader.download_training_data(self.active_training.images_folder) + assert self.active_training.data is not None, 'training.data must be set' + self.active_training.data.image_data = image_data + self.active_training.data.skipped_image_count = skipped_image_count async def _download_model(self) -> None: - model_id = self.training.base_model_id + model_id = self.active_training.base_model_id assert model_id is not None, 'model_id must be set' if is_valid_uuid4( - self.training.base_model_id): # TODO this checks if we continue a training -> make more explicit + self.active_training.base_model_id): # TODO this checks if we continue a training -> make more explicit logging.info('loading model from Learning Loop') logging.info(f'downloading model {model_id} as {self.model_format}') - await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, model_id, self.model_format) - shutil.move(f'{self.training.training_folder}/model.json', - f'{self.training.training_folder}/base_model.json') + await self.data_exchanger.download_model(self.active_training.training_folder, self.active_training.context, model_id, self.model_format) + shutil.move(f'{self.active_training.training_folder}/model.json', + f'{self.active_training.training_folder}/base_model.json') else: logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download') - async def train(self) 
-> None: - logging.info('Running training') + async def _train(self) -> None: + previous_state = TrainerState.TrainModelDownloaded error_key = 'run_training' - # NOTE normally we reset errors after the step was successful. We do not want to display an old error during the whole training. - self.errors.reset(error_key) - previous_state = self.training.training_state - self._executor = Executor(self.training.training_folder) - self.training.training_state = TrainerState.TrainingRunning + self._executor = Executor(self.active_training.training_folder) + self.active_training.training_state = TrainerState.TrainingRunning + try: await self._start_training() @@ -276,28 +188,19 @@ async def train(self) -> None: # self.errors.set(error_key, f'Executor return code was {self.executor.return_code}') # raise TrainingError(cause=f'Executor return code was {self.executor.return_code}') - except asyncio.CancelledError: - logging.warning('CancelledError in run_training') - raise except TrainingError: logging.exception('Error in TrainingProcess') if self.executor.is_process_running(): self.executor.stop() - self.training.training_state = previous_state - except Exception as e: - self.errors.set(error_key, f'Could not start training {str(e)}') - self.training.training_state = previous_state - logging.exception('Error in run_training') - else: - self.training.training_state = TrainerState.TrainingFinished - self.node.last_training_io.save(self.training) + self.active_training.training_state = previous_state + raise async def _start_training(self): self.start_training_task = None # NOTE: this is used i.e. by tests if self.can_resume(): self.start_training_task = self.resume() else: - base_model_id = self.training.base_model_id + base_model_id = self.active_training.base_model_id if not is_valid_uuid4(base_model_id): # TODO this check was done earlier! 
assert isinstance(base_model_id, str) # TODO this could be removed here and accessed via self.training.base_model_id @@ -306,27 +209,32 @@ async def _start_training(self): self.start_training_task = self.start_training() await self.start_training_task - async def ensure_confusion_matrix_synced(self): - logging.info('Ensure syncing confusion matrix') - previous_state = self.training.training_state - self.training.training_state = TrainerState.ConfusionMatrixSyncing - try: - await self.sync_confusion_matrix() - except asyncio.CancelledError: - logging.warning('CancelledError in run_training') - raise - except Exception: - logging.exception('Error in ensure_confusion_matrix_synced') - self.training.training_state = previous_state - else: - self.training.training_state = TrainerState.ConfusionMatrixSynced - self.node.last_training_io.save(self.training) - async def sync_confusion_matrix(self): logging.info('Syncing confusion matrix') error_key = 'sync_confusion_matrix' try: - await training_syncronizer.try_sync_model(self, self.node.uuid, self.node.sio_client) + try: + model = self.get_new_model() + except Exception as exc: + logging.exception('error while getting new model') + raise Exception(f'Could not get new model: {str(exc)}') from exc + if model and self.active_training.data: + new_training = TrainingOut( + trainer_id=self.node_uuid, + confusion_matrix=model.confusion_matrix, + train_image_count=self.active_training.data.train_image_count(), + test_image_count=self.active_training.data.test_image_count(), + hyperparameters=self.hyperparameters) + + await asyncio.sleep(0.1) # NOTE needed for tests. 
+ result = await self.sio_client.call('update_training', (self.active_training.context.organization, self.active_training.context.project, jsonable_encoder(new_training))) + if isinstance(result, dict) and result['success']: + logging.info(f'successfully updated training {asdict(new_training)}') + self.on_model_published(model) + else: + error_msg = f'Error for update_training: Response from loop was : {result}' + logging.error(error_msg) + raise Exception(error_msg) except socketio.exceptions.BadNamespaceError as e: # type: ignore logging.error('Error during confusion matrix syncronization. BadNamespaceError') self.errors.set(error_key, str(e)) @@ -340,86 +248,72 @@ async def sync_confusion_matrix(self): async def upload_model(self) -> None: error_key = 'upload_model' - previous_state = self.training.training_state - self.training.training_state = TrainerState.TrainModelUploading + previous_state = self.active_training.training_state + self.active_training.training_state = TrainerState.TrainModelUploading try: - new_model_id = await self._upload_model_return_new_id(self.training.context) + new_model_id = await self._upload_model_return_new_model_uuid(self.active_training.context) if new_model_id is None: - self.training.training_state = TrainerState.ReadyForCleanup + self.active_training.training_state = TrainerState.ReadyForCleanup logging.error('could not upload model - maybe training failed.. cleaning up') return assert new_model_id is not None, 'uploaded_model must be set' logging.info(f'successfully uploaded model and received new model id: {new_model_id}') - self.training.model_id_for_detecting = new_model_id + self.active_training.model_id_for_detecting = new_model_id except asyncio.CancelledError: logging.warning('CancelledError in upload_model') raise except Exception as e: logging.exception('Error in upload_model. Exception:') self.errors.set(error_key, str(e)) - self.training.training_state = previous_state # TODO... 
going back is pointless here as it ends in a deadlock ?! + self.active_training.training_state = previous_state # TODO... going back is pointless here as it ends in a deadlock ?! # self.training.training_state = TrainingState.ReadyForCleanup else: self.errors.reset(error_key) - self.training.training_state = TrainerState.TrainModelUploaded - self.node.last_training_io.save(self.training) + self.active_training.training_state = TrainerState.TrainModelUploaded + self.last_training_io.save(self.active_training) - async def _upload_model_return_new_id(self, context: Context) -> Optional[str]: + async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]: """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file. Note that with the latest trainers the conversion to (.wts) is done by the trainer. The conversion from .wts to .engine is done by the detector (needs to be done on target hardware). Note that trainer may train with different classes, which is why we send an initial model.json file. """ files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files) - if files is None: return None if isinstance(files, List): files = {self.model_format: files} - assert isinstance(files, Dict), f'can only save model as list or dict, but was {files}' + assert isinstance(files, Dict), f'can only upload model as list or dict, but was {files}' - model_json_path = self.create_model_json_with_categories() already_uploaded_formats = self.active_training_io.load_model_upload_progress() - new_id = None + new_model_uuid = None for file_format in files: if file_format in already_uploaded_formats: continue _files = files[file_format] - # model.json was mandatory in previous versions. Now its forbidden to provide an own model.json file. assert not any(f for f in _files if 'model.json' in f), "Upload 'model.json' not allowed (added automatically)." 
- _files.append(model_json_path) - new_id = await self.node.data_exchanger.upload_model_get_uuid(context, _files, self.training.training_number, file_format) - if new_id is None: + _files.append(self.dump_categories_to_json()) + new_model_uuid = await self.data_exchanger.upload_model_get_uuid(context, _files, self.active_training.training_number, file_format) + if new_model_uuid is None: return None already_uploaded_formats.append(file_format) self.active_training_io.save_model_upload_progress(already_uploaded_formats) - return new_id + return new_model_uuid - async def do_detections(self): - error_key = 'detecting' - previous_state = self.training.training_state - try: - self.training.training_state = TrainerState.Detecting - await self._do_detections() - except asyncio.CancelledError: - logging.warning('CancelledError in do_detections') - raise - except Exception as e: - self.errors.set(error_key, str(e)) - logging.exception('Error in do_detections - Exception:') - self.training.training_state = previous_state - else: - self.errors.reset(error_key) - self.training.training_state = TrainerState.Detected - self.node.last_training_io.save(self.training) + def dump_categories_to_json(self) -> str: + content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None + json_path = '/tmp/model.json' + with open(json_path, 'w') as f: + json.dump(content, f) + return json_path async def _do_detections(self) -> None: - context = self.training.context - model_id = self.training.model_id_for_detecting + context = self.active_training.context + model_id = self.active_training.model_id_for_detecting assert model_id, 'model_id must be set' tmp_folder = f'/tmp/model_for_auto_detections_{model_id}_{self.model_format}' @@ -427,22 +321,22 @@ async def _do_detections(self) -> None: os.makedirs(tmp_folder) logging.info(f'downloading detection model to {tmp_folder}') - await self.node.data_exchanger.download_model(tmp_folder, context, 
model_id, self.model_format) + await self.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format) with open(f'{tmp_folder}/model.json', 'r') as f: content = json.load(f) model_information = from_dict(data_class=ModelInformation, data=content) project_folder = create_project_folder(context) image_folder = create_image_folder(project_folder) - self.node.data_exchanger.set_context(context) + self.data_exchanger.set_context(context) image_ids = [] for state, p in zip(['inbox', 'annotate', 'review', 'complete'], [0.1, 0.2, 0.3, 0.4]): self.detection_progress = p logging.info(f'fetching image ids of {state}') - new_ids = await self.node.data_exchanger.fetch_image_uuids(query_params=f'state={state}') + new_ids = await self.data_exchanger.fetch_image_uuids(query_params=f'state={state}') image_ids += new_ids logging.info(f'downloading {len(new_ids)} images') - await self.node.data_exchanger.download_images(new_ids, image_folder) + await self.data_exchanger.download_images(new_ids, image_folder) self.detection_progress = 0.42 # await delete_corrupt_images(image_folder) @@ -462,70 +356,15 @@ async def _do_detections(self) -> None: return None - async def upload_detections(self): - error_key = 'upload_detections' - previous_state = self.training.training_state - self.training.training_state = TrainerState.DetectionUploading - await asyncio.sleep(0.1) # NOTE needed for tests - try: - json_files = self.active_training_io.get_detection_file_names() - if not json_files: - raise Exception() - current_json_file_index = self.active_training_io.load_detections_upload_file_index() - for i in range(current_json_file_index, len(json_files)): - detections = self.active_training_io.load_detections(i) - logging.info(f'uploading detections {i}/{len(json_files)}') - await self._upload_detections_batched(self.training.context, detections) - self.active_training_io.save_detections_upload_file_index(i+1) - except asyncio.CancelledError: - logging.warning('CancelledError 
in upload_detections') - raise - except Exception as e: - self.errors.set(error_key, str(e)) - logging.exception('Error in upload_detections') - self.training.training_state = previous_state - else: - self.errors.reset(error_key) - self.training.training_state = TrainerState.ReadyForCleanup - self.node.last_training_io.save(self.training) - - async def _upload_detections_batched(self, context: Context, detections: List[Detections]): - batch_size = 10 - skip_detections = self.active_training_io.load_detection_upload_progress() - for i in tqdm(range(skip_detections, len(detections), batch_size), position=0, leave=True): - up_progress = i+batch_size - batch_detections = detections[i:up_progress] - dict_detections = [jsonable_encoder(asdict(detection)) for detection in batch_detections] - logging.info(f'uploading detections. File size : {len(json.dumps(dict_detections))}') - await self._upload_detections(context, batch_detections, up_progress) - skip_detections = up_progress - - async def _upload_detections(self, context: Context, batch_detections: List[Detections], up_progress: int): - assert self._active_training_io is not None, 'active_training must be set' - - detections_json = [jsonable_encoder(asdict(detections)) for detections in batch_detections] - response = await self.node.loop_communicator.post( - f'/{context.organization}/projects/{context.project}/detections', json=detections_json) - if response.status_code != 200: - msg = f'could not upload detections. 
{str(response)}' - logging.error(msg) - raise Exception(msg) - else: - logging.info('successfully uploaded detections') - if up_progress > len(batch_detections): - self._active_training_io.save_detection_upload_progress(0) - else: - self._active_training_io.save_detection_upload_progress(up_progress) - async def clear_training(self): self.active_training_io.delete_detections() self.active_training_io.delete_detection_upload_progress() self.active_training_io.delete_detections_upload_file_index() - await self.clear_training_data(self.training.training_folder) - self.node.last_training_io.delete() + await self.clear_training_data(self.active_training.training_folder) + self.last_training_io.delete() # self.training.training_state = TrainingState.TrainingFinished - assert self._node is not None - await self._node.send_status() # make sure the status is updated before we stop the training + + await self.node.send_status() self._training = None async def stop(self) -> None: @@ -565,9 +404,9 @@ def general_progress(self) -> Optional[float]: if not self.training_active: return None - t_state = self.training.training_state + t_state = self.active_training.training_state if t_state == TrainerState.DataDownloading: - return self.node.data_exchanger.progress + return self.data_exchanger.progress if t_state == TrainerState.TrainingRunning: return self.training_progress if t_state == TrainerState.Detecting: @@ -582,16 +421,6 @@ def training_progress(self) -> Optional[float]: """Represents the training progress.""" raise NotImplementedError - @property - @abstractmethod - def provided_pretrained_models(self) -> List[PretrainedModel]: - raise NotImplementedError - - @property - @abstractmethod - def model_architecture(self) -> Optional[str]: - raise NotImplementedError - @abstractmethod async def start_training(self) -> None: '''Should be used to start a training.''' @@ -698,18 +527,3 @@ def hyperparameters(self) -> Optional[Dict]: information['flipUd'] = 
self._training.data.hyperparameter.flip_ud return information return None - - def create_model_json_with_categories(self) -> str: - """Remaining fields are filled by the Learning Loop""" - if self._training and self._training.data: - content = { - 'categories': [asdict(c) for c in self._training.data.categories], - } - else: - content = None - - model_json_path = '/tmp/model.json' - with open(model_json_path, 'w') as f: - json.dump(content, f) - - return model_json_path diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py index a4d1f39b..b7f6b006 100644 --- a/learning_loop_node/trainer/trainer_logic_abstraction.py +++ b/learning_loop_node/trainer/trainer_logic_abstraction.py @@ -1,7 +1,16 @@ +import asyncio +import logging +import os +import time from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Callable, Coroutine, List, Optional -from ..data_classes import Context, Errors, PretrainedModel, TrainerState, TrainingData +from socketio import AsyncClient + +from ..data_classes import Context, Errors, PretrainedModel, TrainerState, Training, TrainingData +from ..data_exchanger import DataExchanger +from ..loop_communication import LoopCommunicator +from .io_helpers import ActiveTrainingIO, LastTrainingIO if TYPE_CHECKING: from .trainer_node import TrainerNode @@ -9,24 +18,88 @@ class TrainerLogicAbstraction(ABC): - def __init__(self): + def __init__(self, model_format: str): + + # NOTE: String to be used in the file path for the model on the server: + # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' + self.model_format: str = model_format + self._node: Optional['TrainerNode'] = None # type: ignore + self._last_training_io: Optional[LastTrainingIO] = None # type: ignore self.errors = Errors() + self._training: Optional[Training] = None + self._active_training_io: 
Optional[ActiveTrainingIO] = None + + self.restart_after_training = os.environ.get('RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1'] + self.keep_old_trainings = os.environ.get('KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1'] + self.inference_batch_size = int(os.environ.get('INFERENCE_BATCH_SIZE', '10')) + @property def node(self) -> 'TrainerNode': - assert self._node is not None, 'node should be set by TrainerNodes before initialization' + assert self._node is not None, 'node should be set by TrainerNode before initialization' return self._node @property - @abstractmethod - def state(self) -> TrainerState: - """Returns the current state of the training logic""" + def last_training_io(self) -> LastTrainingIO: + assert self._last_training_io is not None, 'last_training_io should be set by TrainerNode before initialization' + return self._last_training_io @property - @abstractmethod - def training_uptime(self) -> float | None: - """Returns the time in seconds since the training started or None if idle""" + def data_exchanger(self) -> DataExchanger: + return self.node.data_exchanger + + @property + def loop_communicator(self) -> LoopCommunicator: + return self.node.loop_communicator + + @property + def node_uuid(self) -> str: + return self.node.uuid + + @property + def sio_client(self) -> AsyncClient: + return self.node.sio_client + + @property + def active_training_io(self) -> ActiveTrainingIO: + assert self._active_training_io is not None, 'active_training_io must be set, call `init` first' + return self._active_training_io + + @property + def training_active(self) -> bool: + """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'""" + return self._training is not None and self._active_training_io is not None + + @property + def state(self) -> str: + if (not self.training_active) or (self.active_training.training_state is None): + return TrainerState.Idle.value + else: + return 
self.active_training.training_state + + @property + def active_training(self) -> Training: + assert self._training is not None, 'training must be initialized, call `init` first' + return self._training + + @property + def training_uptime(self) -> Optional[float]: + if self.active_training: + return time.time() - self.active_training.start_time + return None + + @property + def training_data(self) -> TrainingData | None: + if self.training_active and self.active_training.data: + return self.active_training.data + return None + + @property + def training_context(self) -> Context | None: + if self.training_active: + return self.active_training.context + return None @property @abstractmethod @@ -40,23 +113,13 @@ def provided_pretrained_models(self) -> List[PretrainedModel]: @property @abstractmethod - def model_architecture(self) -> str: - """Returns the architecture name of the model""" + def model_architecture(self) -> Optional[str]: + """Returns the architecture name of the model if available""" @property @abstractmethod def hyperparameters(self) -> dict | None: - """Returns the hyperparameters if available""" - - @property - @abstractmethod - def training_data(self) -> TrainingData | None: - """Returns the training data if available""" - - @property - @abstractmethod - def training_context(self) -> Context | None: - """Returns the training context if available""" + """Returns the currently used hyperparameters if available""" @abstractmethod async def begin_training(self, organization: str, project: str, details: dict): @@ -71,5 +134,28 @@ async def shutdown(self): """Stops the training process and releases resources""" @abstractmethod - async def continue_run_if_incomplete(self) -> bool: - """Continues the training if it is incomplete""" + async def try_continue_run_if_incomplete(self) -> bool: + """Start training continuation if possible, returns True if continuation started""" + + async def perform_state(self, error_key: str, state_during: TrainerState, 
state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False): + await asyncio.sleep(0.1) + logging.info(f'Performing state: {state_during}') + previous_state = self.active_training.training_state + self.active_training.training_state = state_during + await asyncio.sleep(0.1) + if reset_early: + self.errors.reset(error_key) + + try: + await action() + except asyncio.CancelledError: + logging.warning(f'CancelledError in {state_during}') + raise + except Exception as e: + self.errors.set(error_key, str(e)) + logging.exception(f'Error in {state_during} - Exception:') + self.active_training.training_state = previous_state + else: + self.errors.reset(error_key) + self.active_training.training_state = state_after + self.last_training_io.save(self.active_training) diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py index fb191a9d..f2e011d5 100644 --- a/learning_loop_node/trainer/trainer_node.py +++ b/learning_loop_node/trainer/trainer_node.py @@ -16,9 +16,11 @@ class TrainerNode(Node): def __init__(self, name: str, trainer_logic: TrainerLogicAbstraction, uuid: Optional[str] = None, use_backdoor_controls: bool = False): super().__init__(name, uuid, 'trainer') - trainer_logic._node = self # pylint: disable=protected-access + trainer_logic._node = self self.trainer_logic = trainer_logic self.last_training_io = LastTrainingIO(self.uuid) + self.trainer_logic._last_training_io = self.last_training_io + self.include_router(controls.router, tags=["controls"]) if use_backdoor_controls: self.include_router(backdoor_controls.router, tags=["controls"]) @@ -34,7 +36,7 @@ async def on_shutdown(self): async def on_repeat(self): try: - if await self.trainer_logic.continue_run_if_incomplete(): + if await self.trainer_logic.try_continue_run_if_incomplete(): return # NOTE: we prevent sending idle status after starting a continuation await self.send_status() except Exception as e: @@ -70,7 +72,7 @@ async def send_status(self): 
status = TrainingStatus(id=self.uuid, name=self.name, - state=self.trainer_logic.state, + state=self.trainer_logic.state.value, errors={}, uptime=self.trainer_logic.training_uptime, progress=self.trainer_logic.general_progress) diff --git a/learning_loop_node/trainer/training_syncronizer.py b/learning_loop_node/trainer/training_syncronizer.py index 1707d407..97041bb9 100644 --- a/learning_loop_node/trainer/training_syncronizer.py +++ b/learning_loop_node/trainer/training_syncronizer.py @@ -2,7 +2,7 @@ import asyncio import logging from dataclasses import asdict -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING import socketio from dacite import from_dict @@ -15,7 +15,28 @@ from .trainer_logic import TrainerLogic -async def try_sync_model(trainer: 'TrainerLogic', trainer_node_uuid: str, sio_client: socketio.AsyncClient): +class TrainingSyncronizer: + def __init__(self, trainer_node_uuid: str, sio_client: socketio.AsyncClient): + self.trainer_node_uuid = trainer_node_uuid + self.sio_client = sio_client + + async def sync_model(model, current_training): + new_training = TrainingOut( + trainer_id=self.trainer_node_uuid, + confusion_matrix=model.confusion_matrix, + train_image_count=current_training.data.train_image_count(), + test_image_count=current_training.data.test_image_count(), + hyperparameters=trainer.hyperparameters) + + await asyncio.sleep(0.1) # NOTE needed for tests. 
+ + result = await self.sio_client.call('update_training', (current_training.context.organization, current_training.context.project, jsonable_encoder(new_training))) + response = from_dict(data_class=SocketResponse, data=result) + + return response + + +async def try_sync_model(mo): try: model = trainer.get_new_model() except Exception as exc: @@ -30,23 +51,3 @@ async def try_sync_model(trainer: 'TrainerLogic', trainer_node_uuid: str, sio_cl error_msg = f'Error for update_training: Response from loop was : {asdict(response)}' logging.error(error_msg) raise Exception(error_msg) - - -async def sync_model(trainer, trainer_node_uuid, sio_client, model): - current_training = trainer.training - new_training = TrainingOut( - trainer_id=trainer_node_uuid, - confusion_matrix=model.confusion_matrix, - train_image_count=current_training.data.train_image_count(), - test_image_count=current_training.data.test_image_count(), - hyperparameters=trainer.hyperparameters) - - await asyncio.sleep(0.1) # NOTE needed for tests. 
- - result = await sio_client.call('update_training', (current_training.context.organization, current_training.context.project, jsonable_encoder(new_training))) - response = from_dict(data_class=SocketResponse, data=result) - - if response.success: - logging.info(f'successfully updated training {asdict(new_training)}') - trainer.on_model_published(model) - return response diff --git a/mock_trainer/app_code/progress_simulator.py b/mock_trainer/app_code/progress_simulator.py index 042a0b29..6eaf5ced 100644 --- a/mock_trainer/app_code/progress_simulator.py +++ b/mock_trainer/app_code/progress_simulator.py @@ -10,8 +10,8 @@ def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) - return None confusion_matrix = {} - assert trainer.training.data is not None - for category in trainer.training.data.categories: + assert trainer.active_training.data is not None + for category in trainer.active_training.data.categories: try: minimum = latest_known_confusion_matrix[category.id]['tp'] except Exception: diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py index 20e43931..72929505 100644 --- a/mock_trainer/app_code/tests/test_mock_trainer.py +++ b/mock_trainer/app_code/tests/test_mock_trainer.py @@ -1,8 +1,7 @@ from typing import Dict from uuid import uuid4 -from learning_loop_node.data_classes import (Context, Model, Training, - TrainingData) +from learning_loop_node.data_classes import Context, Model, Training, TrainingData from learning_loop_node.globals import GLOBALS from learning_loop_node.trainer.executor import Executor @@ -38,6 +37,6 @@ async def test_get_new_model(setup_test_project2): project_folder="", images_folder="", training_folder="",) - mock_trainer.training.data = TrainingData(image_data=[], categories=[]) + mock_trainer.active_training.data = TrainingData(image_data=[], categories=[]) model = mock_trainer.get_new_model() assert model is not None From 
02016291e2cc44494dd663d75d537625b9c2ead3 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Fri, 8 Mar 2024 00:02:37 +0100 Subject: [PATCH 13/62] finalize intermiate abstraction layer to trainer logic --- learning_loop_node/tests/test_helper.py | 5 +- .../tests/states/test_state_upload_model.py | 3 +- .../trainer/tests/testing_trainer_logic.py | 2 +- learning_loop_node/trainer/trainer_logic.py | 386 +----------------- .../trainer/trainer_logic_abstraction.py | 63 ++- .../trainer/trainer_logic_generic.py | 325 +++++++++++++++ learning_loop_node/trainer/trainer_node.py | 4 +- mock_trainer/app_code/mock_trainer_logic.py | 2 +- .../app_code/tests/test_mock_trainer.py | 2 +- 9 files changed, 380 insertions(+), 412 deletions(-) create mode 100644 learning_loop_node/trainer/trainer_logic_generic.py diff --git a/learning_loop_node/tests/test_helper.py b/learning_loop_node/tests/test_helper.py index 1f485506..e802c7a0 100644 --- a/learning_loop_node/tests/test_helper.py +++ b/learning_loop_node/tests/test_helper.py @@ -7,9 +7,8 @@ from typing import Callable from learning_loop_node.data_classes import Context -from learning_loop_node.helpers.misc import create_image_folder, create_project_folder +from learning_loop_node.helpers.misc import create_image_folder, create_project_folder, create_training_folder from learning_loop_node.loop_communication import LoopCommunicator -from learning_loop_node.node import Node from learning_loop_node.trainer.trainer_logic import TrainerLogic @@ -68,5 +67,5 @@ def create_needed_folders(training_uuid: str = 'some_uuid'): # pylint: disable= project_folder = create_project_folder( Context(organization='zauberzeug', project='pytest')) image_folder = create_image_folder(project_folder) - training_folder = TrainerLogic.create_training_folder(project_folder, training_uuid) + training_folder = create_training_folder(project_folder, training_uuid) return project_folder, image_folder, training_folder diff --git 
a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py index 21727b27..9faa656f 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py @@ -21,7 +21,8 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer create_active_training_file(trainer) trainer.init_from_last_training() - train_task = asyncio.get_running_loop().create_task(trainer.upload_model()) + train_task = asyncio.get_running_loop().create_task( + trainer.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, trainer._upload_model)) await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) await train_task diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index d6e9b78a..c7faeca8 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -38,7 +38,7 @@ async def start_training(self, model: str = 'model.model') -> None: async def start_training_from_scratch(self, base_model_id: str) -> None: await self.start_training(model=f'model_{base_model_id}.pt') - def get_new_model(self) -> Optional[BasicModel]: + def get_new_best_model(self) -> Optional[BasicModel]: if self.has_new_model: return BasicModel(confusion_matrix={}) return None diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 4bfdb743..40b706fd 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -3,30 +3,19 @@ import logging import os import shutil -import sys from abc import abstractmethod -from dataclasses import asdict from datetime import datetime -from glob import glob -from 
time import perf_counter -from typing import Coroutine, Dict, List, Optional, Union -from uuid import uuid4 +from typing import Coroutine, Dict, List, Optional -import socketio from dacite import from_dict -from fastapi.encoders import jsonable_encoder -from tqdm import tqdm -from ..data_classes import (BasicModel, Category, Context, Detections, Hyperparameter, ModelInformation, TrainerState, - Training, TrainingData, TrainingError, TrainingOut) -from ..helpers.misc import create_image_folder, create_project_folder, generate_training, is_valid_uuid4 -from .downloader import TrainingsDownloader +from ..data_classes import Detections, ModelInformation, TrainerState, TrainingError +from ..helpers.misc import create_image_folder, create_project_folder, images_for_ids, is_valid_uuid4 from .executor import Executor -from .io_helpers import ActiveTrainingIO -from .trainer_logic_abstraction import TrainerLogicAbstraction +from .trainer_logic_generic import TrainerLogicGeneric -class TrainerLogic(TrainerLogicAbstraction): +class TrainerLogic(TrainerLogicGeneric): def __init__(self, model_format: str) -> None: super().__init__(model_format) @@ -35,121 +24,22 @@ def __init__(self, model_format: str) -> None: # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' self._executor: Optional[Executor] = None - self.training_task: Optional[asyncio.Task] = None self.start_training_task: Optional[Coroutine] = None - self.shutdown_event: asyncio.Event = asyncio.Event() - self.detection_progress = 0.0 @property def executor(self) -> Executor: assert self._executor is not None, 'executor must be set, call `run_training` first' return self._executor - def init_new_training(self, context: Context, details: Dict) -> None: - """Called on `begin_training` event from the Learning Loop. 
- Note that details needs the entries 'categories' and 'training_number'""" - - project_folder = create_project_folder(context) - if not self.keep_old_trainings: - # NOTE: We delete all existing training folders because they are not needed anymore. - TrainerLogic.delete_all_training_folders(project_folder) - self._training = generate_training(project_folder, context) - self._training.data = TrainingData(categories=Category.from_list(details['categories'])) - self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details) - self._training.training_number = details['training_number'] - self._training.base_model_id = details['id'] - self._training.training_state = TrainerState.Initialized - self._active_training_io = ActiveTrainingIO( - self._training.training_folder, self.loop_communicator, context) - logging.info(f'training initialized: {self._training}') - - async def try_continue_run_if_incomplete(self) -> bool: - if not self.training_active and self.last_training_io.exists(): - logging.info('found incomplete training, continuing now.') - self.init_from_last_training() - asyncio.get_event_loop().create_task(self.run()) - return True - return False - - def init_from_last_training(self) -> None: - self._training = self.last_training_io.load() - assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder' - self._active_training_io = ActiveTrainingIO( - self._training.training_folder, self.loop_communicator, self._training.context) - - async def begin_training(self, organization: str, project: str, details: Dict) -> None: - self.init_new_training(Context(organization=organization, project=project), details) - asyncio.get_event_loop().create_task(self.run()) - - async def run(self) -> None: - """Called on `begin_training` event from the Learning Loop.""" - - self.errors.reset_all() - try: - self.training_task = asyncio.get_running_loop().create_task(self._run_training_loop()) - await 
self.training_task # Object is used to potentially cancel the task - except asyncio.CancelledError: - if not self.shutdown_event.is_set(): - logging.info('training task was cancelled but not by shutdown event') - self.active_training.training_state = TrainerState.ReadyForCleanup - self.last_training_io.save(self.active_training) - await self.clear_training() - - except Exception as e: - logging.exception(f'Error in train: {e}') - - # ---------------------------------------- TRAINING STATES ---------------------------------------- - - async def _run_training_loop(self) -> None: - """asyncio.CancelledError is catched in train""" - - if not self.training_active: - logging.error('could not start training - trainer is not initialized') - return - - while self._training is not None: - tstate = self.active_training.training_state - logging.info(f'STATE LOOP: {tstate}, eerrors: {self.errors.errors}') - await asyncio.sleep(0.6) # Note: Required for pytests! - if tstate == TrainerState.Initialized: # -> DataDownloading -> DataDownloaded - await self.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare) - elif tstate == TrainerState.DataDownloaded: # -> TrainModelDownloading -> TrainModelDownloaded - await self.perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model) - elif tstate == TrainerState.TrainModelDownloaded: # -> TrainingRunning -> TrainingFinished - await self.perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train) - elif tstate == TrainerState.TrainingFinished: # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced - await self.perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self.sync_confusion_matrix) - # await self.ensure_confusion_matrix_synced() - elif tstate == TrainerState.ConfusionMatrixSynced: # -> TrainModelUploading -> 
TrainModelUploaded - await self.upload_model() - elif tstate == TrainerState.TrainModelUploaded: # -> Detecting -> Detected - await self.perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections) - elif tstate == TrainerState.Detected: # -> DetectionUploading -> ReadyForCleanup - await self.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions) - elif tstate == TrainerState.ReadyForCleanup: # -> RESTART or TrainingFinished - await self.clear_training() - self.may_restart() - - async def _prepare(self) -> None: - self.data_exchanger.set_context(self.active_training.context) - downloader = TrainingsDownloader(self.data_exchanger) - image_data, skipped_image_count = await downloader.download_training_data(self.active_training.images_folder) - assert self.active_training.data is not None, 'training.data must be set' - self.active_training.data.image_data = image_data - self.active_training.data.skipped_image_count = skipped_image_count - - async def _download_model(self) -> None: - model_id = self.active_training.base_model_id - assert model_id is not None, 'model_id must be set' - if is_valid_uuid4( - self.active_training.base_model_id): # TODO this checks if we continue a training -> make more explicit - logging.info('loading model from Learning Loop') - logging.info(f'downloading model {model_id} as {self.model_format}') - await self.data_exchanger.download_model(self.active_training.training_folder, self.active_training.context, model_id, self.model_format) - shutil.move(f'{self.active_training.training_folder}/model.json', - f'{self.active_training.training_folder}/base_model.json') - else: - logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download') + @property + def hyperparameters(self) -> Optional[Dict]: + if self._training and self._training.data and self._training.data.hyperparameter: + information = {} + 
information['resolution'] = self._training.data.hyperparameter.resolution + information['flipRl'] = self._training.data.hyperparameter.flip_rl + information['flipUd'] = self._training.data.hyperparameter.flip_ud + return information + return None async def _train(self) -> None: previous_state = TrainerState.TrainModelDownloaded @@ -170,7 +60,7 @@ async def _train(self) -> None: break self.errors.reset(error_key) try: - await self.sync_confusion_matrix() + await self._sync_confusion_matrix() except asyncio.CancelledError: logging.warning('CancelledError in run_training') raise @@ -209,108 +99,6 @@ async def _start_training(self): self.start_training_task = self.start_training() await self.start_training_task - async def sync_confusion_matrix(self): - logging.info('Syncing confusion matrix') - error_key = 'sync_confusion_matrix' - try: - try: - model = self.get_new_model() - except Exception as exc: - logging.exception('error while getting new model') - raise Exception(f'Could not get new model: {str(exc)}') from exc - if model and self.active_training.data: - new_training = TrainingOut( - trainer_id=self.node_uuid, - confusion_matrix=model.confusion_matrix, - train_image_count=self.active_training.data.train_image_count(), - test_image_count=self.active_training.data.test_image_count(), - hyperparameters=self.hyperparameters) - - await asyncio.sleep(0.1) # NOTE needed for tests. 
- result = await self.sio_client.call('update_training', (self.active_training.context.organization, self.active_training.context.project, jsonable_encoder(new_training))) - if isinstance(result, dict) and result['success']: - logging.info(f'successfully updated training {asdict(new_training)}') - self.on_model_published(model) - else: - error_msg = f'Error for update_training: Response from loop was : {result}' - logging.error(error_msg) - raise Exception(error_msg) - except socketio.exceptions.BadNamespaceError as e: # type: ignore - logging.error('Error during confusion matrix syncronization. BadNamespaceError') - self.errors.set(error_key, str(e)) - raise - except Exception as e: - logging.exception('Error during confusion matrix syncronization') - self.errors.set(error_key, str(e)) - raise - - self.errors.reset(error_key) - - async def upload_model(self) -> None: - error_key = 'upload_model' - previous_state = self.active_training.training_state - self.active_training.training_state = TrainerState.TrainModelUploading - try: - new_model_id = await self._upload_model_return_new_model_uuid(self.active_training.context) - if new_model_id is None: - self.active_training.training_state = TrainerState.ReadyForCleanup - logging.error('could not upload model - maybe training failed.. cleaning up') - return - assert new_model_id is not None, 'uploaded_model must be set' - logging.info(f'successfully uploaded model and received new model id: {new_model_id}') - self.active_training.model_id_for_detecting = new_model_id - except asyncio.CancelledError: - logging.warning('CancelledError in upload_model') - raise - except Exception as e: - logging.exception('Error in upload_model. Exception:') - self.errors.set(error_key, str(e)) - self.active_training.training_state = previous_state # TODO... going back is pointless here as it ends in a deadlock ?! 
- # self.training.training_state = TrainingState.ReadyForCleanup - else: - self.errors.reset(error_key) - self.active_training.training_state = TrainerState.TrainModelUploaded - self.last_training_io.save(self.active_training) - - async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]: - """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file. - Note that with the latest trainers the conversion to (.wts) is done by the trainer. - The conversion from .wts to .engine is done by the detector (needs to be done on target hardware). - Note that trainer may train with different classes, which is why we send an initial model.json file. - """ - files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files) - if files is None: - return None - - if isinstance(files, List): - files = {self.model_format: files} - assert isinstance(files, Dict), f'can only upload model as list or dict, but was {files}' - - already_uploaded_formats = self.active_training_io.load_model_upload_progress() - - new_model_uuid = None - for file_format in files: - if file_format in already_uploaded_formats: - continue - _files = files[file_format] - assert not any(f for f in _files if 'model.json' in f), "Upload 'model.json' not allowed (added automatically)." 
- _files.append(self.dump_categories_to_json()) - new_model_uuid = await self.data_exchanger.upload_model_get_uuid(context, _files, self.active_training.training_number, file_format) - if new_model_uuid is None: - return None - - already_uploaded_formats.append(file_format) - self.active_training_io.save_model_upload_progress(already_uploaded_formats) - - return new_model_uuid - - def dump_categories_to_json(self) -> str: - content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None - json_path = '/tmp/model.json' - with open(json_path, 'w') as f: - json.dump(content, f) - return json_path - async def _do_detections(self) -> None: context = self.active_training.context model_id = self.active_training.model_id_for_detecting @@ -323,8 +111,7 @@ async def _do_detections(self) -> None: await self.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format) with open(f'{tmp_folder}/model.json', 'r') as f: - content = json.load(f) - model_information = from_dict(data_class=ModelInformation, data=content) + model_information = from_dict(data_class=ModelInformation, data=json.load(f)) project_folder = create_project_folder(context) image_folder = create_image_folder(project_folder) @@ -340,32 +127,17 @@ async def _do_detections(self) -> None: self.detection_progress = 0.42 # await delete_corrupt_images(image_folder) - images = await asyncio.get_event_loop().run_in_executor(None, TrainerLogic.images_for_ids, image_ids, image_folder) + images = await asyncio.get_event_loop().run_in_executor(None, images_for_ids, image_ids, image_folder) + if not images: + self.active_training_io.save_detections([], 0) num_images = len(images) - logging.info(f'running detections on {num_images} images') + batch_size = 200 - idx = 0 - if not images: - self.active_training_io.save_detections([], idx) - for i in tqdm(range(0, num_images, batch_size), position=0, leave=True): + for idx, i in enumerate(range(0, num_images, 
batch_size)): self.detection_progress = 0.5 + (i/num_images)*0.5 batch_images = images[i:i+batch_size] batch_detections = await self._detect(model_information, batch_images, tmp_folder) self.active_training_io.save_detections(batch_detections, idx) - idx += 1 - - return None - - async def clear_training(self): - self.active_training_io.delete_detections() - self.active_training_io.delete_detection_upload_progress() - self.active_training_io.delete_detections_upload_file_index() - await self.clear_training_data(self.active_training.training_folder) - self.last_training_io.delete() - # self.training.training_state = TrainingState.TrainingFinished - - await self.node.send_status() - self._training = None async def stop(self) -> None: """If executor is running, stop it. Else cancel training task.""" @@ -383,44 +155,11 @@ async def stop(self) -> None: logging.info('cancelled training task') self.may_restart() - async def shutdown(self) -> None: - self.shutdown_event.set() - await self.stop() - await self.stop() # NOTE first stop may only stop training. 
- def get_log(self) -> str: return self.executor.get_log() - def may_restart(self) -> None: - if self.restart_after_training: - logging.info('restarting') - sys.exit(0) - else: - logging.info('not restarting') - - @property - def general_progress(self) -> Optional[float]: - """Represents the progress for different states.""" - if not self.training_active: - return None - - t_state = self.active_training.training_state - if t_state == TrainerState.DataDownloading: - return self.data_exchanger.progress - if t_state == TrainerState.TrainingRunning: - return self.training_progress - if t_state == TrainerState.Detecting: - return self.detection_progress - - return None # ---------------------------------------- ABSTRACT METHODS ---------------------------------------- - @property - @abstractmethod - def training_progress(self) -> Optional[float]: - """Represents the training progress.""" - raise NotImplementedError - @abstractmethod async def start_training(self) -> None: '''Should be used to start a training.''' @@ -440,90 +179,9 @@ async def resume(self) -> None: One may resume the training on a previously trained model stored by self.on_model_published(basic_model).''' @abstractmethod - def get_executor_error_from_log(self) -> Optional[str]: # TODO we should allow other options to get the error + def get_executor_error_from_log(self) -> Optional[str]: '''Should be used to provide error informations to the Learning Loop by extracting data from self.executor.get_log().''' - @abstractmethod - def get_new_model(self) -> Optional[BasicModel]: - '''Is called frequently in `try_sync_model` to check if a new "best" model is availabe. - Returns None if no new model could be found. Otherwise BasicModel(confusion_matrix, meta_information). - `confusion_matrix` contains a dict of all classes: - - The classes must be identified by their id, not their name. - - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives). 
- `meta_information` can hold any data which is helpful for self.on_model_published to store weight file etc for later upload via self.get_model_files - ''' - - @abstractmethod - def on_model_published(self, basic_model: BasicModel) -> None: - '''Called after a BasicModel has been successfully send to the Learning Loop. - The files for this model should be stored. - self.get_latest_model_files is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop. - In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model - ''' - - @abstractmethod - def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]: - '''Called when the Learning Loop requests to backup the latest model for the training. - Should return a list of file paths which describe the model. - These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.) - and will be stored in the Learning Loop unter the format of this trainer. - Note: by convention the weightfile should be named "model." where extension is the file format of the weightfile. - For example "model.pt" for pytorch or "model.weights" for darknet/yolo. - - If a trainer can also generate other formats (for example for an detector), - a dictionary mapping format -> list of files can be returned.''' - @abstractmethod async def _detect(self, model_information: ModelInformation, images: List[str], model_folder: str) -> List[Detections]: '''Called to run detections on a list of images.''' - - @abstractmethod - async def clear_training_data(self, training_folder: str) -> None: - '''Called after a training has finished. Deletes all data that is not needed anymore after a training run. 
- This can be old weightfiles or any additional files.''' - - # ---------------------------------------- HELPER METHODS ---------------------------------------- - - @staticmethod - def images_for_ids(image_ids, image_folder) -> List[str]: - logging.info(f'### Going to get images for {len(image_ids)} images ids') - start = perf_counter() - images = [img for img in glob(f'{image_folder}/**/*.*', recursive=True) - if os.path.splitext(os.path.basename(img))[0] in image_ids] - end = perf_counter() - logging.info(f'found {len(images)} images for {len(image_ids)} image ids, which took {end-start:0.2f} seconds') - return images - - @staticmethod - def generate_training(project_folder: str, context: Context) -> Training: - training_uuid = str(uuid4()) - return Training( - id=training_uuid, - context=context, - project_folder=project_folder, - images_folder=create_image_folder(project_folder), - training_folder=TrainerLogic.create_training_folder(project_folder, training_uuid) - ) - - @staticmethod - def delete_all_training_folders(project_folder: str): - if not os.path.exists(f'{project_folder}/trainings'): - return - for uuid in os.listdir(f'{project_folder}/trainings'): - shutil.rmtree(f'{project_folder}/trainings/{uuid}', ignore_errors=True) - - @staticmethod - def create_training_folder(project_folder: str, trainings_id: str) -> str: - training_folder = f'{project_folder}/trainings/{trainings_id}' - os.makedirs(training_folder, exist_ok=True) - return training_folder - - @property - def hyperparameters(self) -> Optional[Dict]: - if self._training and self._training.data and self._training.data.hyperparameter: - information = {} - information['resolution'] = self._training.data.hyperparameter.resolution - information['flipRl'] = self._training.data.hyperparameter.flip_rl - information['flipUd'] = self._training.data.hyperparameter.flip_ud - return information - return None diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py 
b/learning_loop_node/trainer/trainer_logic_abstraction.py index b7f6b006..2b432998 100644 --- a/learning_loop_node/trainer/trainer_logic_abstraction.py +++ b/learning_loop_node/trainer/trainer_logic_abstraction.py @@ -1,9 +1,7 @@ -import asyncio -import logging import os import time from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Callable, Coroutine, List, Optional +from typing import TYPE_CHECKING, Dict, List, Optional from socketio import AsyncClient @@ -90,27 +88,32 @@ def training_uptime(self) -> Optional[float]: return None @property - def training_data(self) -> TrainingData | None: + def training_data(self) -> Optional[TrainingData]: if self.training_active and self.active_training.data: return self.active_training.data return None @property - def training_context(self) -> Context | None: + def training_context(self) -> Optional[Context]: if self.training_active: return self.active_training.context return None + # --- ABSTRACT PROPERTIES + # --------- implemented in TrainerLogicGeneric + @property @abstractmethod - def general_progress(self) -> float | None: + def general_progress(self) -> Optional[float]: """Returns the general progress of the training per state or None if idle""" + # --------- implemented in TrainerLogic(with Executor) @property @abstractmethod - def provided_pretrained_models(self) -> List[PretrainedModel]: - """Returns the list of provided pretrained models""" + def hyperparameters(self) -> Optional[Dict]: + """Returns the currently used hyperparameters if available""" + # --------- not implemented in any abstract class @property @abstractmethod def model_architecture(self) -> Optional[str]: @@ -118,44 +121,26 @@ def model_architecture(self) -> Optional[str]: @property @abstractmethod - def hyperparameters(self) -> dict | None: - """Returns the currently used hyperparameters if available""" + def provided_pretrained_models(self) -> List[PretrainedModel]: + """Returns the list of provided pretrained models""" - 
@abstractmethod - async def begin_training(self, organization: str, project: str, details: dict): - """Starts the training process""" + # --- ABSTRACT METHODS ----- + # --------- implemented in TrainerLogicGeneric --- @abstractmethod - async def stop(self): - """Stops the training process""" + async def on_shutdown(self): + """Called when the trainer is shut down""" @abstractmethod - async def shutdown(self): - """Stops the training process and releases resources""" + async def begin_training(self, organization: str, project: str, details: dict): + """Starts the training process""" @abstractmethod async def try_continue_run_if_incomplete(self) -> bool: """Start training continuation if possible, returns True if continuation started""" - async def perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False): - await asyncio.sleep(0.1) - logging.info(f'Performing state: {state_during}') - previous_state = self.active_training.training_state - self.active_training.training_state = state_during - await asyncio.sleep(0.1) - if reset_early: - self.errors.reset(error_key) - - try: - await action() - except asyncio.CancelledError: - logging.warning(f'CancelledError in {state_during}') - raise - except Exception as e: - self.errors.set(error_key, str(e)) - logging.exception(f'Error in {state_during} - Exception:') - self.active_training.training_state = previous_state - else: - self.errors.reset(error_key) - self.active_training.training_state = state_after - self.last_training_io.save(self.active_training) + # --- implemented in TrainerLogic(with Executor) --- + + @abstractmethod + async def stop(self): + """Stops the training process""" diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py new file mode 100644 index 00000000..ac0479c1 --- /dev/null +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -0,0 +1,325 @@ 
+import asyncio +import json +import logging +import shutil +import sys +from abc import abstractmethod +from dataclasses import asdict +from typing import Callable, Coroutine, Dict, List, Optional, Union + +from dacite import from_dict +from fastapi.encoders import jsonable_encoder + +from ..data_classes import BasicModel, Category, Context, Hyperparameter, TrainerState, TrainingData, TrainingOut +from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4 +from .downloader import TrainingsDownloader +from .io_helpers import ActiveTrainingIO +from .trainer_logic_abstraction import TrainerLogicAbstraction + + +class TrainerLogicGeneric(TrainerLogicAbstraction): + + def __init__(self, model_format: str) -> None: + super().__init__(model_format) + self.training_task: Optional[asyncio.Task] = None + self.detection_progress = 0.0 + self.shutdown_event: asyncio.Event = asyncio.Event() + + @property + def general_progress(self) -> Optional[float]: + """Represents the progress for different states.""" + if not self.training_active: + return None + + t_state = self.active_training.training_state + if t_state == TrainerState.DataDownloading: + return self.data_exchanger.progress + if t_state == TrainerState.TrainingRunning: + return self.training_progress + if t_state == TrainerState.Detecting: + return self.detection_progress + + return None + + def init_new_training(self, context: Context, details: Dict) -> None: + """Called on `begin_training` event from the Learning Loop. + Note that details needs the entries 'categories' and 'training_number'""" + + project_folder = create_project_folder(context) + if not self.keep_old_trainings: + # NOTE: We delete all existing training folders because they are not needed anymore. 
+ delete_all_training_folders(project_folder) + self._training = generate_training(project_folder, context) + self._training.data = TrainingData(categories=Category.from_list(details['categories'])) + self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details) + self._training.training_number = details['training_number'] + self._training.base_model_id = details['id'] + self._training.training_state = TrainerState.Initialized + self._active_training_io = ActiveTrainingIO( + self._training.training_folder, self.loop_communicator, context) + logging.info(f'training initialized: {self._training}') + + async def try_continue_run_if_incomplete(self) -> bool: + if not self.training_active and self.last_training_io.exists(): + logging.info('found incomplete training, continuing now.') + self.init_from_last_training() + asyncio.get_event_loop().create_task(self.run()) + return True + return False + + def init_from_last_training(self) -> None: + self._training = self.last_training_io.load() + assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder' + self._active_training_io = ActiveTrainingIO( + self._training.training_folder, self.loop_communicator, self._training.context) + + async def begin_training(self, organization: str, project: str, details: Dict) -> None: + """Called on `begin_training` event from the Learning Loop.""" + + self.init_new_training(Context(organization=organization, project=project), details) + asyncio.get_event_loop().create_task(self.run()) + + async def run(self) -> None: + self.errors.reset_all() + try: + self.training_task = asyncio.get_running_loop().create_task(self._training_loop()) + await self.training_task # NOTE: Task object is used to potentially cancel the task + except asyncio.CancelledError: + if not self.shutdown_event.is_set(): + logging.info('training task was cancelled but not by shutdown event') + self.active_training.training_state = 
TrainerState.ReadyForCleanup + self.last_training_io.save(self.active_training) + await self.clear_training() + except Exception as e: + logging.exception(f'Error in train: {e}') + + # ---------------------------------------- TRAINING STATES ---------------------------------------- + + async def _training_loop(self) -> None: + """asyncio.CancelledError is catched in run""" + + assert self.training_active + + while self._training is not None: + tstate = self.active_training.training_state + logging.info(f'STATE LOOP: {tstate}, eerrors: {self.errors.errors}') + await asyncio.sleep(0.6) # Note: Required for pytests! + if tstate == TrainerState.Initialized: # -> DataDownloading -> DataDownloaded + await self.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare) + elif tstate == TrainerState.DataDownloaded: # -> TrainModelDownloading -> TrainModelDownloaded + await self.perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model) + elif tstate == TrainerState.TrainModelDownloaded: # -> TrainingRunning -> TrainingFinished + await self.perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train) + elif tstate == TrainerState.TrainingFinished: # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced + await self.perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix) + elif tstate == TrainerState.ConfusionMatrixSynced: # -> TrainModelUploading -> TrainModelUploaded + await self.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model) + elif tstate == TrainerState.TrainModelUploaded: # -> Detecting -> Detected + await self.perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections) + elif tstate == TrainerState.Detected: # -> DetectionUploading -> 
ReadyForCleanup + await self.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions) + elif tstate == TrainerState.ReadyForCleanup: # -> RESTART or TrainingFinished + await self.clear_training() + self.may_restart() + + async def perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False): + await asyncio.sleep(0.1) + logging.info(f'Performing state: {state_during}') + previous_state = self.active_training.training_state + self.active_training.training_state = state_during + await asyncio.sleep(0.1) + if reset_early: + self.errors.reset(error_key) + + try: + if await action(): + logging.error('Something went really bad.. cleaning up') + state_after = TrainerState.ReadyForCleanup + except asyncio.CancelledError: + logging.warning(f'CancelledError in {state_during}') + raise + except Exception as e: + self.errors.set(error_key, str(e)) + logging.exception(f'Error in {state_during} - Exception:') + self.active_training.training_state = previous_state + else: + self.errors.reset(error_key) + self.active_training.training_state = state_after + self.last_training_io.save(self.active_training) + + async def _prepare(self) -> None: + self.data_exchanger.set_context(self.active_training.context) + downloader = TrainingsDownloader(self.data_exchanger) + image_data, skipped_image_count = await downloader.download_training_data(self.active_training.images_folder) + assert self.active_training.data is not None, 'training.data must be set' + self.active_training.data.image_data = image_data + self.active_training.data.skipped_image_count = skipped_image_count + + async def _download_model(self) -> None: + model_id = self.active_training.base_model_id + assert model_id is not None, 'model_id must be set' + if is_valid_uuid4( + self.active_training.base_model_id): # TODO this checks if we continue a training -> make 
more explicit + logging.info('loading model from Learning Loop') + logging.info(f'downloading model {model_id} as {self.model_format}') + await self.data_exchanger.download_model(self.active_training.training_folder, self.active_training.context, model_id, self.model_format) + shutil.move(f'{self.active_training.training_folder}/model.json', + f'{self.active_training.training_folder}/base_model.json') + else: + logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download') + + async def _sync_confusion_matrix(self): + error_key = 'sync_confusion_matrix' + try: + new_best_model = self.get_new_best_model() + if new_best_model and self.active_training.data: + new_training = TrainingOut(trainer_id=self.node_uuid, + confusion_matrix=new_best_model.confusion_matrix, + train_image_count=self.active_training.data.train_image_count(), + test_image_count=self.active_training.data.test_image_count(), + hyperparameters=self.hyperparameters) + await asyncio.sleep(0.1) # NOTE needed for tests. + + result = await self.sio_client.call('update_training', ( + self.active_training.context.organization, self.active_training.context.project, jsonable_encoder(new_training))) + if isinstance(result, dict) and result['success']: + logging.info(f'successfully updated training {asdict(new_training)}') + self.on_model_published(new_best_model) + else: + raise Exception(f'Error for update_training: Response from loop was : {result}') + except Exception as e: + logging.exception('Error during confusion matrix syncronization') + self.errors.set(error_key, str(e)) + raise + self.errors.reset(error_key) + + async def _upload_model(self) -> None | bool: + """Returns True if the training should be cleaned up.""" + + new_model_id = await self._upload_model_return_new_model_uuid(self.active_training.context) + if new_model_id is None: + self.active_training.training_state = TrainerState.ReadyForCleanup + logging.error('could not upload model - maybe training failed.. 
cleaning up') + return True + logging.info(f'Successfully uploaded model and received new model id: {new_model_id}') + self.active_training.model_id_for_detecting = new_model_id + return None + + async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]: + """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file. + Note that with the latest trainers the conversion to (.wts) is done by the trainer. + The conversion from .wts to .engine is done by the detector (needs to be done on target hardware). + Note that trainer may train with different classes, which is why we send an initial model.json file. + """ + files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files) + if files is None: + return None + + if isinstance(files, List): + files = {self.model_format: files} + assert isinstance(files, Dict), f'can only upload model as list or dict, but was {files}' + + already_uploaded_formats = self.active_training_io.load_model_upload_progress() + + model_uuid = None + for file_format in [f for f in files if f not in already_uploaded_formats]: + _files = files[file_format] + [self.dump_categories_to_json()] + assert len([f for f in _files if 'model.json' in f]) == 1, "model.json must be included exactly once" + + model_uuid = await self.data_exchanger.upload_model_get_uuid(context, _files, self.active_training.training_number, file_format) + if model_uuid is None: + return None + + already_uploaded_formats.append(file_format) + self.active_training_io.save_model_upload_progress(already_uploaded_formats) + + return model_uuid + + def dump_categories_to_json(self) -> str: + content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None + json_path = '/tmp/model.json' + with open(json_path, 'w') as f: + json.dump(content, f) + return json_path + + async def clear_training(self): + self.active_training_io.delete_detections() + 
self.active_training_io.delete_detection_upload_progress() + self.active_training_io.delete_detections_upload_file_index() + await self.clear_training_data(self.active_training.training_folder) + self.last_training_io.delete() + # self.training.training_state = TrainingState.TrainingFinished + + await self.node.send_status() + self._training = None + + # ---------------------------------------- OTHER METHODS ---------------------------------------- + + def may_restart(self) -> None: + if self.restart_after_training: + logging.info('restarting') + sys.exit(0) + else: + logging.info('not restarting') + + async def on_shutdown(self) -> None: + self.shutdown_event.set() + await self.stop() + await self.stop() + + # ---------------------------------------- ABSTRACT PROPERTIES ---------------------------------------- + + @property + @abstractmethod + def training_progress(self) -> Optional[float]: + """Represents the training progress.""" + raise NotImplementedError + + # ---------------------------------------- ABSTRACT METHODS ---------------------------------------- + + @abstractmethod + async def _train(self) -> None: + '''Should be used to execute a training. + The model should be synchronized with the Learning Loop via self._sync_confusion_matrix() every now and then. + asyncio.CancelledError should be catched and re-raised.''' + + @abstractmethod + async def _do_detections(self) -> None: + '''Should be used to execute detections. + active_training_io.save_detections(...) should be used to store the detections. + asyncio.CancelledError should be catched and re-raised.''' + + @abstractmethod + def get_new_best_model(self) -> Optional[BasicModel]: + '''Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe. + Returns None if no new model could be found. Otherwise BasicModel(confusion_matrix, meta_information). + `confusion_matrix` contains a dict of all classes: + - The classes must be identified by their id, not their name. 
+ - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives). + `meta_information` can hold any data which is helpful for self.on_model_published to store weight file etc for later upload via self.get_model_files + ''' + + @abstractmethod + def on_model_published(self, basic_model: BasicModel) -> None: + '''Called after a BasicModel has been successfully send to the Learning Loop. + The files for this model should be stored. + self.get_latest_model_files is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop. + In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model + ''' + + @abstractmethod + def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]: + '''Called when the Learning Loop requests to backup the latest model for the training. + Should return a list of file paths which describe the model. + These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.) + and will be stored in the Learning Loop unter the format of this trainer. + Note: by convention the weightfile should be named "model." where extension is the file format of the weightfile. + For example "model.pt" for pytorch or "model.weights" for darknet/yolo. + + If a trainer can also generate other formats (for example for an detector), + a dictionary mapping format -> list of files can be returned.''' + + @abstractmethod + async def clear_training_data(self, training_folder: str) -> None: + '''Called after a training has finished. Deletes all data that is not needed anymore after a training run. 
+ This can be old weightfiles or any additional files.''' diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py index f2e011d5..c87124c1 100644 --- a/learning_loop_node/trainer/trainer_node.py +++ b/learning_loop_node/trainer/trainer_node.py @@ -32,7 +32,7 @@ async def on_startup(self): async def on_shutdown(self): self.log.info('shutdown detected, stopping training') - await self.trainer_logic.shutdown() + await self.trainer_logic.on_shutdown() async def on_repeat(self): try: @@ -72,7 +72,7 @@ async def send_status(self): status = TrainingStatus(id=self.uuid, name=self.name, - state=self.trainer_logic.state.value, + state=self.trainer_logic.state, errors={}, uptime=self.trainer_logic.training_uptime, progress=self.trainer_logic.general_progress) diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py index b3f1adb5..b24dc531 100644 --- a/mock_trainer/app_code/mock_trainer_logic.py +++ b/mock_trainer/app_code/mock_trainer_logic.py @@ -111,7 +111,7 @@ def training_progress(self) -> float: print(f'prog. 
is {self.current_iteration} / {self.max_iterations} = {self.current_iteration / self.max_iterations}') return self.current_iteration / self.max_iterations - def get_new_model(self) -> Optional[BasicModel]: + def get_new_best_model(self) -> Optional[BasicModel]: logging.warning('get_new_model called') if self.error_configuration.get_new_model: raise Exception() diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py index 72929505..f20797b0 100644 --- a/mock_trainer/app_code/tests/test_mock_trainer.py +++ b/mock_trainer/app_code/tests/test_mock_trainer.py @@ -38,5 +38,5 @@ async def test_get_new_model(setup_test_project2): images_folder="", training_folder="",) mock_trainer.active_training.data = TrainingData(image_data=[], categories=[]) - model = mock_trainer.get_new_model() + model = mock_trainer.get_new_best_model() assert model is not None From eb9b5a962492d7f9d1acee2c490ac1498a01ef7a Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Fri, 8 Mar 2024 09:56:36 +0100 Subject: [PATCH 14/62] Fix minor bug in abstraction layer --- learning_loop_node/trainer/trainer_logic_abstraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py index 2b432998..64349e3d 100644 --- a/learning_loop_node/trainer/trainer_logic_abstraction.py +++ b/learning_loop_node/trainer/trainer_logic_abstraction.py @@ -83,7 +83,7 @@ def active_training(self) -> Training: @property def training_uptime(self) -> Optional[float]: - if self.active_training: + if self.training_active: return time.time() - self.active_training.start_time return None From e2272e547061f8f9a5aa7709ce46bb7d12e857ae Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Fri, 8 Mar 2024 11:23:47 +0100 Subject: [PATCH 15/62] Fix error states that are required by the backend tests --- learning_loop_node/data_classes/training.py | 3 +++ learning_loop_node/trainer/rest/backdoor_controls.py | 12 +++++++++--- learning_loop_node/trainer/trainer_logic.py | 1 - learning_loop_node/trainer/trainer_logic_generic.py | 4 +++- mock_trainer/app_code/mock_trainer_logic.py | 4 ++-- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index a0601c2d..449cc85b 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -165,3 +165,6 @@ class TrainingError(Exception): def __init__(self, cause: str, *args: object) -> None: super().__init__(*args) self.cause = cause + + def __str__(self) -> str: + return f'TrainingError: {self.cause}' diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py index 8349e737..a796fc4d 100644 --- a/learning_loop_node/trainer/rest/backdoor_controls.py +++ b/learning_loop_node/trainer/rest/backdoor_controls.py @@ -9,6 +9,7 @@ from fastapi import APIRouter, HTTPException, Request from ...data_classes import ErrorConfiguration, NodeState +from ..trainer_logic import TrainerLogic if TYPE_CHECKING: from ..trainer_node import TrainerNode @@ -95,6 +96,8 @@ async def add_steps(request: Request): trainer_node = trainer_node_from_request(request) trainer_logic = trainer_node.trainer_logic # NOTE: is MockTrainerLogic which has 'provide_new_model' and 'current_iteration' + assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic' + if not trainer_logic._executor or not trainer_logic._executor.is_process_running(): # pylint: disable=protected-access training = trainer_logic._training # pylint: disable=protected-access logging.error(f'cannot add steps when not running, state: 
{training.training_state if training else "None"}') @@ -109,7 +112,7 @@ async def add_steps(request: Request): for _ in range(steps): try: logging.warning('calling sync_confusion_matrix') - await trainer_logic.sync_confusion_matrix() + await trainer_logic._sync_confusion_matrix() # pylint: disable=protected-access except Exception: pass # Tests can force synchroniation to fail, error state is reported to backend trainer_logic.provide_new_model = previous_state # type: ignore @@ -119,11 +122,14 @@ async def add_steps(request: Request): @router.post("/kill_training_process") async def kill_process(request: Request): + # pylint: disable=protected-access trainer_node = trainer_node_from_request(request) - if not trainer_node.trainer_logic._executor or not trainer_node.trainer_logic._executor.is_process_running(): + trainer_logic = trainer_node.trainer_logic + assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic' + if not trainer_logic._executor or not trainer_logic._executor.is_process_running(): raise HTTPException(status_code=409, detail="trainer is not running") - trainer_node.trainer_logic._executor.stop() + trainer_logic._executor.stop() @router.post("/force_status_update") diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 40b706fd..82fd8aad 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -71,7 +71,6 @@ async def _train(self) -> None: error = self.get_executor_error_from_log() if error: - self.errors.set(error_key, error) raise TrainingError(cause=error) # TODO check if this works: # if self.executor.return_code != 0: diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index ac0479c1..7221e6ec 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -143,7 +143,8 @@ async def 
perform_state(self, error_key: str, state_during: TrainerState, state_ logging.exception(f'Error in {state_during} - Exception:') self.active_training.training_state = previous_state else: - self.errors.reset(error_key) + if not reset_early: + self.errors.reset(error_key) self.active_training.training_state = state_after self.last_training_io.save(self.active_training) @@ -169,6 +170,7 @@ async def _download_model(self) -> None: logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download') async def _sync_confusion_matrix(self): + '''NOTE: This stage sets the errors explicitly because it may be used inside the training stage.''' error_key = 'sync_confusion_matrix' try: new_best_model = self.get_new_best_model() diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py index b24dc531..e88a2de3 100644 --- a/mock_trainer/app_code/mock_trainer_logic.py +++ b/mock_trainer/app_code/mock_trainer_logic.py @@ -32,7 +32,7 @@ async def resume(self) -> None: async def start_training(self) -> None: self.current_iteration = 0 if self.error_configuration.begin_training: - raise Exception() + raise Exception('Could not start training') self.executor.start('while true; do sleep 1; done') async def start_training_from_scratch(self, base_model_id: str) -> None: @@ -114,7 +114,7 @@ def training_progress(self) -> float: def get_new_best_model(self) -> Optional[BasicModel]: logging.warning('get_new_model called') if self.error_configuration.get_new_model: - raise Exception() + raise Exception('Could not get new model') if not self.provide_new_model: return None self.current_iteration += 1 From ee1d113c4d4777dc964c2e0f5fa28f443526ae98 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Fri, 8 Mar 2024 11:30:06 +0100 Subject: [PATCH 16/62] fix more tests (use enums) --- .../trainer/tests/states/test_state_detecting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py index d48279ee..fbb8e9c0 100644 --- a/learning_loop_node/trainer/tests/states/test_state_detecting.py +++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py @@ -23,8 +23,8 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi TrainerState.Detected, trainer._do_detections) ) - await assert_training_state(trainer.active_training, 'detecting', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, 'detected', timeout=10, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=1, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=10, interval=0.001) assert trainer_has_error(trainer) is False assert trainer.active_training.training_state == TrainerState.Detected @@ -40,7 +40,7 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, 'detecting', timeout=5, interval=0.001) + await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=5, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) From 6a83928e372c3a5f30f033bd9ea1c8928e50c8d8 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Fri, 8 Mar 2024 11:35:51 +0100 Subject: [PATCH 17/62] try reduce flakynes of test_about_endpoint --- learning_loop_node/detector/tests/test_client_communication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_loop_node/detector/tests/test_client_communication.py b/learning_loop_node/detector/tests/test_client_communication.py index 97daf93a..97e3f074 100644 --- a/learning_loop_node/detector/tests/test_client_communication.py +++ b/learning_loop_node/detector/tests/test_client_communication.py @@ -90,7 +90,7 @@ async def test_sio_upload(test_detector_node: DetectorNode, sio_client): # NOTE: This test seems to be flaky. async def test_about_endpoint(test_detector_node: DetectorNode): - await asyncio.sleep(1) + await asyncio.sleep(3) response = requests.get(f'http://localhost:{GLOBALS.detector_port}/about', timeout=30) assert response.status_code == 200 From 704cdf4d367940502ada454df2d99fa8dd89fcd2 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Fri, 8 Mar 2024 12:42:06 +0100 Subject: [PATCH 18/62] fix mock_trainer tests --- mock_trainer/app_code/tests/test_detections.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py index df6e1292..42fbfe8b 100644 --- a/mock_trainer/app_code/tests/test_detections.py +++ b/mock_trainer/app_code/tests/test_detections.py @@ -5,10 +5,9 @@ from learning_loop_node.data_classes import Category, Context from learning_loop_node.globals import GLOBALS -from learning_loop_node.helpers.misc import create_project_folder +from learning_loop_node.helpers.misc import create_project_folder, generate_training from learning_loop_node.loop_communication import LoopCommunicator from learning_loop_node.tests import test_helper -from learning_loop_node.trainer.trainer_logic import TrainerLogic from learning_loop_node.trainer.trainer_node import TrainerNode from 
..mock_trainer_logic import MockTrainerLogic @@ -33,7 +32,7 @@ async def test_all(setup_test_project1, glc: LoopCommunicator): # pylint: disab trainer.init_new_training(context=context, details=details) project_folder = create_project_folder(context) - training = TrainerLogic.generate_training(project_folder, context) + training = generate_training(project_folder, context) training.model_id_for_detecting = latest_model_id trainer._training = training # pylint: disable=protected-access await trainer._do_detections() # pylint: disable=protected-access From 811514425120302bd18ac277b9e1ddee51d82161 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Fri, 22 Mar 2024 19:19:41 +0100 Subject: [PATCH 19/62] improve code documentation, abstraction layers and api --- learning_loop_node/data_classes/training.py | 23 +- learning_loop_node/data_exchanger.py | 8 +- learning_loop_node/helpers/misc.py | 5 - learning_loop_node/py.typed | 0 learning_loop_node/trainer/io_helpers.py | 10 + learning_loop_node/trainer/tests/conftest.py | 28 +- .../trainer/tests/state_helper.py | 2 +- .../tests/states/test_state_cleanup.py | 4 +- .../tests/states/test_state_detecting.py | 36 +- .../states/test_state_download_train_model.py | 40 +- .../tests/states/test_state_prepare.py | 28 +- .../test_state_sync_confusion_matrix.py | 38 +- .../trainer/tests/states/test_state_train.py | 36 +- .../states/test_state_upload_detections.py | 48 +-- .../tests/states/test_state_upload_model.py | 34 +- .../trainer/tests/test_errors.py | 18 +- learning_loop_node/trainer/trainer_logic.py | 44 +- .../trainer/trainer_logic_abstraction.py | 146 ------- .../trainer/trainer_logic_generic.py | 395 ++++++++++++------ learning_loop_node/trainer/trainer_node.py | 4 +- mock_trainer/app_code/progress_simulator.py | 4 +- .../app_code/tests/test_detections.py | 2 +- .../app_code/tests/test_mock_trainer.py | 2 +- 23 files changed, 490 insertions(+), 465 deletions(-) create mode 100644 learning_loop_node/py.typed delete 
mode 100644 learning_loop_node/trainer/trainer_logic_abstraction.py diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index 449cc85b..2ce1c95b 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -17,6 +17,14 @@ class Hyperparameter(): flip_rl: bool flip_ud: bool + @staticmethod + def from_data(data: Dict): + return Hyperparameter( + resolution=data['resolution'], + flip_rl=data.get('flip_rl', False), + flip_ud=data.get('flip_ud', False) + ) + @dataclass(**KWONLY_SLOTS) class TrainingData(): @@ -93,18 +101,25 @@ class Training(): id: str context: Context - project_folder: str - images_folder: str - training_folder: str + project_folder: str # f'{GLOBALS.data_folder}/{context.organization}/{context.project}' + images_folder: str # f'{project_folder}/images' + training_folder: str # f'{project_folder}/trainings/{trainings_id}' start_time: float = field(default_factory=time.time) - base_model_id: Optional[str] = None + base_model_id: Optional[str] = None # model uuid to download into base_model.json data: Optional[TrainingData] = None training_number: Optional[int] = None training_state: Optional[str] = None model_id_for_detecting: Optional[str] = None hyperparameters: Optional[Dict] = None + def set_values_from_data(self, data: Dict): + self.data = TrainingData(categories=Category.from_list(data['categories'])) + self.data.hyperparameter = Hyperparameter.from_data(data=data) + self.training_number = data['training_number'] + self.base_model_id = data['id'] + self.training_state = TrainerState.Initialized + @dataclass(**KWONLY_SLOTS) class TrainingOut(): diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index ab53b243..840a0fe9 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -122,11 +122,11 @@ async def _download_one_image(self, path: str, image_id: str, image_folder: str) 
if not await is_valid_image(filename, self.check_jpeg): os.remove(filename) - async def download_model(self, target_folder: str, context: Context, model_id: str, model_format: str) -> List[str]: + async def download_model(self, target_folder: str, context: Context, model_uuid: str, model_format: str) -> List[str]: """Downloads a model and returns the paths of the downloaded files.""" - logging.info(f'Downloading model {model_id} to {target_folder}..') + logging.info(f'Downloading model {model_uuid} to {target_folder}..') - path = f'/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' + path = f'/{context.organization}/projects/{context.project}/models/{model_uuid}/{model_format}/file' response = await self.loop_communicator.get(path, requires_login=False) if response.status_code != 200: content = response.json() @@ -150,7 +150,7 @@ async def download_model(self, target_folder: str, context: Context, model_id: s new_file = shutil.move(file, target_folder) created_files.append(new_file) - logging.info(f'---- downloaded model {model_id}/{model_format} to {tmp_path}. Moved to {target_folder}.') + logging.info(f'---- downloaded model {model_uuid}/{model_format} to {tmp_path}. Moved to {target_folder}.') return created_files async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]: diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py index 1f2e297d..5b996092 100644 --- a/learning_loop_node/helpers/misc.py +++ b/learning_loop_node/helpers/misc.py @@ -76,7 +76,6 @@ async def is_valid_image(filename: str, check_jpeg: bool) -> bool: return "OK" in out.decode() -@staticmethod async def delete_corrupt_images(image_folder: str, check_jpeg: bool = False) -> None: logging.info('deleting corrupt images') n_deleted = 0 @@ -189,7 +188,6 @@ def activate_asyncio_warnings() -> None: logging.exception('could not activate asyncio warnings. 
Exception:') -@staticmethod def images_for_ids(image_ids, image_folder) -> List[str]: logging.info(f'### Going to get images for {len(image_ids)} images ids') start = perf_counter() @@ -200,7 +198,6 @@ def images_for_ids(image_ids, image_folder) -> List[str]: return images -@staticmethod def generate_training(project_folder: str, context: Context) -> Training: training_uuid = str(uuid4()) return Training( @@ -212,7 +209,6 @@ def generate_training(project_folder: str, context: Context) -> Training: ) -@staticmethod def delete_all_training_folders(project_folder: str): if not os.path.exists(f'{project_folder}/trainings'): return @@ -220,7 +216,6 @@ def delete_all_training_folders(project_folder: str): shutil.rmtree(f'{project_folder}/trainings/{uuid}', ignore_errors=True) -@staticmethod def create_training_folder(project_folder: str, trainings_id: str) -> str: training_folder = f'{project_folder}/trainings/{trainings_id}' os.makedirs(training_folder, exist_ok=True) diff --git a/learning_loop_node/py.typed b/learning_loop_node/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/learning_loop_node/trainer/io_helpers.py b/learning_loop_node/trainer/io_helpers.py index 6ec7a5c3..453add80 100644 --- a/learning_loop_node/trainer/io_helpers.py +++ b/learning_loop_node/trainer/io_helpers.py @@ -14,6 +14,16 @@ from ..loop_communication import LoopCommunicator +class EnvironmentVars: + def __init__(self) -> None: + self.restart_after_training = os.environ.get( + 'RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1'] + self.keep_old_trainings = os.environ.get( + 'KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1'] + self.inference_batch_size = int( + os.environ.get('INFERENCE_BATCH_SIZE', '10')) + + class LastTrainingIO: def __init__(self, node_uuid: str) -> None: diff --git a/learning_loop_node/trainer/tests/conftest.py b/learning_loop_node/trainer/tests/conftest.py index 75937920..f07af98f 100644 --- a/learning_loop_node/trainer/tests/conftest.py +++ 
b/learning_loop_node/trainer/tests/conftest.py @@ -25,13 +25,13 @@ async def test_initialized_trainer_node(): trainer = TestingTrainerLogic() node = TrainerNode(name='test', trainer_logic=trainer, uuid='NOD30000-0000-0000-0000-000000000000') trainer._node = node # pylint: disable=protected-access - trainer.init_new_training(context=Context(organization='zauberzeug', project='demo'), - details={'categories': [], - 'id': '917d5c7f-403d-7e92-f95f-577f79c2273a', # version 1.2 of demo project - 'training_number': 0, - 'resolution': 800, - 'flip_rl': False, - 'flip_ud': False}) + trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'), + details={'categories': [], + 'id': '917d5c7f-403d-7e92-f95f-577f79c2273a', # version 1.2 of demo project + 'training_number': 0, + 'resolution': 800, + 'flip_rl': False, + 'flip_ud': False}) # pylint: disable=protected-access await node._on_startup() @@ -47,13 +47,13 @@ async def test_initialized_trainer(): # pylint: disable=protected-access await node._on_startup() trainer._node = node # pylint: disable=protected-access - trainer.init_new_training(context=Context(organization='zauberzeug', project='demo'), - details={'categories': [], - 'id': '917d5c7f-403d-7e92-f95f-577f79c2273a', # version 1.2 of demo project - 'training_number': 0, - 'resolution': 800, - 'flip_rl': False, - 'flip_ud': False}) + trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'), + details={'categories': [], + 'id': '917d5c7f-403d-7e92-f95f-577f79c2273a', # version 1.2 of demo project + 'training_number': 0, + 'resolution': 800, + 'flip_rl': False, + 'flip_ud': False}) yield trainer # await node._on_shutdown() diff --git a/learning_loop_node/trainer/tests/state_helper.py b/learning_loop_node/trainer/tests/state_helper.py index a5b982ec..01c9001d 100644 --- a/learning_loop_node/trainer/tests/state_helper.py +++ b/learning_loop_node/trainer/tests/state_helper.py @@ -7,7 +7,7 @@ def 
create_active_training_file(trainer: TrainerLogic, **kwargs) -> None: update_attributes(trainer._training, **kwargs) # pylint: disable=protected-access - trainer.node.last_training_io.save(training=trainer.active_training) + trainer.node.last_training_io.save(training=trainer.training) async def assert_training_state(training: Training, state: str, timeout: float, interval: float) -> None: diff --git a/learning_loop_node/trainer/tests/states/test_state_cleanup.py b/learning_loop_node/trainer/tests/states/test_state_cleanup.py index 3326d156..9fbf076d 100644 --- a/learning_loop_node/trainer/tests/states/test_state_cleanup.py +++ b/learning_loop_node/trainer/tests/states/test_state_cleanup.py @@ -5,7 +5,7 @@ async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state='ready_for_cleanup') - trainer.init_from_last_training() + trainer._init_from_last_training() trainer.active_training_io.save_detections(detections=[]) trainer.active_training_io.save_detection_upload_progress(count=42) @@ -16,7 +16,7 @@ async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic assert trainer.active_training_io.detection_upload_progress_exist() is True assert trainer.active_training_io.detections_upload_file_index_exists() is True - await trainer.clear_training() + await trainer._clear_training() assert trainer._training is None # pylint: disable=protected-access assert trainer.node.last_training_io.exists() is False diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py index fbb8e9c0..efd9b966 100644 --- a/learning_loop_node/trainer/tests/states/test_state_detecting.py +++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py @@ -19,28 +19,28 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi 
model_id_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a') # trainer.load_active_training() _ = asyncio.get_running_loop().create_task( - trainer.perform_state('do_detections', TrainerState.Detecting, - TrainerState.Detected, trainer._do_detections) + trainer._perform_state('do_detections', TrainerState.Detecting, + TrainerState.Detected, trainer._do_detections) ) - await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=10, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detecting, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detected, timeout=10, interval=0.001) assert trainer_has_error(trainer) is False - assert trainer.active_training.training_state == TrainerState.Detected - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.Detected + assert trainer.node.last_training_io.load() == trainer.training assert trainer.active_training_io.detections_exist() async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded) - trainer.init_from_last_training() - trainer.active_training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242' + trainer._init_from_last_training() + trainer.training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242' - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=5, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detecting, timeout=5, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -53,24 +53,24 
@@ async def test_model_not_downloadable_error(test_initialized_trainer: TestingTra trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded, model_id_for_detecting='00000000-0000-0000-0000-000000000000') # bad model id - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, 'detecting', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, 'train_model_uploaded', timeout=1, interval=0.001) + await assert_training_state(trainer.training, 'detecting', timeout=1, interval=0.001) + await assert_training_state(trainer.training, 'train_model_uploaded', timeout=1, interval=0.001) await asyncio.sleep(0.1) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.TrainModelUploaded - assert trainer.active_training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000' - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainModelUploaded + assert trainer.training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000' + assert trainer.node.last_training_io.load() == trainer.training def test_save_load_detections(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer) - trainer.init_from_last_training() + trainer._init_from_last_training() detections = [get_dummy_detections(), get_dummy_detections()] diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py index 12e9b745..f5ef302b 100644 --- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py +++ 
b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py @@ -12,31 +12,31 @@ async def test_downloading_is_successful(test_initialized_trainer: TestingTraine create_active_training_file(trainer, training_state=TrainerState.DataDownloaded) trainer.model_format = 'mocked' - trainer.init_from_last_training() + trainer._init_from_last_training() asyncio.get_running_loop().create_task( - trainer.perform_state('download_model', - TrainerState.TrainModelDownloading, - TrainerState.TrainModelDownloaded, trainer._download_model)) - await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, 'train_model_downloaded', timeout=1, interval=0.001) + trainer._perform_state('download_model', + TrainerState.TrainModelDownloading, + TrainerState.TrainModelDownloaded, trainer._download_model)) + await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001) + await assert_training_state(trainer.training, 'train_model_downloaded', timeout=1, interval=0.001) - assert trainer.active_training.training_state == TrainerState.TrainModelDownloaded - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainModelDownloaded + assert trainer.node.last_training_io.load() == trainer.training # file on disk - assert os.path.exists(f'{trainer.active_training.training_folder}/base_model.json') - assert os.path.exists(f'{trainer.active_training.training_folder}/file_1.txt') - assert os.path.exists(f'{trainer.active_training.training_folder}/file_2.txt') + assert os.path.exists(f'{trainer.training.training_folder}/base_model.json') + assert os.path.exists(f'{trainer.training.training_folder}/file_1.txt') + assert os.path.exists(f'{trainer.training.training_folder}/file_2.txt') async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogic): trainer = 
test_initialized_trainer create_active_training_file(trainer, training_state='data_downloaded') - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -49,13 +49,13 @@ async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic) trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.DataDownloaded, base_model_id='00000000-0000-0000-0000-000000000000') # bad model id) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.DataDownloaded, timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.DataDownloaded, timeout=1, interval=0.001) assert trainer.errors.has_error_for('download_model') assert trainer._training is not None # pylint: disable=protected-access - assert trainer.active_training.training_state == TrainerState.DataDownloaded - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.DataDownloaded + assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_prepare.py 
b/learning_loop_node/trainer/tests/states/test_state_prepare.py index 8c490c92..c6648ea4 100644 --- a/learning_loop_node/trainer/tests/states/test_state_prepare.py +++ b/learning_loop_node/trainer/tests/states/test_state_prepare.py @@ -15,22 +15,22 @@ def trainer_has_error(trainer: TrainerLogic): async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer) - trainer.init_from_last_training() + trainer._init_from_last_training() - await trainer.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare) + await trainer._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare) assert trainer_has_error(trainer) is False - assert trainer.active_training.training_state == TrainerState.DataDownloaded - assert trainer.active_training.data is not None - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.DataDownloaded + assert trainer.training.data is not None + assert trainer.node.last_training_io.load() == trainer.training async def test_abort_preparing(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -43,13 +43,13 @@ async def test_request_error(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, context=Context( 
organization='zauberzeug', project='some_bad_project')) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=3, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.Initialized, timeout=3, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=3, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Initialized, timeout=3, interval=0.001) assert trainer_has_error(trainer) assert trainer._training is not None # pylint: disable=protected-access - assert trainer.active_training.training_state == TrainerState.Initialized - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.Initialized + assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py index cc145233..2fe586aa 100644 --- a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py +++ b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py @@ -23,14 +23,14 @@ async def test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic): # TODO this requires trainer to have _training # trainer.load_active_training() create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, 
interval=0.001) + await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) assert trainer_has_error(trainer) is False - assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced + assert trainer.node.last_training_io.load() == trainer.training async def test_unsynced_model_available__sync_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture): @@ -40,15 +40,15 @@ async def test_unsynced_model_available__sync_successful(test_initialized_traine await mock_socket_io_call(mocker, test_initialized_trainer_node, {'success': True}) create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) - trainer.init_from_last_training() + trainer._init_from_last_training() trainer.has_new_model = True - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) assert trainer_has_error(trainer) is False # assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.node.last_training_io.load() == trainer.training async def test_unsynced_model_available__sio_not_connected(test_initialized_trainer_node: TrainerNode): @@ -60,14 +60,14 @@ async def test_unsynced_model_available__sio_not_connected(test_initialized_trai assert test_initialized_trainer_node.sio_client.connected is False trainer.has_new_model = True - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = 
asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.training async def test_unsynced_model_available__request_is_not_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture): @@ -79,14 +79,14 @@ async def test_unsynced_model_available__request_is_not_successful(test_initiali create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) trainer.has_new_model = True - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainingFinished + assert 
trainer.node.last_training_io.load() == trainer.training async def test_basic_mock(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture): diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py index 46a7f953..168a81d4 100644 --- a/learning_loop_node/trainer/tests/states/test_state_train.py +++ b/learning_loop_node/trainer/tests/states/test_state_train.py @@ -10,11 +10,11 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) await asyncio.sleep(0.1) # give tests a bit time to to check for the state assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'start_training' @@ -22,30 +22,30 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic # pylint: disable=protected-access assert trainer._executor is not None trainer._executor.stop() # NOTE normally a training terminates itself - await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.active_training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainingFinished + assert 
trainer.node.last_training_io.load() == trainer.training async def test_stop_running_training(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) # pylint: disable=protected-access - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'start_training' await trainer.stop() - await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.active_training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.training async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrainerLogic): @@ -53,20 +53,20 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain # NOTE e.g. 
when a node-computer is restarted create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) - trainer.init_from_last_training() + trainer._init_from_last_training() trainer._can_resume = True # pylint: disable=protected-access - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) # pylint: disable=protected-access - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'resume' # pylint: disable=protected-access assert trainer._executor is not None trainer._executor.stop() # NOTE normally a training terminates itself e.g - await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.active_training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py index 757cf968..8567e69d 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py @@ -44,14 +44,14 @@ async def create_valid_detection_file(trainer: TrainerLogic, number_of_entries: async def 
test_upload_successful(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.Detected) - trainer.init_from_last_training() + trainer._init_from_last_training() await create_valid_detection_file(trainer) await asyncio.get_running_loop().create_task( - trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions)) + trainer._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions)) - assert trainer.active_training.training_state == TrainerState.ReadyForCleanup - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.ReadyForCleanup + assert trainer.node.last_training_io.load() == trainer.training @pytest.mark.asyncio @@ -59,14 +59,14 @@ async def test_detection_upload_progress_is_stored(test_initialized_trainer: Tes trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.Detected) - trainer.init_from_last_training() + trainer._init_from_last_training() await create_valid_detection_file(trainer) assert trainer.active_training_io.load_detections_upload_file_index() == 0 # await trainer.upload_detections() await asyncio.get_running_loop().create_task( - trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions)) + trainer._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions)) assert trainer.active_training_io.load_detection_upload_progress() == 0 # Progress is reset for every file assert trainer.active_training_io.load_detections_upload_file_index() == 1 @@ -77,7 +77,7 @@ async def 
test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.Detected) - trainer.init_from_last_training() + trainer._init_from_last_training() await create_valid_detection_file(trainer, 2, 0) await create_valid_detection_file(trainer, 2, 1) @@ -91,7 +91,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test for i in range(skip_detections, len(detections), batch_size): batch_detections = detections[i:i+batch_size] # pylint: disable=protected-access - await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size) + await trainer.active_training_io._upload_detections(trainer.training.context, batch_detections, i + batch_size) expected_value = i + batch_size if i + batch_size < len(detections) else 0 # Progress is reset for every file assert trainer.active_training_io.load_detection_upload_progress() == expected_value @@ -107,7 +107,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test for i in range(skip_detections, len(detections), batch_size): batch_detections = detections[i:i+batch_size] # pylint: disable=protected-access - await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size) + await trainer.active_training_io._upload_detections(trainer.training.context, batch_detections, i + batch_size) expected_value = i + batch_size if i + batch_size < len(detections) else 0 # Progress is reset for every file assert trainer.active_training_io.load_detection_upload_progress() == expected_value @@ -120,16 +120,16 @@ async def test_bad_status_from_LearningLoop(test_initialized_trainer: TestingTra create_active_training_file(trainer, training_state=TrainerState.Detected, context=Context( organization='zauberzeug', project='some_bad_project')) - trainer.init_from_last_training() + 
trainer._init_from_last_training() trainer.active_training_io.save_detections([get_dummy_detections()]) - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.Detected - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.Detected + assert trainer.node.last_training_io.load() == trainer.training async def test_other_errors(test_initialized_trainer: TestingTrainerLogic): @@ -137,27 +137,27 @@ async def test_other_errors(test_initialized_trainer: TestingTrainerLogic): # e.g. 
missing detection file create_active_training_file(trainer, training_state=TrainerState.Detected) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.Detected - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.Detected + assert trainer.node.last_training_io.load() == trainer.training async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.Detected) - trainer.init_from_last_training() + trainer._init_from_last_training() await create_valid_detection_file(trainer) - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py index 9faa656f..ac147065 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py +++ 
b/learning_loop_node/trainer/tests/states/test_state_upload_model.py @@ -19,29 +19,29 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer mock_upload_model_for_training(mocker, 'new_model_id') create_active_training_file(trainer) - trainer.init_from_last_training() + trainer._init_from_last_training() train_task = asyncio.get_running_loop().create_task( - trainer.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, trainer._upload_model)) + trainer._perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, trainer._upload_model)) - await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) await train_task assert trainer_has_error(trainer) is False - assert trainer.active_training.training_state == TrainerState.TrainModelUploaded - assert trainer.active_training.model_id_for_detecting is not None - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainModelUploaded + assert trainer.training.model_id_for_detecting is not None + assert trainer.node.last_training_io.load() == trainer.training async def test_abort_upload_model(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) await 
trainer.stop() await asyncio.sleep(0.1) @@ -57,18 +57,18 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) # TODO goes to finished because of the error - await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001) + await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced - assert trainer.active_training.model_id_for_detecting is None - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced + assert trainer.training.model_id_for_detecting is None + assert trainer.node.last_training_io.load() == trainer.training async def test_mock_loop_response_example(mocker: MockerFixture, test_initialized_trainer: TestingTrainerLogic): @@ -77,7 +77,7 @@ async def test_mock_loop_response_example(mocker: MockerFixture, test_initialize mock_upload_model_for_training(mocker, 'new_model_id') create_active_training_file(trainer) - trainer.init_from_last_training() + trainer._init_from_last_training() # pylint: disable=protected-access result = await trainer._upload_model_return_new_model_uuid(Context(organization='zauberzeug', project='demo')) diff --git a/learning_loop_node/trainer/tests/test_errors.py 
b/learning_loop_node/trainer/tests/test_errors.py index 1ba85572..bdb40c95 100644 --- a/learning_loop_node/trainer/tests/test_errors.py +++ b/learning_loop_node/trainer/tests/test_errors.py @@ -9,30 +9,30 @@ async def test_training_process_is_stopped_when_trainer_reports_error(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) - trainer.init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + trainer._init_from_last_training() + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) trainer.error_msg = 'some_error' - await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) async def test_log_can_provide_only_data_for_current_run(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) - trainer.init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + trainer._init_from_last_training() + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) await asyncio.sleep(0.1) # give tests a bit time to to check for the state assert trainer._executor is not None assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) == 1 trainer.error_msg = 'some_error' - 
await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) trainer.error_msg = None - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) await asyncio.sleep(1) assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) > 1 diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 82fd8aad..c5b47df9 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -5,7 +5,7 @@ import shutil from abc import abstractmethod from datetime import datetime -from typing import Coroutine, Dict, List, Optional +from typing import Coroutine, List, Optional from dacite import from_dict @@ -22,30 +22,24 @@ def __init__(self, model_format: str) -> None: self.model_format: str = model_format # NOTE: String to be used in the file path for the model on the server: # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' - + self._detection_progress: Optional[float] = None self._executor: Optional[Executor] = None self.start_training_task: Optional[Coroutine] = None + @property + def detection_progress(self) -> Optional[float]: + return self._detection_progress + @property def executor(self) -> Executor: assert self._executor is not None, 'executor must be set, call `run_training` first' return self._executor - @property - def hyperparameters(self) -> Optional[Dict]: - if self._training and self._training.data and self._training.data.hyperparameter: - information = {} - information['resolution'] = self._training.data.hyperparameter.resolution - information['flipRl'] = 
self._training.data.hyperparameter.flip_rl - information['flipUd'] = self._training.data.hyperparameter.flip_ud - return information - return None - async def _train(self) -> None: previous_state = TrainerState.TrainModelDownloaded error_key = 'run_training' - self._executor = Executor(self.active_training.training_folder) - self.active_training.training_state = TrainerState.TrainingRunning + self._executor = Executor(self.training.training_folder) + self.training.training_state = TrainerState.TrainingRunning try: await self._start_training() @@ -81,7 +75,7 @@ async def _train(self) -> None: logging.exception('Error in TrainingProcess') if self.executor.is_process_running(): self.executor.stop() - self.active_training.training_state = previous_state + self.training.training_state = previous_state raise async def _start_training(self): @@ -89,7 +83,7 @@ async def _start_training(self): if self.can_resume(): self.start_training_task = self.resume() else: - base_model_id = self.active_training.base_model_id + base_model_id = self.training.base_model_id if not is_valid_uuid4(base_model_id): # TODO this check was done earlier! 
assert isinstance(base_model_id, str) # TODO this could be removed here and accessed via self.training.base_model_id @@ -99,8 +93,8 @@ async def _start_training(self): await self.start_training_task async def _do_detections(self) -> None: - context = self.active_training.context - model_id = self.active_training.model_id_for_detecting + context = self.training.context + model_id = self.training.model_id_for_detecting assert model_id, 'model_id must be set' tmp_folder = f'/tmp/model_for_auto_detections_{model_id}_{self.model_format}' @@ -108,22 +102,22 @@ async def _do_detections(self) -> None: os.makedirs(tmp_folder) logging.info(f'downloading detection model to {tmp_folder}') - await self.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format) + await self.node.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format) with open(f'{tmp_folder}/model.json', 'r') as f: model_information = from_dict(data_class=ModelInformation, data=json.load(f)) project_folder = create_project_folder(context) image_folder = create_image_folder(project_folder) - self.data_exchanger.set_context(context) + self.node.data_exchanger.set_context(context) image_ids = [] for state, p in zip(['inbox', 'annotate', 'review', 'complete'], [0.1, 0.2, 0.3, 0.4]): - self.detection_progress = p + self._detection_progress = p logging.info(f'fetching image ids of {state}') - new_ids = await self.data_exchanger.fetch_image_uuids(query_params=f'state={state}') + new_ids = await self.node.data_exchanger.fetch_image_uuids(query_params=f'state={state}') image_ids += new_ids logging.info(f'downloading {len(new_ids)} images') - await self.data_exchanger.download_images(new_ids, image_folder) - self.detection_progress = 0.42 + await self.node.data_exchanger.download_images(new_ids, image_folder) + self._detection_progress = 0.42 # await delete_corrupt_images(image_folder) images = await asyncio.get_event_loop().run_in_executor(None, images_for_ids, image_ids, 
image_folder) @@ -133,7 +127,7 @@ async def _do_detections(self) -> None: batch_size = 200 for idx, i in enumerate(range(0, num_images, batch_size)): - self.detection_progress = 0.5 + (i/num_images)*0.5 + self._detection_progress = 0.5 + (i/num_images)*0.5 batch_images = images[i:i+batch_size] batch_detections = await self._detect(model_information, batch_images, tmp_folder) self.active_training_io.save_detections(batch_detections, idx) diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py deleted file mode 100644 index 64349e3d..00000000 --- a/learning_loop_node/trainer/trainer_logic_abstraction.py +++ /dev/null @@ -1,146 +0,0 @@ -import os -import time -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Dict, List, Optional - -from socketio import AsyncClient - -from ..data_classes import Context, Errors, PretrainedModel, TrainerState, Training, TrainingData -from ..data_exchanger import DataExchanger -from ..loop_communication import LoopCommunicator -from .io_helpers import ActiveTrainingIO, LastTrainingIO - -if TYPE_CHECKING: - from .trainer_node import TrainerNode - - -class TrainerLogicAbstraction(ABC): - - def __init__(self, model_format: str): - - # NOTE: String to be used in the file path for the model on the server: - # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' - self.model_format: str = model_format - - self._node: Optional['TrainerNode'] = None # type: ignore - self._last_training_io: Optional[LastTrainingIO] = None # type: ignore - self.errors = Errors() - - self._training: Optional[Training] = None - self._active_training_io: Optional[ActiveTrainingIO] = None - - self.restart_after_training = os.environ.get('RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1'] - self.keep_old_trainings = os.environ.get('KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1'] - self.inference_batch_size = 
int(os.environ.get('INFERENCE_BATCH_SIZE', '10')) - - @property - def node(self) -> 'TrainerNode': - assert self._node is not None, 'node should be set by TrainerNode before initialization' - return self._node - - @property - def last_training_io(self) -> LastTrainingIO: - assert self._last_training_io is not None, 'last_training_io should be set by TrainerNode before initialization' - return self._last_training_io - - @property - def data_exchanger(self) -> DataExchanger: - return self.node.data_exchanger - - @property - def loop_communicator(self) -> LoopCommunicator: - return self.node.loop_communicator - - @property - def node_uuid(self) -> str: - return self.node.uuid - - @property - def sio_client(self) -> AsyncClient: - return self.node.sio_client - - @property - def active_training_io(self) -> ActiveTrainingIO: - assert self._active_training_io is not None, 'active_training_io must be set, call `init` first' - return self._active_training_io - - @property - def training_active(self) -> bool: - """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'""" - return self._training is not None and self._active_training_io is not None - - @property - def state(self) -> str: - if (not self.training_active) or (self.active_training.training_state is None): - return TrainerState.Idle.value - else: - return self.active_training.training_state - - @property - def active_training(self) -> Training: - assert self._training is not None, 'training must be initialized, call `init` first' - return self._training - - @property - def training_uptime(self) -> Optional[float]: - if self.training_active: - return time.time() - self.active_training.start_time - return None - - @property - def training_data(self) -> Optional[TrainingData]: - if self.training_active and self.active_training.data: - return self.active_training.data - return None - - @property - def training_context(self) -> Optional[Context]: - if self.training_active: - return 
self.active_training.context - return None - - # --- ABSTRACT PROPERTIES - # --------- implemented in TrainerLogicGeneric - - @property - @abstractmethod - def general_progress(self) -> Optional[float]: - """Returns the general progress of the training per state or None if idle""" - - # --------- implemented in TrainerLogic(with Executor) - @property - @abstractmethod - def hyperparameters(self) -> Optional[Dict]: - """Returns the currently used hyperparameters if available""" - - # --------- not implemented in any abstract class - @property - @abstractmethod - def model_architecture(self) -> Optional[str]: - """Returns the architecture name of the model if available""" - - @property - @abstractmethod - def provided_pretrained_models(self) -> List[PretrainedModel]: - """Returns the list of provided pretrained models""" - - # --- ABSTRACT METHODS ----- - # --------- implemented in TrainerLogicGeneric --- - - @abstractmethod - async def on_shutdown(self): - """Called when the trainer is shut down""" - - @abstractmethod - async def begin_training(self, organization: str, project: str, details: dict): - """Starts the training process""" - - @abstractmethod - async def try_continue_run_if_incomplete(self) -> bool: - """Start training continuation if possible, returns True if continuation started""" - - # --- implemented in TrainerLogic(with Executor) --- - - @abstractmethod - async def stop(self): - """Stops the training process""" diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index 7221e6ec..d9abff34 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -3,37 +3,123 @@ import logging import shutil import sys -from abc import abstractmethod +import time +from abc import ABC, abstractmethod from dataclasses import asdict -from typing import Callable, Coroutine, Dict, List, Optional, Union +from typing import TYPE_CHECKING, 
Callable, Coroutine, Dict, List, Optional, Union -from dacite import from_dict from fastapi.encoders import jsonable_encoder -from ..data_classes import BasicModel, Category, Context, Hyperparameter, TrainerState, TrainingData, TrainingOut +from ..data_classes import (BasicModel, Context, Errors, PretrainedModel, TrainerState, Training, TrainingData, + TrainingOut) from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4 from .downloader import TrainingsDownloader -from .io_helpers import ActiveTrainingIO -from .trainer_logic_abstraction import TrainerLogicAbstraction +from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO +if TYPE_CHECKING: + from .trainer_node import TrainerNode -class TrainerLogicGeneric(TrainerLogicAbstraction): - def __init__(self, model_format: str) -> None: - super().__init__(model_format) +class TrainerLogicGeneric(ABC): + + def __init__(self, model_format: str): + + # NOTE: model_format is used in the file path for the model on the server: + # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' + self.model_format: str = model_format + self.errors = Errors() + self.training_task: Optional[asyncio.Task] = None - self.detection_progress = 0.0 self.shutdown_event: asyncio.Event = asyncio.Event() + self._node: Optional['TrainerNode'] = None # type: ignore + self._last_training_io: Optional[LastTrainingIO] = None # type: ignore + + self._training: Optional[Training] = None + self._active_training_io: Optional[ActiveTrainingIO] = None + self._environment_vars = EnvironmentVars() + + # ---------------------------------------- PROPERTIES TO AVOID CHECKING FOR NONE ---------------------------------------- + + @property + def node(self) -> 'TrainerNode': + assert self._node is not None, 'node should be set by TrainerNode before initialization' + return self._node + + @property + def last_training_io(self) -> LastTrainingIO: + assert 
self._last_training_io is not None, 'last_training_io should be set by TrainerNode before initialization' + return self._last_training_io + + @property + def active_training_io(self) -> ActiveTrainingIO: + assert self._active_training_io is not None, 'active_training_io must be set, call `init` first' + return self._active_training_io + + @property + def training(self) -> Training: + assert self._training is not None, 'training must be initialized, call `init` first' + return self._training + + @property + def training_data(self) -> Optional[TrainingData]: + if self.training_active and self.training.data: + return self.training.data + return None + + @property + def training_context(self) -> Optional[Context]: + if self.training_active: + return self.training.context + return None + # ---------------------------------------- PROPERTIES ---------------------------------------- + + @property + def training_active(self) -> bool: + """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'. + """ + return self._training is not None and self._active_training_io is not None + + @property + def state(self) -> str: + """Returns the current state of the training. Used solely by the node in send_status(). + """ + if (not self.training_active) or (self.training.training_state is None): + return TrainerState.Idle.value + else: + return self.training.training_state + + @property + def training_uptime(self) -> Optional[float]: + """Livetime of current Training object. Start time is set during initialization of Training object. + """ + if self.training_active: + return time.time() - self.training.start_time + return None + + @property + def hyperparameters(self) -> Optional[Dict]: + """Used in sync_confusion_matrix and send_status to provide information about the training configuration. 
+ """ + if self._training and self._training.data and self._training.data.hyperparameter: + information = {} + information['resolution'] = self._training.data.hyperparameter.resolution + information['flipRl'] = self._training.data.hyperparameter.flip_rl + information['flipUd'] = self._training.data.hyperparameter.flip_ud + return information + return None + @property def general_progress(self) -> Optional[float]: - """Represents the progress for different states.""" + """Represents the progress for different states, should run from 0 to 100 for each state. + Note that training_progress and detection_progress need to be implemented in the specific trainer. + """ if not self.training_active: return None - t_state = self.active_training.training_state + t_state = self.training.training_state if t_state == TrainerState.DataDownloading: - return self.data_exchanger.progress + return self.node.data_exchanger.progress if t_state == TrainerState.TrainingRunning: return self.training_progress if t_state == TrainerState.Detecting: @@ -41,45 +127,83 @@ def general_progress(self) -> Optional[float]: return None - def init_new_training(self, context: Context, details: Dict) -> None: - """Called on `begin_training` event from the Learning Loop. - Note that details needs the entries 'categories' and 'training_number'""" + # ---------------------------------------- ABSTRACT PROPERTIES ---------------------------------------- - project_folder = create_project_folder(context) - if not self.keep_old_trainings: - # NOTE: We delete all existing training folders because they are not needed anymore. 
- delete_all_training_folders(project_folder) - self._training = generate_training(project_folder, context) - self._training.data = TrainingData(categories=Category.from_list(details['categories'])) - self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details) - self._training.training_number = details['training_number'] - self._training.base_model_id = details['id'] - self._training.training_state = TrainerState.Initialized - self._active_training_io = ActiveTrainingIO( - self._training.training_folder, self.loop_communicator, context) - logging.info(f'training initialized: {self._training}') + @property + @abstractmethod + def training_progress(self) -> Optional[float]: + """Represents the training progress.""" + raise NotImplementedError + + @property + @abstractmethod + def detection_progress(self) -> Optional[float]: + """Represents the detection progress.""" + raise NotImplementedError + + @property + @abstractmethod + def model_architecture(self) -> Optional[str]: + """Returns the architecture name of the model if available""" + raise NotImplementedError + + @property + @abstractmethod + def provided_pretrained_models(self) -> List[PretrainedModel]: + """Returns the list of provided pretrained models""" + raise NotImplementedError + + # ---------------------------------------- METHODS ---------------------------------------- + + # NOTE: Trainings are started by the Learning Loop via the begin_training event + # or by the trainer itself via try_continue_run_if_incomplete. + # The trainer will then initialize a new training object and start the training loop. + # Initializing a new training object will create the folder structure for the training. + # The training loop will then run through the states of the training. async def try_continue_run_if_incomplete(self) -> bool: + """Tries to continue a training if the last training was not finished. 
+ """ if not self.training_active and self.last_training_io.exists(): + self._init_from_last_training() logging.info('found incomplete training, continuing now.') - self.init_from_last_training() - asyncio.get_event_loop().create_task(self.run()) + asyncio.get_event_loop().create_task(self._run()) return True return False - def init_from_last_training(self) -> None: + def _init_from_last_training(self) -> None: + """Initializes a new training object from the last training saved on disc via last_training_io. + """ self._training = self.last_training_io.load() assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder' self._active_training_io = ActiveTrainingIO( - self._training.training_folder, self.loop_communicator, self._training.context) + self._training.training_folder, self.node.loop_communicator, self._training.context) async def begin_training(self, organization: str, project: str, details: Dict) -> None: - """Called on `begin_training` event from the Learning Loop.""" + """Called on `begin_training` event from the Learning Loop. + """ + self._init_new_training(Context(organization=organization, project=project), details) + asyncio.get_event_loop().create_task(self._run()) + + def _init_new_training(self, context: Context, details: Dict) -> None: + """Called on `begin_training` event from the Learning Loop. + Note that details needs the entries 'categories' and 'training_number', + but also the hyperparameter entries. 
+ """ + project_folder = create_project_folder(context) + if not self._environment_vars.keep_old_trainings: + delete_all_training_folders(project_folder) + self._training = generate_training(project_folder, context) + self._training.set_values_from_data(details) - self.init_new_training(Context(organization=organization, project=project), details) - asyncio.get_event_loop().create_task(self.run()) + self._active_training_io = ActiveTrainingIO( + self._training.training_folder, self.node.loop_communicator, context) + logging.info(f'new training initialized: {self._training}') - async def run(self) -> None: + async def _run(self) -> None: + """Called on `begin_training` event from the Learning Loop. + Either via `begin_training` or `try_continue_run_if_incomplete`. + """ self.errors.reset_all() try: self.training_task = asyncio.get_running_loop().create_task(self._training_loop()) @@ -87,46 +211,47 @@ async def run(self) -> None: except asyncio.CancelledError: if not self.shutdown_event.is_set(): logging.info('training task was cancelled but not by shutdown event') - self.active_training.training_state = TrainerState.ReadyForCleanup - self.last_training_io.save(self.active_training) - await self.clear_training() + self.training.training_state = TrainerState.ReadyForCleanup + self.last_training_io.save(self.training) + await self._clear_training() except Exception as e: logging.exception(f'Error in train: {e}') # ---------------------------------------- TRAINING STATES ---------------------------------------- async def _training_loop(self) -> None: - """asyncio.CancelledError is catched in run""" - + """Cycle through the training states until the training is finished or + an asyncio.CancelledError is raised. 
+ """ assert self.training_active while self._training is not None: - tstate = self.active_training.training_state - logging.info(f'STATE LOOP: {tstate}, eerrors: {self.errors.errors}') + tstate = self.training.training_state await asyncio.sleep(0.6) # Note: Required for pytests! + if tstate == TrainerState.Initialized: # -> DataDownloading -> DataDownloaded - await self.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare) + await self._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare) elif tstate == TrainerState.DataDownloaded: # -> TrainModelDownloading -> TrainModelDownloaded - await self.perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model) + await self._perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model) elif tstate == TrainerState.TrainModelDownloaded: # -> TrainingRunning -> TrainingFinished - await self.perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train) + await self._perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train) elif tstate == TrainerState.TrainingFinished: # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced - await self.perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix) + await self._perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix) elif tstate == TrainerState.ConfusionMatrixSynced: # -> TrainModelUploading -> TrainModelUploaded - await self.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model) + await self._perform_state('upload_model', TrainerState.TrainModelUploading, 
TrainerState.TrainModelUploaded, self._upload_model) elif tstate == TrainerState.TrainModelUploaded: # -> Detecting -> Detected - await self.perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections) + await self._perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections) elif tstate == TrainerState.Detected: # -> DetectionUploading -> ReadyForCleanup - await self.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions) + await self._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions) elif tstate == TrainerState.ReadyForCleanup: # -> RESTART or TrainingFinished - await self.clear_training() + await self._clear_training() self.may_restart() - async def perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False): + async def _perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False): await asyncio.sleep(0.1) logging.info(f'Performing state: {state_during}') - previous_state = self.active_training.training_state - self.active_training.training_state = state_during + previous_state = self.training.training_state + self.training.training_state = state_during await asyncio.sleep(0.1) if reset_early: self.errors.reset(error_key) @@ -141,71 +266,78 @@ async def perform_state(self, error_key: str, state_during: TrainerState, state_ except Exception as e: self.errors.set(error_key, str(e)) logging.exception(f'Error in {state_during} - Exception:') - self.active_training.training_state = previous_state + self.training.training_state = previous_state else: if not reset_early: self.errors.reset(error_key) - self.active_training.training_state = state_after - 
self.last_training_io.save(self.active_training) + self.training.training_state = state_after + self.last_training_io.save(self.training) async def _prepare(self) -> None: - self.data_exchanger.set_context(self.active_training.context) - downloader = TrainingsDownloader(self.data_exchanger) - image_data, skipped_image_count = await downloader.download_training_data(self.active_training.images_folder) - assert self.active_training.data is not None, 'training.data must be set' - self.active_training.data.image_data = image_data - self.active_training.data.skipped_image_count = skipped_image_count + """Downloads images to the images_folder and saves annotations to training.data.image_data. + """ + self.node.data_exchanger.set_context(self.training.context) + downloader = TrainingsDownloader(self.node.data_exchanger) + image_data, skipped_image_count = await downloader.download_training_data(self.training.images_folder) + assert self.training.data is not None, 'training.data must be set' + self.training.data.image_data = image_data + self.training.data.skipped_image_count = skipped_image_count async def _download_model(self) -> None: - model_id = self.active_training.base_model_id - assert model_id is not None, 'model_id must be set' - if is_valid_uuid4( - self.active_training.base_model_id): # TODO this checks if we continue a training -> make more explicit + """If training is continued, the model is downloaded from the Learning Loop to the training_folder. + The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training. 
+ """ + model_id = self.training.base_model_id + # TODO this checks if we continue a training -> make more explicit + if model_id and is_valid_uuid4(self.training.base_model_id): logging.info('loading model from Learning Loop') logging.info(f'downloading model {model_id} as {self.model_format}') - await self.data_exchanger.download_model(self.active_training.training_folder, self.active_training.context, model_id, self.model_format) - shutil.move(f'{self.active_training.training_folder}/model.json', - f'{self.active_training.training_folder}/base_model.json') + await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, model_id, self.model_format) + shutil.move(f'{self.training.training_folder}/model.json', + f'{self.training.training_folder}/base_model.json') else: - logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download') + logging.info( + f'base_model_id {model_id} is not a valid uuid4 (or no base model was not provided), skipping download') - async def _sync_confusion_matrix(self): - '''NOTE: This stage sets the errors explicitly because it may be used inside the training stage.''' + async def _sync_confusion_matrix(self) -> None: + """Syncronizes the confusion matrix with the Learning Loop via the update_training endpoint. + NOTE: This stage sets the errors explicitly because it may be used inside the training stage. 
+ """ error_key = 'sync_confusion_matrix' try: new_best_model = self.get_new_best_model() - if new_best_model and self.active_training.data: - new_training = TrainingOut(trainer_id=self.node_uuid, + if new_best_model and self.training.data: + new_training = TrainingOut(trainer_id=self.node.uuid, confusion_matrix=new_best_model.confusion_matrix, - train_image_count=self.active_training.data.train_image_count(), - test_image_count=self.active_training.data.test_image_count(), + train_image_count=self.training.data.train_image_count(), + test_image_count=self.training.data.test_image_count(), hyperparameters=self.hyperparameters) await asyncio.sleep(0.1) # NOTE needed for tests. - result = await self.sio_client.call('update_training', ( - self.active_training.context.organization, self.active_training.context.project, jsonable_encoder(new_training))) + result = await self.node.sio_client.call('update_training', ( + self.training.context.organization, self.training.context.project, jsonable_encoder(new_training))) if isinstance(result, dict) and result['success']: - logging.info(f'successfully updated training {asdict(new_training)}') + logging.info( + f'successfully updated training {asdict(new_training)}') self.on_model_published(new_best_model) else: - raise Exception(f'Error for update_training: Response from loop was : {result}') + raise Exception( + f'Error for update_training: Response from loop was : {result}') except Exception as e: logging.exception('Error during confusion matrix syncronization') self.errors.set(error_key, str(e)) raise self.errors.reset(error_key) - async def _upload_model(self) -> None | bool: - """Returns True if the training should be cleaned up.""" - - new_model_id = await self._upload_model_return_new_model_uuid(self.active_training.context) + async def _upload_model(self) -> None: + """Uploads the latest model to the Learning Loop. 
+ """ + new_model_id = await self._upload_model_return_new_model_uuid(self.training.context) if new_model_id is None: - self.active_training.training_state = TrainerState.ReadyForCleanup + self.training.training_state = TrainerState.ReadyForCleanup logging.error('could not upload model - maybe training failed.. cleaning up') - return True logging.info(f'Successfully uploaded model and received new model id: {new_model_id}') - self.active_training.model_id_for_detecting = new_model_id - return None + self.training.model_id_for_detecting = new_model_id async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]: """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file. @@ -213,6 +345,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona The conversion from .wts to .engine is done by the detector (needs to be done on target hardware). Note that trainer may train with different classes, which is why we send an initial model.json file. """ + # NOTE: I guess this is in executor because originally the conversion happened here.. 
files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files) if files is None: return None @@ -225,10 +358,10 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona model_uuid = None for file_format in [f for f in files if f not in already_uploaded_formats]: - _files = files[file_format] + [self.dump_categories_to_json()] + _files = files[file_format] + [self._dump_categories_to_json()] assert len([f for f in _files if 'model.json' in f]) == 1, "model.json must be included exactly once" - model_uuid = await self.data_exchanger.upload_model_get_uuid(context, _files, self.active_training.training_number, file_format) + model_uuid = await self.node.data_exchanger.upload_model_get_uuid(context, _files, self.training.training_number, file_format) if model_uuid is None: return None @@ -237,20 +370,23 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona return model_uuid - def dump_categories_to_json(self) -> str: + def _dump_categories_to_json(self) -> str: + """Dumps the categories to a json file and returns the path to the file. + """ content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None json_path = '/tmp/model.json' with open(json_path, 'w') as f: json.dump(content, f) return json_path - async def clear_training(self): + async def _clear_training(self): + """Clears the training data after a training has finished. 
+ """ self.active_training_io.delete_detections() self.active_training_io.delete_detection_upload_progress() self.active_training_io.delete_detections_upload_file_index() - await self.clear_training_data(self.active_training.training_folder) + await self.clear_training_data(self.training.training_folder) self.last_training_io.delete() - # self.training.training_state = TrainingState.TrainingFinished await self.node.send_status() self._training = None @@ -258,7 +394,9 @@ async def clear_training(self): # ---------------------------------------- OTHER METHODS ---------------------------------------- def may_restart(self) -> None: - if self.restart_after_training: + """If the environment variable RESTART_AFTER_TRAINING is set, the trainer will restart after a training. + """ + if self._environment_vars.restart_after_training: logging.info('restarting') sys.exit(0) else: @@ -269,49 +407,64 @@ async def on_shutdown(self) -> None: await self.stop() await self.stop() - # ---------------------------------------- ABSTRACT PROPERTIES ---------------------------------------- - - @property - @abstractmethod - def training_progress(self) -> Optional[float]: - """Represents the training progress.""" - raise NotImplementedError + async def stop(self): + """Stops the training process by canceling training task. + """ + if not self.training_active: + return + if self.training_task: + logging.info('cancelling training task') + if self.training_task.cancel(): + try: + await self.training_task + except asyncio.CancelledError: + pass + logging.info('cancelled training task') + self.may_restart() # ---------------------------------------- ABSTRACT METHODS ---------------------------------------- @abstractmethod async def _train(self) -> None: - '''Should be used to execute a training. + """Should be used to execute a training. + At this point, images are already downloaded to the images_folder and annotations are saved in training.data.image_data. 
+ If a training is continued, the model is already downloaded. The model should be synchronized with the Learning Loop via self._sync_confusion_matrix() every now and then. - asyncio.CancelledError should be catched and re-raised.''' + asyncio.CancelledError should be catched and re-raised. + """ + raise NotImplementedError @abstractmethod async def _do_detections(self) -> None: - '''Should be used to execute detections. + """Should be used to execute detections. active_training_io.save_detections(...) should be used to store the detections. - asyncio.CancelledError should be catched and re-raised.''' + asyncio.CancelledError should be catched and re-raised. + """ + raise NotImplementedError @abstractmethod def get_new_best_model(self) -> Optional[BasicModel]: - '''Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe. + """Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe. Returns None if no new model could be found. Otherwise BasicModel(confusion_matrix, meta_information). `confusion_matrix` contains a dict of all classes: - The classes must be identified by their id, not their name. - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives). `meta_information` can hold any data which is helpful for self.on_model_published to store weight file etc for later upload via self.get_model_files - ''' + """ + raise NotImplementedError @abstractmethod def on_model_published(self, basic_model: BasicModel) -> None: - '''Called after a BasicModel has been successfully send to the Learning Loop. - The files for this model should be stored. - self.get_latest_model_files is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop. 
- In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model - ''' + """Called after the confusion matrix corresponding to BasicModel has been successfully send to the Learning Loop. + The respective files for this model should be stored so they can be later uploaded in get_latest_model_files. + """ + raise NotImplementedError @abstractmethod def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]: - '''Called when the Learning Loop requests to backup the latest model for the training. + """Called when the Learning Loop requests to backup the latest model for the training. + This function is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop. + In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model Should return a list of file paths which describe the model. These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.) and will be stored in the Learning Loop unter the format of this trainer. @@ -319,9 +472,13 @@ def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str For example "model.pt" for pytorch or "model.weights" for darknet/yolo. If a trainer can also generate other formats (for example for an detector), - a dictionary mapping format -> list of files can be returned.''' + a dictionary mapping format -> list of files can be returned. + """ + raise NotImplementedError @abstractmethod async def clear_training_data(self, training_folder: str) -> None: - '''Called after a training has finished. Deletes all data that is not needed anymore after a training run. - This can be old weightfiles or any additional files.''' + """Called after a training has finished. 
Deletes all data that is not needed anymore after a training run. + This can be old weightfiles or any additional files. + """ + raise NotImplementedError diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py index c87124c1..6112d449 100644 --- a/learning_loop_node/trainer/trainer_node.py +++ b/learning_loop_node/trainer/trainer_node.py @@ -9,12 +9,12 @@ from ..node import Node from .io_helpers import LastTrainingIO from .rest import backdoor_controls, controls -from .trainer_logic_abstraction import TrainerLogicAbstraction +from .trainer_logic_generic import TrainerLogicGeneric class TrainerNode(Node): - def __init__(self, name: str, trainer_logic: TrainerLogicAbstraction, uuid: Optional[str] = None, use_backdoor_controls: bool = False): + def __init__(self, name: str, trainer_logic: TrainerLogicGeneric, uuid: Optional[str] = None, use_backdoor_controls: bool = False): super().__init__(name, uuid, 'trainer') trainer_logic._node = self self.trainer_logic = trainer_logic diff --git a/mock_trainer/app_code/progress_simulator.py b/mock_trainer/app_code/progress_simulator.py index 6eaf5ced..042a0b29 100644 --- a/mock_trainer/app_code/progress_simulator.py +++ b/mock_trainer/app_code/progress_simulator.py @@ -10,8 +10,8 @@ def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) - return None confusion_matrix = {} - assert trainer.active_training.data is not None - for category in trainer.active_training.data.categories: + assert trainer.training.data is not None + for category in trainer.training.data.categories: try: minimum = latest_known_confusion_matrix[category.id]['tp'] except Exception: diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py index 42fbfe8b..5b5aa461 100644 --- a/mock_trainer/app_code/tests/test_detections.py +++ b/mock_trainer/app_code/tests/test_detections.py @@ -29,7 +29,7 @@ async def test_all(setup_test_project1, glc: 
LoopCommunicator): # pylint: disab 'flip_rl': False, 'flip_ud': False} trainer._node = node # pylint: disable=protected-access - trainer.init_new_training(context=context, details=details) + trainer._init_new_training(context=context, details=details) project_folder = create_project_folder(context) training = generate_training(project_folder, context) diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py index f20797b0..fecbe868 100644 --- a/mock_trainer/app_code/tests/test_mock_trainer.py +++ b/mock_trainer/app_code/tests/test_mock_trainer.py @@ -37,6 +37,6 @@ async def test_get_new_model(setup_test_project2): project_folder="", images_folder="", training_folder="",) - mock_trainer.active_training.data = TrainingData(image_data=[], categories=[]) + mock_trainer.training.data = TrainingData(image_data=[], categories=[]) model = mock_trainer.get_new_best_model() assert model is not None From 151393c63bb9e4f88f773e59ff5ae7ec85b647af Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Fri, 22 Mar 2024 20:45:15 +0100 Subject: [PATCH 20/62] fix all mypi and linting issues --- .vscode/settings.json | 11 +++- .../annotation/annotator_logic.py | 4 +- learning_loop_node/data_classes/__init__.py | 4 +- learning_loop_node/data_classes/detections.py | 9 +++- learning_loop_node/data_classes/general.py | 4 -- learning_loop_node/data_classes/training.py | 22 ++++---- learning_loop_node/data_exchanger.py | 10 ++-- learning_loop_node/detector/__init__.py | 1 - learning_loop_node/detector/detector_node.py | 4 +- .../inbox_filter/cam_observation_history.py | 11 ++-- learning_loop_node/detector/outbox.py | 1 - learning_loop_node/detector/tests/conftest.py | 1 - .../tests/test_client_communication.py | 4 +- .../detector/tests/test_outbox.py | 2 + learning_loop_node/globals.py | 4 +- .../helpers/gdrive_downloader.py | 2 +- learning_loop_node/helpers/misc.py | 24 +++------ learning_loop_node/loop_communication.py | 22 ++++---- learning_loop_node/node.py | 2 +- learning_loop_node/tests/test_helper.py | 1 - learning_loop_node/trainer/executor.py | 2 +- learning_loop_node/trainer/io_helpers.py | 10 ++-- .../trainer/rest/backdoor_controls.py | 1 - learning_loop_node/trainer/rest/controls.py | 2 + learning_loop_node/trainer/tests/conftest.py | 19 ++----- .../tests/states/test_state_cleanup.py | 4 +- .../tests/states/test_state_detecting.py | 3 +- .../states/test_state_download_train_model.py | 2 + .../tests/states/test_state_prepare.py | 1 + .../test_state_sync_confusion_matrix.py | 2 + .../trainer/tests/states/test_state_train.py | 10 ++-- .../states/test_state_upload_detections.py | 1 + .../tests/states/test_state_upload_model.py | 1 + .../trainer/tests/test_errors.py | 2 + .../trainer/tests/testing_trainer_logic.py | 20 +++---- learning_loop_node/trainer/trainer_logic.py | 14 ++--- .../trainer/trainer_logic_generic.py | 52 +++++++++--------- .../trainer/training_syncronizer.py | 53 ------------------- mock_trainer/app_code/mock_trainer_logic.py 
| 51 +++++++++--------- mock_trainer/app_code/progress_simulator.py | 6 +-- .../app_code/tests/test_mock_trainer.py | 4 +- 41 files changed, 173 insertions(+), 230 deletions(-) delete mode 100644 learning_loop_node/trainer/training_syncronizer.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 45eb6e46..ff950a35 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -9,10 +9,19 @@ "--disable=C0111", // Missing docstring (in function/class/method) "--disable=C0114", // Missing module docstring "--disable=C0301", // Line too long (exceeds character limit) + "--disable=W0511", // TODO/FIXME not being used "--disable=W0718", // Catching too general exception "--disable=W0719", // Raising too general exception "--disable=W1203", // Use % formatting in logging functions and pass the % parameters as arguments - "--disable=W1514" // Using open without explicitly specifying an encoding + "--disable=W1514", // Using open without explicitly specifying an encoding + "--disable=R0902", // Too many instance attributes + "--disable=R0903", // Too few public methods + "--disable=R0912", // Too many branches + "--disable=R0913", // Too many arguments + "--disable=R0914", // Too many local variables + "--disable=R0915", // Too many statements + "--disable=R1732", // Consider using with for resource-allocating operations + "--disable=R0801" // Similar lines in 2 files ], "[python]": { "editor.defaultFormatter": "ms-python.autopep8", diff --git a/learning_loop_node/annotation/annotator_logic.py b/learning_loop_node/annotation/annotator_logic.py index 932abce9..a80cc13b 100644 --- a/learning_loop_node/annotation/annotator_logic.py +++ b/learning_loop_node/annotation/annotator_logic.py @@ -7,10 +7,10 @@ class AnnotatorLogic(): - def __init__(self): + def __init__(self) -> None: self._node: Optional[Node] = None - def init(self, node: Node): + def init(self, node: Node) -> None: self._node = node @abstractmethod diff --git 
a/learning_loop_node/data_classes/__init__.py b/learning_loop_node/data_classes/__init__.py index 0e0a10e9..32188896 100644 --- a/learning_loop_node/data_classes/__init__.py +++ b/learning_loop_node/data_classes/__init__.py @@ -4,5 +4,5 @@ from .general import (AnnotationNodeStatus, Category, CategoryType, Context, DetectionStatus, ErrorConfiguration, ModelInformation, NodeState, NodeStatus) from .socket_response import SocketResponse -from .training import (BasicModel, Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData, - TrainingError, TrainingOut, TrainingStatus) +from .training import (Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData, + TrainingError, TrainingOut, TrainingStateData, TrainingStatus) diff --git a/learning_loop_node/data_classes/detections.py b/learning_loop_node/data_classes/detections.py index 21924720..0872b256 100644 --- a/learning_loop_node/data_classes/detections.py +++ b/learning_loop_node/data_classes/detections.py @@ -13,8 +13,11 @@ @dataclass(**KWONLY_SLOTS) class BoxDetection(): + """Coordinates according to COCO format. x,y is the top left corner of the box. + x increases to the right, y increases downwards. + """ category_name: str - x: int # TODO add definition of x,y,w,h + x: int y: int width: int height: int @@ -47,6 +50,8 @@ def __str__(self): @dataclass(**KWONLY_SLOTS) class PointDetection(): + """Coordinates according to COCO format. x,y is the center of the point. 
+ x increases to the right, y increases downwards.""" category_name: str x: float y: float @@ -111,7 +116,7 @@ class Detections(): point_detections: List[PointDetection] = field(default_factory=list) segmentation_detections: List[SegmentationDetection] = field(default_factory=list) classification_detections: List[ClassificationDetection] = field(default_factory=list) - tags: Optional[List[str]] = field(default_factory=list) + tags: List[str] = field(default_factory=list) date: Optional[str] = field(default_factory=current_datetime) image_id: Optional[str] = None # used for detection of trainers diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py index 9d5c893e..41141395 100644 --- a/learning_loop_node/data_classes/general.py +++ b/learning_loop_node/data_classes/general.py @@ -34,10 +34,6 @@ def from_list(values: List[dict]) -> List['Category']: return [from_dict(data_class=Category, data=value) for value in values] -def create_category(identifier: str, name: str, ctype: Union[CategoryType, str]): # TODO: This is probably unused - return Category(id=identifier, name=name, description='', hotkey='', color='', type=ctype, point_size=None) - - @dataclass(**KWONLY_SLOTS) class Context(): organization: str diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index 2ce1c95b..4df5a289 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -72,7 +72,7 @@ class TrainerState(str, Enum): @dataclass(**KWONLY_SLOTS) class TrainingStatus(): - id: str # TODO this must not be changed, but tests wont detect it -> update tests! + id: str # NOTE this must not be changed, but tests wont detect a change -> update tests! 
name: str state: Optional[str] errors: Optional[Dict] @@ -87,7 +87,7 @@ class TrainingStatus(): architecture: Optional[str] = None context: Optional[Context] = None - def short_str(self): + def short_str(self) -> str: prgr = f'{self.progress * 100:.0f}%' if self.progress else '' trtesk = f'{self.train_image_count}/{self.test_image_count}/{self.skipped_image_count}' if self.train_image_count else 'n.a.' cntxt = f'{self.context.organization}/{self.context.project}' if self.context else '' @@ -106,14 +106,14 @@ class Training(): training_folder: str # f'{project_folder}/trainings/{trainings_id}' start_time: float = field(default_factory=time.time) - base_model_id: Optional[str] = None # model uuid to download into base_model.json + base_model_id: Optional[str] = None # model uuid to download (to continue training) data: Optional[TrainingData] = None training_number: Optional[int] = None training_state: Optional[str] = None model_id_for_detecting: Optional[str] = None hyperparameters: Optional[Dict] = None - def set_values_from_data(self, data: Dict): + def set_values_from_data(self, data: Dict) -> None: self.data = TrainingData(categories=Category.from_list(data['categories'])) self.data.hyperparameter = Hyperparameter.from_data(data=data) self.training_number = data['training_number'] @@ -123,7 +123,7 @@ def set_values_from_data(self, data: Dict): @dataclass(**KWONLY_SLOTS) class TrainingOut(): - confusion_matrix: Optional[Dict] = None + confusion_matrix: Optional[Dict] = None # This is actually just class-wise metrics train_image_count: Optional[int] = None test_image_count: Optional[int] = None trainer_id: Optional[str] = None @@ -131,8 +131,8 @@ class TrainingOut(): @dataclass(**KWONLY_SLOTS) -class BasicModel(): - confusion_matrix: Optional[Dict] = None +class TrainingStateData(): + confusion_matrix: Optional[Dict] = None # This is actually just class-wise metrics meta_information: Optional[Dict] = None @@ -148,8 +148,8 @@ class Model(): class Errors(): - def 
__init__(self): - self._errors: Dict = {} + def __init__(self) -> None: + self._errors: Dict[str, str] = {} def set(self, key: str, value: str): self._errors[key] = value @@ -158,7 +158,7 @@ def set(self, key: str, value: str): def errors(self) -> Dict: return self._errors - def reset(self, key: str): + def reset(self, key: str) -> None: try: del self._errors[key] except AttributeError: @@ -166,7 +166,7 @@ def reset(self, key: str): except KeyError: pass - def reset_all(self): + def reset_all(self) -> None: self._errors = {} def has_error_for(self, key: str) -> bool: diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index 840a0fe9..8a5633d4 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -161,8 +161,8 @@ async def upload_model_get_uuid(self, context: Context, files: List[str], traini f'---- could not upload model for training {training_number} and format {mformat}. Details: {response.text}') response.raise_for_status() return None - else: - uploaded_model = response.json() - logging.info( - f'---- uploaded model for training {training_number} and format {mformat}. Model id is {uploaded_model}') - return uploaded_model['id'] + + uploaded_model = response.json() + logging.info( + f'---- uploaded model for training {training_number} and format {mformat}. 
Model id is {uploaded_model}') + return uploaded_model['id'] diff --git a/learning_loop_node/detector/__init__.py b/learning_loop_node/detector/__init__.py index 8b137891..e69de29b 100644 --- a/learning_loop_node/detector/__init__.py +++ b/learning_loop_node/detector/__init__.py @@ -1 +0,0 @@ - diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py index 18b8ab6c..db657698 100644 --- a/learning_loop_node/detector/detector_node.py +++ b/learning_loop_node/detector/detector_node.py @@ -186,7 +186,9 @@ async def _check_for_update(self) -> None: if not update_to_model_id: self.log.info('could not check for updates') return - if self.detector_logic.is_initialized: # TODO: solve race condition !!! + + # TODO: solve race condition (it should not be required to recheck if model_info is not None, but it is!) + if self.detector_logic.is_initialized: model_info = self.detector_logic._model_info # pylint: disable=protected-access if model_info is not None: self.log.info(f'Current model: {model_info.version} with id {model_info.id}') diff --git a/learning_loop_node/detector/inbox_filter/cam_observation_history.py b/learning_loop_node/detector/inbox_filter/cam_observation_history.py index 88bbe881..a87c72ee 100644 --- a/learning_loop_node/detector/inbox_filter/cam_observation_history.py +++ b/learning_loop_node/detector/inbox_filter/cam_observation_history.py @@ -1,20 +1,17 @@ import os from typing import List, Union -from learning_loop_node.data_classes import (BoxDetection, - ClassificationDetection, - Detections, Observation, - PointDetection, - SegmentationDetection) +from learning_loop_node.data_classes import (BoxDetection, ClassificationDetection, Detections, Observation, + PointDetection, SegmentationDetection) class CamObservationHistory: - def __init__(self): + def __init__(self) -> None: self.reset_time = 3600 self.recent_observations: List[Observation] = [] self.iou_threshold = 0.5 - def forget_old_detections(self): + 
def forget_old_detections(self) -> None: self.recent_observations = [detection for detection in self.recent_observations if not detection.is_older_than(self.reset_time)] diff --git a/learning_loop_node/detector/outbox.py b/learning_loop_node/detector/outbox.py index 23138c85..ca1a200d 100644 --- a/learning_loop_node/detector/outbox.py +++ b/learning_loop_node/detector/outbox.py @@ -53,7 +53,6 @@ def save(self, image: bytes, detections: Optional[Detections] = None, tags: Opti with open(tmp + '/image.json', 'w') as f: json.dump(jsonable_encoder(asdict(detections)), f) - # TODO sometimes No such file or directory: '/tmp/learning_loop_lib_data/tmp/2023-09-07_13:27:38.399/image.jpg' with open(tmp + '/image.jpg', 'wb') as f: f.write(image) diff --git a/learning_loop_node/detector/tests/conftest.py b/learning_loop_node/detector/tests/conftest.py index ad183fe2..1611f265 100644 --- a/learning_loop_node/detector/tests/conftest.py +++ b/learning_loop_node/detector/tests/conftest.py @@ -12,7 +12,6 @@ import uvicorn from learning_loop_node import DetectorNode -from learning_loop_node.data_classes.general import Category, ModelInformation from learning_loop_node.detector.outbox import Outbox from learning_loop_node.globals import GLOBALS diff --git a/learning_loop_node/detector/tests/test_client_communication.py b/learning_loop_node/detector/tests/test_client_communication.py index 97e3f074..24fbd095 100644 --- a/learning_loop_node/detector/tests/test_client_communication.py +++ b/learning_loop_node/detector/tests/test_client_communication.py @@ -2,7 +2,7 @@ import json import pytest -import requests # type: ignore +import requests from learning_loop_node import DetectorNode from learning_loop_node.data_classes import ModelInformation @@ -101,4 +101,4 @@ async def test_about_endpoint(test_detector_node: DetectorNode): assert response_dict['operation_mode'] == 'idle' assert response_dict['state'] == 'online' assert response_dict['target_model'] == '1.1' - assert any([c.name == 
'purple point' for c in model_information.categories]) + assert any(c.name == 'purple point' for c in model_information.categories) diff --git a/learning_loop_node/detector/tests/test_outbox.py b/learning_loop_node/detector/tests/test_outbox.py index 9db7dd09..adf56744 100644 --- a/learning_loop_node/detector/tests/test_outbox.py +++ b/learning_loop_node/detector/tests/test_outbox.py @@ -9,6 +9,8 @@ from learning_loop_node.detector.detector_node import DetectorNode from learning_loop_node.detector.outbox import Outbox +# pylint: disable=redefined-outer-name + @pytest.fixture() def test_outbox(): diff --git a/learning_loop_node/globals.py b/learning_loop_node/globals.py index eee9511a..336df3fa 100644 --- a/learning_loop_node/globals.py +++ b/learning_loop_node/globals.py @@ -1,8 +1,8 @@ class Globals(): - def __init__(self): + def __init__(self) -> None: self.data_folder: str = '/data' - self.detector_port: int = 5004 # TODO move to tests + self.detector_port: int = 5004 # NOTE used for tests GLOBALS = Globals() diff --git a/learning_loop_node/helpers/gdrive_downloader.py b/learning_loop_node/helpers/gdrive_downloader.py index 8e5b3120..deefed68 100755 --- a/learning_loop_node/helpers/gdrive_downloader.py +++ b/learning_loop_node/helpers/gdrive_downloader.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -import requests +import requests # type: ignore # https://stackoverflow.com/a/39225272/4082686 diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py index 5b996092..0f75509b 100644 --- a/learning_loop_node/helpers/misc.py +++ b/learning_loop_node/helpers/misc.py @@ -55,7 +55,7 @@ def _handle_task_result(task: asyncio.Task, logger.exception(message, *message_args) -def get_free_memory_mb() -> float: # TODO check if this is used +def get_free_memory_mb() -> float: # NOTE used by yolov5 pynvml.nvmlInit() h = pynvml.nvmlDeviceGetHandleByIndex(0) info = pynvml.nvmlDeviceGetMemoryInfo(h) @@ -89,15 +89,7 @@ async def 
delete_corrupt_images(image_folder: str, check_jpeg: bool = False) -> def create_resource_paths(organization_name: str, project_name: str, image_ids: List[str]) -> Tuple[List[str], List[str]]: - # TODO: experimental: return [f'/{organization_name}/projects/{project_name}/images/{id}/main' for id in image_ids], image_ids - # if not image_ids: - # return [], [] - # url_ids: List[Tuple(str, str)] = [(f'/{organization_name}/projects/{project_name}/images/{id}/main', id) - # for id in image_ids] - # urls, ids = list(map(list, zip(*url_ids))) - - # return urls, ids def create_image_folder(project_folder: str) -> str: @@ -140,17 +132,17 @@ async def wrapper_ensure_socket_response(*args, **kwargs): if isinstance(value, str): return asdict(SocketResponse.for_success(value)) - elif isinstance(value, bool): + if isinstance(value, bool): return asdict(SocketResponse.from_bool(value)) - elif isinstance(value, SocketResponse): + if isinstance(value, SocketResponse): return value - elif (args[0] in ['connect', 'disconnect', 'connect_error']): + if (args[0] in ['connect', 'disconnect', 'connect_error']): return value - elif value is None: + if value is None: return None - else: - raise Exception( - f"Return type for sio must be str, bool, SocketResponse or None', but was {type(value)}'") + + raise Exception( + f"Return type for sio must be str, bool, SocketResponse or None', but was {type(value)}'") except Exception as e: logging.exception(f'An error occured for {args[0]}') diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index 75c57189..99d9f70b 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -80,8 +80,15 @@ async def put(self, path, files: Optional[List[str]] = None, requires_login=True if files is None: return await self.async_client.put(api_prefix+path, **kwargs) - file_list = [('files', open(f, 'rb')) for f in files] # TODO: does this properly close the files after upload? 
- return await self.async_client.put(api_prefix+path, files=file_list) + file_handles = [open(f, 'rb') for f in files] # Open files and store handles + try: + file_list = [('files', fh) for fh in file_handles] # Use file handles + response = await self.async_client.put(api_prefix+path, files=file_list) + finally: + for fh in file_handles: + fh.close() # Ensure all files are closed + + return response async def post(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response: if requires_login: @@ -92,14 +99,3 @@ async def delete(self, path, requires_login=True, api_prefix='/api', **kwargs) - if requires_login: await self.ensure_login() return await self.async_client.delete(api_prefix+path, **kwargs) - - # --------------------------------- unused?! --------------------------------- #TODO remove? - - # def get_data(self, path): - # return asyncio.get_event_loop().run_until_complete(self._get_data_async(path)) - - # async def _get_data_async(self, path) -> bytes: - # response = await self.get(f'{self.project_path}{path}') - # if response.status_code != 200: - # raise LoopCommunicationException('bad response: ' + str(response)) - # return response.content diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py index 38742fa4..5424c110 100644 --- a/learning_loop_node/node.py +++ b/learning_loop_node/node.py @@ -62,7 +62,7 @@ def sio_client(self) -> AsyncClient: # --------------------------------------------------- APPLICATION LIFECYCLE --------------------------------------------------- @asynccontextmanager - async def lifespan(self, app: FastAPI): + async def lifespan(self, app: FastAPI): # pylint: disable=unused-argument try: await self._on_startup() self.repeat_task = asyncio.create_task(self.repeat_loop()) diff --git a/learning_loop_node/tests/test_helper.py b/learning_loop_node/tests/test_helper.py index e802c7a0..c52037ed 100644 --- a/learning_loop_node/tests/test_helper.py +++ b/learning_loop_node/tests/test_helper.py @@ -9,7 +9,6 
@@ from learning_loop_node.data_classes import Context from learning_loop_node.helpers.misc import create_image_folder, create_project_folder, create_training_folder from learning_loop_node.loop_communication import LoopCommunicator -from learning_loop_node.trainer.trainer_logic import TrainerLogic def get_files_in_folder(folder: str): diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py index c768332c..628ef022 100644 --- a/learning_loop_node/trainer/executor.py +++ b/learning_loop_node/trainer/executor.py @@ -11,7 +11,7 @@ def create_signal_handler(sig=signal.SIGTERM): - if platform == "linux" or platform == "linux2": + if platform in ('linux', 'linux2'): # "The system will send a signal to the child once the parent exits for any reason (even sigkill)." # https://stackoverflow.com/a/19448096 libc = ctypes.CDLL("libc.so.6") diff --git a/learning_loop_node/trainer/io_helpers.py b/learning_loop_node/trainer/io_helpers.py index 453add80..1ae3bd43 100644 --- a/learning_loop_node/trainer/io_helpers.py +++ b/learning_loop_node/trainer/io_helpers.py @@ -174,9 +174,9 @@ async def _upload_detections(self, context: Context, batch_detections: List[Dete msg = f'could not upload detections. 
{str(response)}' logging.error(msg) raise Exception(msg) + + logging.info('successfully uploaded detections') + if up_progress > len(batch_detections): + self.save_detection_upload_progress(0) else: - logging.info('successfully uploaded detections') - if up_progress > len(batch_detections): - self.save_detection_upload_progress(0) - else: - self.save_detection_upload_progress(up_progress) + self.save_detection_upload_progress(up_progress) diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py index a796fc4d..726cdb8e 100644 --- a/learning_loop_node/trainer/rest/backdoor_controls.py +++ b/learning_loop_node/trainer/rest/backdoor_controls.py @@ -5,7 +5,6 @@ from dataclasses import asdict from typing import TYPE_CHECKING, Dict -from dacite import from_dict from fastapi import APIRouter, HTTPException, Request from ...data_classes import ErrorConfiguration, NodeState diff --git a/learning_loop_node/trainer/rest/controls.py b/learning_loop_node/trainer/rest/controls.py index b8fbbec8..6c92d9a8 100644 --- a/learning_loop_node/trainer/rest/controls.py +++ b/learning_loop_node/trainer/rest/controls.py @@ -7,6 +7,8 @@ router = APIRouter() +# pylint: disable=protected-access + @router.post("/controls/detect/{organization}/{project}/{version}") async def operation_mode(organization: str, project: str, version: str, request: Request): diff --git a/learning_loop_node/trainer/tests/conftest.py b/learning_loop_node/trainer/tests/conftest.py index f07af98f..aca1919c 100644 --- a/learning_loop_node/trainer/tests/conftest.py +++ b/learning_loop_node/trainer/tests/conftest.py @@ -10,6 +10,8 @@ from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_node import TrainerNode +# pylint: disable=protected-access + logging.basicConfig(level=logging.INFO) # show ouptut from uvicorn server https://stackoverflow.com/a/66132186/364388 
log_to_stderr(logging.INFO) @@ -24,7 +26,7 @@ async def test_initialized_trainer_node(): trainer = TestingTrainerLogic() node = TrainerNode(name='test', trainer_logic=trainer, uuid='NOD30000-0000-0000-0000-000000000000') - trainer._node = node # pylint: disable=protected-access + trainer._node = node trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'), details={'categories': [], 'id': '917d5c7f-403d-7e92-f95f-577f79c2273a', # version 1.2 of demo project @@ -32,8 +34,6 @@ async def test_initialized_trainer_node(): 'resolution': 800, 'flip_rl': False, 'flip_ud': False}) - - # pylint: disable=protected-access await node._on_startup() yield node await node._on_shutdown() @@ -44,9 +44,9 @@ async def test_initialized_trainer(): trainer = TestingTrainerLogic() node = TrainerNode(name='test', trainer_logic=trainer, uuid='NODE-000-0000-0000-0000-000000000000') - # pylint: disable=protected-access + await node._on_startup() - trainer._node = node # pylint: disable=protected-access + trainer._node = node trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'), details={'categories': [], 'id': '917d5c7f-403d-7e92-f95f-577f79c2273a', # version 1.2 of demo project @@ -54,9 +54,7 @@ async def test_initialized_trainer(): 'resolution': 800, 'flip_rl': False, 'flip_ud': False}) - yield trainer - # await node._on_shutdown() try: await node._on_shutdown() except Exception: @@ -66,10 +64,3 @@ async def test_initialized_trainer(): def is_port_in_use(port): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: return s.connect_ex(('localhost', port)) == 0 - - -# @pytest.fixture(autouse=True, scope='session') -# def initialize_active_training(): -# from learning_loop_node.trainer import active_training_module -# active_training_module.init('00000000-0000-0000-0000-000000000000') -# yield diff --git a/learning_loop_node/trainer/tests/states/test_state_cleanup.py 
b/learning_loop_node/trainer/tests/states/test_state_cleanup.py index 9fbf076d..f3911a54 100644 --- a/learning_loop_node/trainer/tests/states/test_state_cleanup.py +++ b/learning_loop_node/trainer/tests/states/test_state_cleanup.py @@ -1,6 +1,8 @@ from learning_loop_node.trainer.tests.state_helper import create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic +# pylint: disable=protected-access + async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer @@ -18,7 +20,7 @@ async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic await trainer._clear_training() - assert trainer._training is None # pylint: disable=protected-access + assert trainer._training is None assert trainer.node.last_training_io.exists() is False assert trainer.active_training_io.detections_exist() is False assert trainer.active_training_io.detection_upload_progress_exist() is False diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py index efd9b966..40a62e63 100644 --- a/learning_loop_node/trainer/tests/states/test_state_detecting.py +++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py @@ -6,6 +6,7 @@ from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic +# pylint: disable=protected-access error_key = 'detecting' @@ -13,7 +14,7 @@ def trainer_has_error(trainer: TrainerLogic): return trainer.errors.has_error_for(error_key) -async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic): # TODO Flaky test +async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic): # NOTE was a flaky test trainer = test_initialized_trainer create_active_training_file(trainer, training_state='train_model_uploaded', 
model_id_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a') diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py index f5ef302b..1679f70b 100644 --- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py @@ -6,6 +6,8 @@ from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic +# pylint: disable=protected-access + async def test_downloading_is_successful(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer diff --git a/learning_loop_node/trainer/tests/states/test_state_prepare.py b/learning_loop_node/trainer/tests/states/test_state_prepare.py index c6648ea4..d3222f9a 100644 --- a/learning_loop_node/trainer/tests/states/test_state_prepare.py +++ b/learning_loop_node/trainer/tests/states/test_state_prepare.py @@ -5,6 +5,7 @@ from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic +# pylint: disable=protected-access error_key = 'prepare' diff --git a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py index 2fe586aa..6a292be5 100644 --- a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py +++ b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py @@ -10,6 +10,8 @@ from ..state_helper import assert_training_state, create_active_training_file from ..testing_trainer_logic import TestingTrainerLogic +# pylint: disable=protected-access + error_key = 'sync_confusion_matrix' diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py 
b/learning_loop_node/trainer/tests/states/test_state_train.py index 168a81d4..603d18e9 100644 --- a/learning_loop_node/trainer/tests/states/test_state_train.py +++ b/learning_loop_node/trainer/tests/states/test_state_train.py @@ -5,6 +5,8 @@ from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic +# pylint: disable=protected-access + async def test_successful_training(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer @@ -19,7 +21,6 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'start_training' - # pylint: disable=protected-access assert trainer._executor is not None trainer._executor.stop() # NOTE normally a training terminates itself await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) @@ -36,7 +37,7 @@ async def test_stop_running_training(test_initialized_trainer: TestingTrainerLog _ = asyncio.get_running_loop().create_task(trainer._run()) - await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) # pylint: disable=protected-access + await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'start_training' @@ -54,16 +55,15 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain # NOTE e.g. 
when a node-computer is restarted create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) trainer._init_from_last_training() - trainer._can_resume = True # pylint: disable=protected-access + trainer._can_resume = True _ = asyncio.get_running_loop().create_task(trainer._run()) - await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) # pylint: disable=protected-access + await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'resume' - # pylint: disable=protected-access assert trainer._executor is not None trainer._executor.stop() # NOTE normally a training terminates itself e.g await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py index 8567e69d..8918eece 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py @@ -10,6 +10,7 @@ from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic +# pylint: disable=protected-access error_key = 'upload_detections' diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py index ac147065..36c625f4 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py @@ -7,6 +7,7 @@ from 
learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic +# pylint: disable=protected-access error_key = 'upload_model' diff --git a/learning_loop_node/trainer/tests/test_errors.py b/learning_loop_node/trainer/tests/test_errors.py index bdb40c95..507c494a 100644 --- a/learning_loop_node/trainer/tests/test_errors.py +++ b/learning_loop_node/trainer/tests/test_errors.py @@ -5,6 +5,8 @@ from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic +# pylint: disable=protected-access + async def test_training_process_is_stopped_when_trainer_reports_error(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index c7faeca8..188d37d7 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -2,7 +2,7 @@ import time from typing import Dict, List, Optional, Union -from learning_loop_node.data_classes import BasicModel, Context, Detections, ModelInformation, PretrainedModel +from learning_loop_node.data_classes import Context, Detections, ModelInformation, PretrainedModel, TrainingStateData from learning_loop_node.trainer.trainer_logic import TrainerLogic @@ -35,15 +35,17 @@ async def start_training(self, model: str = 'model.model') -> None: assert self._executor is not None self._executor.start('while true; do sleep 1; done') - async def start_training_from_scratch(self, base_model_id: str) -> None: + async def start_training_from_scratch(self) -> None: + base_model_id = self.training.base_model_id + assert base_model_id is not None await self.start_training(model=f'model_{base_model_id}.pt') - def 
get_new_best_model(self) -> Optional[BasicModel]: + def _get_new_best_model(self) -> Optional[TrainingStateData]: if self.has_new_model: - return BasicModel(confusion_matrix={}) + return TrainingStateData(confusion_matrix={}) return None - def on_model_published(self, basic_model: BasicModel) -> None: + def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: pass async def _prepare(self) -> None: @@ -54,9 +56,9 @@ async def _download_model(self) -> None: await super()._download_model() await asyncio.sleep(0.1) # give tests a bit time to to check for the state - async def upload_model(self) -> None: + async def _upload_model(self) -> None: await asyncio.sleep(0.1) # give tests a bit time to to check for the state - await super().upload_model() + await super()._upload_model() await asyncio.sleep(0.1) # give tests a bit time to to check for the state async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]: @@ -66,7 +68,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona assert isinstance(result, str) return result - def get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: + def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: time.sleep(1) # NOTE reduce flakyness in Backend tests du to wrong order of events. 
fake_weight_file = '/tmp/weightfile.weights' with open(fake_weight_file, 'wb') as f: @@ -87,7 +89,7 @@ async def _detect(self, model_information: ModelInformation, images: List[str], detections: List[Detections] = [] return detections - async def clear_training_data(self, training_folder: str) -> None: + async def _clear_training_data(self, training_folder: str) -> None: return def get_executor_error_from_log(self) -> Optional[str]: diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index c5b47df9..ee408cf9 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -66,7 +66,8 @@ async def _train(self) -> None: error = self.get_executor_error_from_log() if error: raise TrainingError(cause=error) - # TODO check if this works: + + # TODO check if this works to catch errors from the executor: # if self.executor.return_code != 0: # self.errors.set(error_key, f'Executor return code was {self.executor.return_code}') # raise TrainingError(cause=f'Executor return code was {self.executor.return_code}') @@ -85,9 +86,7 @@ async def _start_training(self): else: base_model_id = self.training.base_model_id if not is_valid_uuid4(base_model_id): # TODO this check was done earlier! 
- assert isinstance(base_model_id, str) - # TODO this could be removed here and accessed via self.training.base_model_id - self.start_training_task = self.start_training_from_scratch(base_model_id) + self.start_training_task = self.start_training_from_scratch() else: self.start_training_task = self.start_training() await self.start_training_task @@ -146,7 +145,7 @@ async def stop(self) -> None: except asyncio.CancelledError: pass logging.info('cancelled training task') - self.may_restart() + self._may_restart() def get_log(self) -> str: return self.executor.get_log() @@ -158,9 +157,10 @@ async def start_training(self) -> None: '''Should be used to start a training.''' @abstractmethod - async def start_training_from_scratch(self, base_model_id: str) -> None: + async def start_training_from_scratch(self) -> None: '''Should be used to start a training from scratch. - base_model_id is the id of a pretrained model provided by self.provided_pretrained_models.''' + NOTE base_model_id is now accessible via self.training.base_model_id + the id of a pretrained model provided by self.provided_pretrained_models.''' @abstractmethod def can_resume(self) -> bool: diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index d9abff34..a526fa62 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -10,8 +10,8 @@ from fastapi.encoders import jsonable_encoder -from ..data_classes import (BasicModel, Context, Errors, PretrainedModel, TrainerState, Training, TrainingData, - TrainingOut) +from ..data_classes import (Context, Errors, PretrainedModel, TrainerState, Training, TrainingData, TrainingOut, + TrainingStateData) from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4 from .downloader import TrainingsDownloader from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO @@ 
-22,7 +22,7 @@ class TrainerLogicGeneric(ABC): - def __init__(self, model_format: str): + def __init__(self, model_format: str) -> None: # NOTE: model_format is used in the file path for the model on the server: # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' @@ -86,8 +86,7 @@ def state(self) -> str: """ if (not self.training_active) or (self.training.training_state is None): return TrainerState.Idle.value - else: - return self.training.training_state + return self.training.training_state @property def training_uptime(self) -> Optional[float]: @@ -245,7 +244,7 @@ async def _training_loop(self) -> None: await self._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions) elif tstate == TrainerState.ReadyForCleanup: # -> RESTART or TrainingFinished await self._clear_training() - self.may_restart() + self._may_restart() async def _perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False): await asyncio.sleep(0.1) @@ -305,7 +304,7 @@ async def _sync_confusion_matrix(self) -> None: """ error_key = 'sync_confusion_matrix' try: - new_best_model = self.get_new_best_model() + new_best_model = self._get_new_best_model() if new_best_model and self.training.data: new_training = TrainingOut(trainer_id=self.node.uuid, confusion_matrix=new_best_model.confusion_matrix, @@ -319,7 +318,7 @@ async def _sync_confusion_matrix(self) -> None: if isinstance(result, dict) and result['success']: logging.info( f'successfully updated training {asdict(new_training)}') - self.on_model_published(new_best_model) + self._on_metrics_published(new_best_model) else: raise Exception( f'Error for update_training: Response from loop was : {result}') @@ -346,7 +345,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona Note that trainer may train with different 
classes, which is why we send an initial model.json file. """ # NOTE: I guess this is in executor because originally the conversion happened here.. - files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files) + files = await asyncio.get_running_loop().run_in_executor(None, self._get_latest_model_files) if files is None: return None @@ -385,7 +384,7 @@ async def _clear_training(self): self.active_training_io.delete_detections() self.active_training_io.delete_detection_upload_progress() self.active_training_io.delete_detections_upload_file_index() - await self.clear_training_data(self.training.training_folder) + await self._clear_training_data(self.training.training_folder) self.last_training_io.delete() await self.node.send_status() @@ -393,15 +392,6 @@ async def _clear_training(self): # ---------------------------------------- OTHER METHODS ---------------------------------------- - def may_restart(self) -> None: - """If the environment variable RESTART_AFTER_TRAINING is set, the trainer will restart after a training. - """ - if self._environment_vars.restart_after_training: - logging.info('restarting') - sys.exit(0) - else: - logging.info('not restarting') - async def on_shutdown(self) -> None: self.shutdown_event.set() await self.stop() @@ -420,8 +410,16 @@ async def stop(self): except asyncio.CancelledError: pass logging.info('cancelled training task') - self.may_restart() + self._may_restart() + def _may_restart(self) -> None: + """If the environment variable RESTART_AFTER_TRAINING is set, the trainer will restart after a training. 
+ """ + if self._environment_vars.restart_after_training: + logging.info('restarting') + sys.exit(0) + else: + logging.info('not restarting') # ---------------------------------------- ABSTRACT METHODS ---------------------------------------- @abstractmethod @@ -443,9 +441,9 @@ async def _do_detections(self) -> None: raise NotImplementedError @abstractmethod - def get_new_best_model(self) -> Optional[BasicModel]: + def _get_new_best_model(self) -> Optional[TrainingStateData]: """Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe. - Returns None if no new model could be found. Otherwise BasicModel(confusion_matrix, meta_information). + Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information). `confusion_matrix` contains a dict of all classes: - The classes must be identified by their id, not their name. - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives). @@ -454,17 +452,17 @@ def get_new_best_model(self) -> Optional[BasicModel]: raise NotImplementedError @abstractmethod - def on_model_published(self, basic_model: BasicModel) -> None: - """Called after the confusion matrix corresponding to BasicModel has been successfully send to the Learning Loop. + def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: + """Called after the metrics corresponding to TrainingStateData have been successfully send to the Learning Loop. The respective files for this model should be stored so they can be later uploaded in get_latest_model_files. """ raise NotImplementedError @abstractmethod - def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]: + def _get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]: """Called when the Learning Loop requests to backup the latest model for the training. 
This function is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop. - In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model + In the simplest implementation this method just renames the weight file (e.g. stored in TrainingStateData.meta_information) into a file name like latest_published_model Should return a list of file paths which describe the model. These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.) and will be stored in the Learning Loop unter the format of this trainer. @@ -477,7 +475,7 @@ def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str raise NotImplementedError @abstractmethod - async def clear_training_data(self, training_folder: str) -> None: + async def _clear_training_data(self, training_folder: str) -> None: """Called after a training has finished. Deletes all data that is not needed anymore after a training run. This can be old weightfiles or any additional files. 
""" diff --git a/learning_loop_node/trainer/training_syncronizer.py b/learning_loop_node/trainer/training_syncronizer.py deleted file mode 100644 index 97041bb9..00000000 --- a/learning_loop_node/trainer/training_syncronizer.py +++ /dev/null @@ -1,53 +0,0 @@ - -import asyncio -import logging -from dataclasses import asdict -from typing import TYPE_CHECKING - -import socketio -from dacite import from_dict -from fastapi.encoders import jsonable_encoder - -from ..data_classes import TrainingOut -from ..data_classes.socket_response import SocketResponse - -if TYPE_CHECKING: - from .trainer_logic import TrainerLogic - - -class TrainingSyncronizer: - def __init__(self, trainer_node_uuid: str, sio_client: socketio.AsyncClient): - self.trainer_node_uuid = trainer_node_uuid - self.sio_client = sio_client - - async def sync_model(model, current_training): - new_training = TrainingOut( - trainer_id=self.trainer_node_uuid, - confusion_matrix=model.confusion_matrix, - train_image_count=current_training.data.train_image_count(), - test_image_count=current_training.data.test_image_count(), - hyperparameters=trainer.hyperparameters) - - await asyncio.sleep(0.1) # NOTE needed for tests. 
- - result = await self.sio_client.call('update_training', (current_training.context.organization, current_training.context.project, jsonable_encoder(new_training))) - response = from_dict(data_class=SocketResponse, data=result) - - return response - - -async def try_sync_model(mo): - try: - model = trainer.get_new_model() - except Exception as exc: - logging.exception('error while getting new model') - raise Exception(f'Could not get new model: {str(exc)}') from exc - logging.debug(f'new model {model}') - - if model: - response = await sync_model(trainer, trainer_node_uuid, sio_client, model) - - if not response.success: - error_msg = f'Error for update_training: Response from loop was : {asdict(response)}' - logging.error(error_msg) - raise Exception(error_msg) diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py index e88a2de3..3d992f4e 100644 --- a/mock_trainer/app_code/mock_trainer_logic.py +++ b/mock_trainer/app_code/mock_trainer_logic.py @@ -4,9 +4,9 @@ import time from typing import Dict, List, Optional, Union -from learning_loop_node.data_classes import (BasicModel, BoxDetection, CategoryType, ClassificationDetection, - Detections, ErrorConfiguration, ModelInformation, Point, PointDetection, - PretrainedModel, SegmentationDetection, Shape) +from learning_loop_node.data_classes import (BoxDetection, CategoryType, ClassificationDetection, Detections, + ErrorConfiguration, ModelInformation, Point, PointDetection, + PretrainedModel, SegmentationDetection, Shape, TrainingStateData) from learning_loop_node.trainer.trainer_logic import TrainerLogic from . 
import progress_simulator @@ -35,7 +35,7 @@ async def start_training(self) -> None: raise Exception('Could not start training') self.executor.start('while true; do sleep 1; done') - async def start_training_from_scratch(self, base_model_id: str) -> None: + async def start_training_from_scratch(self) -> None: self.current_iteration = 0 self.executor.start('while true; do sleep 1; done') @@ -44,7 +44,7 @@ def get_executor_error_from_log(self) -> Optional[str]: return 'mocked crash' return None - def get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: + def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: if self.error_configuration.save_model: raise Exception() @@ -66,37 +66,34 @@ async def _detect(self, model_information: ModelInformation, images: List[str], for image in images: image_id = image.split('/')[-1].replace('.jpg', '') - box_detections = [] - point_detections = [] - segmentation_detections = [] - classification_detections = [] - det_entry = { - 'image_id': image_id, 'box_detections': box_detections, 'point_detections': point_detections, - 'segmentation_detections': segmentation_detections, - 'classification_detections': classification_detections} + box_detections: List[BoxDetection] = [] + point_detections: List[PointDetection] = [] + segmentation_detections: List[SegmentationDetection] = [] + classification_detections: List[ClassificationDetection] = [] + for c in model_information.categories: if c.type == CategoryType.Box: - d = BoxDetection(category_name=c.name, x=1, y=2, width=30, height=40, - model_name=model_information.version, confidence=.99, category_id=c.id) - box_detections.append(d) + bd = BoxDetection(category_name=c.name, x=1, y=2, width=30, height=40, + model_name=model_information.version, confidence=.99, category_id=c.id) + box_detections.append(bd) elif c.type == CategoryType.Point: - d = PointDetection(category_name=c.name, x=100, y=200, - model_name=model_information.version, confidence=.97, 
category_id=c.id) - point_detections.append(d) + pd = PointDetection(category_name=c.name, x=100, y=200, + model_name=model_information.version, confidence=.97, category_id=c.id) + point_detections.append(pd) elif c.type == CategoryType.Segmentation: - d = SegmentationDetection(category_name=c.name, shape=Shape(points=[Point(x=1, y=2), Point( + sd = SegmentationDetection(category_name=c.name, shape=Shape(points=[Point(x=1, y=2), Point( x=3, y=4)]), model_name=model_information.version, confidence=.96, category_id=c.id) - segmentation_detections.append(d) + segmentation_detections.append(sd) elif c.type == CategoryType.Classification: - d = ClassificationDetection(category_name=c.name, model_name=model_information.version, - confidence=.95, category_id=c.id) - classification_detections.append(d) + cd = ClassificationDetection(category_name=c.name, model_name=model_information.version, + confidence=.95, category_id=c.id) + classification_detections.append(cd) detections.append(Detections(box_detections=box_detections, point_detections=point_detections, segmentation_detections=segmentation_detections, classification_detections=classification_detections, image_id=image_id)) return detections - async def clear_training_data(self, training_folder: str): + async def _clear_training_data(self, training_folder: str): pass @property @@ -111,7 +108,7 @@ def training_progress(self) -> float: print(f'prog. 
is {self.current_iteration} / {self.max_iterations} = {self.current_iteration / self.max_iterations}') return self.current_iteration / self.max_iterations - def get_new_best_model(self) -> Optional[BasicModel]: + def _get_new_best_model(self) -> Optional[TrainingStateData]: logging.warning('get_new_model called') if self.error_configuration.get_new_model: raise Exception('Could not get new model') @@ -120,7 +117,7 @@ def get_new_best_model(self) -> Optional[BasicModel]: self.current_iteration += 1 return progress_simulator.increment_time(self, self.latest_known_confusion_matrix) - def on_model_published(self, basic_model: BasicModel) -> None: + def _on_metrics_published(self, basic_model: TrainingStateData) -> None: assert isinstance(basic_model.confusion_matrix, Dict) self.latest_known_confusion_matrix = basic_model.confusion_matrix diff --git a/mock_trainer/app_code/progress_simulator.py b/mock_trainer/app_code/progress_simulator.py index 042a0b29..76f8be52 100644 --- a/mock_trainer/app_code/progress_simulator.py +++ b/mock_trainer/app_code/progress_simulator.py @@ -1,11 +1,11 @@ import random from typing import Dict, Optional -from learning_loop_node.data_classes import BasicModel +from learning_loop_node.data_classes import TrainingStateData from learning_loop_node.trainer.trainer_logic import TrainerLogic -def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -> Optional[BasicModel]: +def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -> Optional[TrainingStateData]: if not trainer._training or not trainer._training.data: # pylint: disable=protected-access return None @@ -23,7 +23,7 @@ def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) - 'fn': max(random.randint(10-maximum, 10-minimum), 2), } - new_model = BasicModel( + new_model = TrainingStateData( confusion_matrix=confusion_matrix, ) diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py 
b/mock_trainer/app_code/tests/test_mock_trainer.py index fecbe868..9f08b779 100644 --- a/mock_trainer/app_code/tests/test_mock_trainer.py +++ b/mock_trainer/app_code/tests/test_mock_trainer.py @@ -16,7 +16,7 @@ async def create_mock_trainer() -> MockTrainerLogic: async def test_get_model_files(setup_test_project2): mock_trainer = await create_mock_trainer() - files = mock_trainer.get_latest_model_files() + files = mock_trainer._get_latest_model_files() assert isinstance(files, Dict) @@ -38,5 +38,5 @@ async def test_get_new_model(setup_test_project2): images_folder="", training_folder="",) mock_trainer.training.data = TrainingData(image_data=[], categories=[]) - model = mock_trainer.get_new_best_model() + model = mock_trainer._get_new_best_model() assert model is not None From 14a27eab1958e5a2629010c5aaca50cb3e4231d1 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Fri, 22 Mar 2024 20:51:41 +0100 Subject: [PATCH 21/62] solve all linting errors in mock nodes --- mock_detector/app_code/tests/test_detector.py | 2 ++ mock_trainer/app_code/mock_trainer_logic.py | 6 +++--- mock_trainer/app_code/tests/conftest.py | 3 ++- mock_trainer/app_code/tests/test_detections.py | 10 ++++++---- mock_trainer/app_code/tests/test_mock_trainer.py | 3 +++ 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/mock_detector/app_code/tests/test_detector.py b/mock_detector/app_code/tests/test_detector.py index 3d05d99e..75816212 100644 --- a/mock_detector/app_code/tests/test_detector.py +++ b/mock_detector/app_code/tests/test_detector.py @@ -5,6 +5,8 @@ from learning_loop_node.detector.detector_node import DetectorNode from learning_loop_node.globals import GLOBALS +# pylint: disable=unused-argument + @pytest.fixture(scope="session") def event_loop(request): diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py index 3d992f4e..4f0d2708 100644 --- a/mock_trainer/app_code/mock_trainer_logic.py +++ 
b/mock_trainer/app_code/mock_trainer_logic.py @@ -117,9 +117,9 @@ def _get_new_best_model(self) -> Optional[TrainingStateData]: self.current_iteration += 1 return progress_simulator.increment_time(self, self.latest_known_confusion_matrix) - def _on_metrics_published(self, basic_model: TrainingStateData) -> None: - assert isinstance(basic_model.confusion_matrix, Dict) - self.latest_known_confusion_matrix = basic_model.confusion_matrix + def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: + assert isinstance(training_state_data.confusion_matrix, Dict) + self.latest_known_confusion_matrix = training_state_data.confusion_matrix @property def model_architecture(self) -> str: diff --git a/mock_trainer/app_code/tests/conftest.py b/mock_trainer/app_code/tests/conftest.py index 86c62dc2..6c23ca7e 100644 --- a/mock_trainer/app_code/tests/conftest.py +++ b/mock_trainer/app_code/tests/conftest.py @@ -1,5 +1,4 @@ import asyncio -import logging import shutil import pytest @@ -7,6 +6,8 @@ from learning_loop_node.globals import GLOBALS from learning_loop_node.loop_communication import LoopCommunicator +# pylint: disable=redefined-outer-name + @pytest.fixture() async def glc(): diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py index 5b5aa461..7b909db3 100644 --- a/mock_trainer/app_code/tests/test_detections.py +++ b/mock_trainer/app_code/tests/test_detections.py @@ -12,8 +12,10 @@ from ..mock_trainer_logic import MockTrainerLogic +# pylint: disable=protected-access,redefined-outer-name,unused-argument -async def test_all(setup_test_project1, glc: LoopCommunicator): # pylint: disable=unused-argument, redefined-outer-name + +async def test_all(setup_test_project1, glc: LoopCommunicator): assert_image_count(0) assert GLOBALS.data_folder == '/tmp/learning_loop_lib_data' @@ -28,14 +30,14 @@ async def test_all(setup_test_project1, glc: LoopCommunicator): # pylint: disab 'resolution': 800, 'flip_rl': 
False, 'flip_ud': False} - trainer._node = node # pylint: disable=protected-access + trainer._node = node trainer._init_new_training(context=context, details=details) project_folder = create_project_folder(context) training = generate_training(project_folder, context) training.model_id_for_detecting = latest_model_id - trainer._training = training # pylint: disable=protected-access - await trainer._do_detections() # pylint: disable=protected-access + trainer._training = training + await trainer._do_detections() detections = trainer.active_training_io.load_detections() assert_image_count(10) # TODO This assert fails frequently on Drone diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py index 9f08b779..60029db2 100644 --- a/mock_trainer/app_code/tests/test_mock_trainer.py +++ b/mock_trainer/app_code/tests/test_mock_trainer.py @@ -7,6 +7,9 @@ from ..mock_trainer_logic import MockTrainerLogic +# pylint: disable=protected-access +# pylint: disable=unused-argument + async def create_mock_trainer() -> MockTrainerLogic: mock_trainer = MockTrainerLogic(model_format='mocked') From dfcb37f008070e6f4e735173696a825f2f0841af Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Mon, 25 Mar 2024 11:59:23 +0100 Subject: [PATCH 22/62] further improvements of documentation and refactoring --- learning_loop_node/data_classes/__init__.py | 11 +++++++++++ learning_loop_node/trainer/trainer_logic.py | 4 ++-- .../trainer/trainer_logic_generic.py | 19 +++++++++++++------ learning_loop_node/trainer/trainer_node.py | 2 +- 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/learning_loop_node/data_classes/__init__.py b/learning_loop_node/data_classes/__init__.py index 32188896..524cb8bb 100644 --- a/learning_loop_node/data_classes/__init__.py +++ b/learning_loop_node/data_classes/__init__.py @@ -6,3 +6,14 @@ from .socket_response import SocketResponse from .training import (Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData, TrainingError, TrainingOut, TrainingStateData, TrainingStatus) + +__all__ = [ + 'AnnotationData', 'AnnotationEventType', 'SegmentationAnnotation', 'ToolOutput', 'UserInput', + 'BoxDetection', 'ClassificationDetection', 'Detections', 'Observation', 'Point', 'PointDetection', + 'SegmentationDetection', 'Shape', + 'AnnotationNodeStatus', 'Category', 'CategoryType', 'Context', 'DetectionStatus', 'ErrorConfiguration', + 'ModelInformation', 'NodeState', 'NodeStatus', + 'SocketResponse', + 'Errors', 'Hyperparameter', 'Model', 'PretrainedModel', 'TrainerState', 'Training', 'TrainingData', + 'TrainingError', 'TrainingOut', 'TrainingStateData', 'TrainingStatus', +] diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index ee408cf9..35587a97 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -154,11 +154,11 @@ def get_log(self) -> str: @abstractmethod async def start_training(self) -> None: - '''Should be used to start a training.''' + '''Should be used to start a training on executer, e.g. 
self.executor.start(cmd).''' @abstractmethod async def start_training_from_scratch(self) -> None: - '''Should be used to start a training from scratch. + '''Should be used to start a training from scratch on executer, e.g. self.executor.start(cmd). NOTE base_model_id is now accessible via self.training.base_model_id the id of a pretrained model provided by self.provided_pretrained_models.''' diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index a526fa62..0eada30b 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -10,8 +10,8 @@ from fastapi.encoders import jsonable_encoder -from ..data_classes import (Context, Errors, PretrainedModel, TrainerState, Training, TrainingData, TrainingOut, - TrainingStateData) +from ..data_classes import (Context, Errors, Hyperparameter, PretrainedModel, TrainerState, Training, TrainingData, + TrainingOut, TrainingStateData) from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4 from .downloader import TrainingsDownloader from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO @@ -61,6 +61,14 @@ def training(self) -> Training: assert self._training is not None, 'training must be initialized, call `init` first' return self._training + @property + def hyperparameter(self) -> Hyperparameter: + assert self.training_data is not None, 'Training should have data' + assert self.training_data.hyperparameter is not None, 'Training.data should have hyperparameter' + return self.training_data.hyperparameter + + # ---------------------------------------- PROPERTIES ---------------------------------------- + @property def training_data(self) -> Optional[TrainingData]: if self.training_active and self.training.data: @@ -72,7 +80,6 @@ def training_context(self) -> Optional[Context]: if self.training_active: return 
self.training.context return None - # ---------------------------------------- PROPERTIES ---------------------------------------- @property def training_active(self) -> bool: @@ -97,7 +104,7 @@ def training_uptime(self) -> Optional[float]: return None @property - def hyperparameters(self) -> Optional[Dict]: + def hyperparameters_for_state_sync(self) -> Optional[Dict]: """Used in sync_confusion_matrix and send_status to provide information about the training configuration. """ if self._training and self._training.data and self._training.data.hyperparameter: @@ -310,7 +317,7 @@ async def _sync_confusion_matrix(self) -> None: confusion_matrix=new_best_model.confusion_matrix, train_image_count=self.training.data.train_image_count(), test_image_count=self.training.data.test_image_count(), - hyperparameters=self.hyperparameters) + hyperparameters=self.hyperparameters_for_state_sync) await asyncio.sleep(0.1) # NOTE needed for tests. result = await self.node.sio_client.call('update_training', ( @@ -447,7 +454,7 @@ def _get_new_best_model(self) -> Optional[TrainingStateData]: `confusion_matrix` contains a dict of all classes: - The classes must be identified by their id, not their name. - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives). 
- `meta_information` can hold any data which is helpful for self.on_model_published to store weight file etc for later upload via self.get_model_files + `meta_information` can hold any data which is helpful for self._on_metrics_published to store weight file etc for later upload via self.get_model_files """ raise NotImplementedError diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py index 6112d449..f69cf103 100644 --- a/learning_loop_node/trainer/trainer_node.py +++ b/learning_loop_node/trainer/trainer_node.py @@ -84,7 +84,7 @@ async def send_status(self): status.train_image_count = data.train_image_count() status.test_image_count = data.test_image_count() status.skipped_image_count = data.skipped_image_count - status.hyperparameters = self.trainer_logic.hyperparameters + status.hyperparameters = self.trainer_logic.hyperparameters_for_state_sync status.errors = self.trainer_logic.errors.errors status.context = self.trainer_logic.training_context From 70aa44d30400139b7363b8559ae3e7094f122d99 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Mon, 25 Mar 2024 18:03:29 +0100 Subject: [PATCH 23/62] Further refactoring and API improvements --- learning_loop_node/data_classes/general.py | 2 + learning_loop_node/data_classes/training.py | 12 +++- learning_loop_node/tests/test_executor.py | 4 +- learning_loop_node/trainer/executor.py | 34 +++++----- .../trainer/rest/backdoor_controls.py | 4 +- .../trainer/tests/states/test_state_train.py | 4 +- .../trainer/tests/testing_trainer_logic.py | 16 ++--- learning_loop_node/trainer/trainer_logic.py | 63 ++++++++++--------- .../trainer/trainer_logic_generic.py | 22 ++++--- mock_trainer/app_code/mock_trainer_logic.py | 12 ++-- .../app_code/tests/test_mock_trainer.py | 4 +- 11 files changed, 96 insertions(+), 81 deletions(-) diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py index 41141395..5c616841 100644 --- a/learning_loop_node/data_classes/general.py +++ b/learning_loop_node/data_classes/general.py @@ -60,6 +60,8 @@ def context(self): @staticmethod def load_from_disk(model_root_path: str) -> Optional['ModelInformation']: + """Load model.json from model_root_path and return ModelInformation object. 
+ """ model_info_file_path = f'{model_root_path}/model.json' if not os.path.exists(model_info_file_path): logging.warning(f"could not find model information file '{model_info_file_path}'") diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index 4df5a289..c192ba41 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -3,6 +3,7 @@ import time from dataclasses import dataclass, field from enum import Enum +from pathlib import Path from typing import Dict, List, Optional # pylint: disable=no-name-in-module @@ -106,13 +107,18 @@ class Training(): training_folder: str # f'{project_folder}/trainings/{trainings_id}' start_time: float = field(default_factory=time.time) - base_model_id: Optional[str] = None # model uuid to download (to continue training) + # model uuid to download (to continue training) | is '' when training from scratch + base_model_id: Optional[str] = None data: Optional[TrainingData] = None training_number: Optional[int] = None training_state: Optional[str] = None model_id_for_detecting: Optional[str] = None hyperparameters: Optional[Dict] = None + @property + def training_folder_path(self) -> Path: + return Path(self.training_folder) + def set_values_from_data(self, data: Dict) -> None: self.data = TrainingData(categories=Category.from_list(data['categories'])) self.data.hyperparameter = Hyperparameter.from_data(data=data) @@ -132,8 +138,8 @@ class TrainingOut(): @dataclass(**KWONLY_SLOTS) class TrainingStateData(): - confusion_matrix: Optional[Dict] = None # This is actually just class-wise metrics - meta_information: Optional[Dict] = None + confusion_matrix: Dict = field(default_factory=dict) + meta_information: Dict = field(default_factory=dict) @dataclass(**KWONLY_SLOTS) diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py index b661c818..1079ea1c 100644 --- 
a/learning_loop_node/tests/test_executor.py +++ b/learning_loop_node/tests/test_executor.py @@ -32,7 +32,7 @@ def test_executor_lifecycle(): executor.start(cmd) - assert executor.is_process_running() + assert executor.is_running() assert_process_is_running('some_executable.sh') sleep(1) @@ -40,7 +40,7 @@ def test_executor_lifecycle(): executor.stop() - assert not executor.is_process_running() + assert not executor.is_running() sleep(1) assert_process_is_running('some_executable.sh', False) diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py index 628ef022..e8dc66ae 100644 --- a/learning_loop_node/trainer/executor.py +++ b/learning_loop_node/trainer/executor.py @@ -28,33 +28,33 @@ class Executor: def __init__(self, base_path: str) -> None: self.path = base_path os.makedirs(self.path, exist_ok=True) - self.process: Optional[subprocess.Popen[bytes]] = None + self._process: Optional[subprocess.Popen[bytes]] = None - def start(self, cmd: str): + def start(self, cmd: str) -> None: with open(f'{self.path}/last_training.log', 'a') as f: f.write(f'\nStarting executor with command: {cmd}\n') + # pylint: disable=subprocess-popen-preexec-fn - self.process = subprocess.Popen( + self._process = subprocess.Popen( f'cd {self.path}; {cmd} >> last_training.log 2>&1', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, executable='/bin/bash', - preexec_fn=create_signal_handler(), - ) + preexec_fn=create_signal_handler()) - def is_process_running(self): - if self.process is None: + def is_running(self) -> bool: + if self._process is None: return False - if self.process.poll() is not None: + if self._process.poll() is not None: return False try: - psutil.Process(self.process.pid) + psutil.Process(self._process.pid) except psutil.NoSuchProcess: # self.process.terminate() # TODO does this make sense? 
- # self.process = None + self._process = None return False return True @@ -82,24 +82,24 @@ def get_log_by_lines(self, since_last_start=False) -> List[str]: # TODO do not return [] def stop(self): - if self.process is None: + if self._process is None: logging.info('no process running ... nothing to stop') return logging.info('terminating process') try: - os.killpg(os.getpgid(self.process.pid), signal.SIGTERM) + os.killpg(os.getpgid(self._process.pid), signal.SIGTERM) except ProcessLookupError: pass - self.process.terminate() - _, _ = self.process.communicate(timeout=3) + self._process.terminate() + _, _ = self._process.communicate(timeout=3) @property def return_code(self): - if not self.process: + if not self._process: return None - if self.is_process_running(): + if self.is_running(): return None - return self.process.poll() + return self._process.poll() diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py index 726cdb8e..e3b17ed3 100644 --- a/learning_loop_node/trainer/rest/backdoor_controls.py +++ b/learning_loop_node/trainer/rest/backdoor_controls.py @@ -97,7 +97,7 @@ async def add_steps(request: Request): assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic' - if not trainer_logic._executor or not trainer_logic._executor.is_process_running(): # pylint: disable=protected-access + if not trainer_logic._executor or not trainer_logic._executor.is_running(): # pylint: disable=protected-access training = trainer_logic._training # pylint: disable=protected-access logging.error(f'cannot add steps when not running, state: {training.training_state if training else "None"}') raise HTTPException(status_code=409, detail="trainer is not running") @@ -126,7 +126,7 @@ async def kill_process(request: Request): trainer_node = trainer_node_from_request(request) trainer_logic = trainer_node.trainer_logic assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not 
TrainerLogic' - if not trainer_logic._executor or not trainer_logic._executor.is_process_running(): + if not trainer_logic._executor or not trainer_logic._executor.is_running(): raise HTTPException(status_code=409, detail="trainer is not running") trainer_logic._executor.stop() diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py index 603d18e9..3c6b579a 100644 --- a/learning_loop_node/trainer/tests/states/test_state_train.py +++ b/learning_loop_node/trainer/tests/states/test_state_train.py @@ -37,7 +37,7 @@ async def test_stop_running_training(test_initialized_trainer: TestingTrainerLog _ = asyncio.get_running_loop().create_task(trainer._run()) - await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) + await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01) await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'start_training' @@ -59,7 +59,7 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain _ = asyncio.get_running_loop().create_task(trainer._run()) - await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) + await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01) await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None assert trainer.start_training_task.__name__ == 'resume' diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index 188d37d7..d4daaff1 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ 
b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -31,16 +31,16 @@ def provided_pretrained_models(self) -> List[PretrainedModel]: PretrainedModel(name='large', label='Large', description='a large model')] # pylint: disable=unused-argument - async def start_training(self, model: str = 'model.model') -> None: + async def _start_training_from_base_model(self, model: str = 'model.model') -> None: assert self._executor is not None self._executor.start('while true; do sleep 1; done') - async def start_training_from_scratch(self) -> None: + async def _start_training_from_scratch(self) -> None: base_model_id = self.training.base_model_id assert base_model_id is not None - await self.start_training(model=f'model_{base_model_id}.pt') + await self._start_training_from_base_model(model=f'model_{base_model_id}.pt') - def _get_new_best_model(self) -> Optional[TrainingStateData]: + def _get_new_best_training_state(self) -> Optional[TrainingStateData]: if self.has_new_model: return TrainingStateData(confusion_matrix={}) return None @@ -79,11 +79,11 @@ def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: f.write('zweiundvierzig') return {'mocked': [fake_weight_file, more_data_file], 'mocked_2': [fake_weight_file, more_data_file]} - def can_resume(self) -> bool: + def _can_resume(self) -> bool: return self._can_resume - async def resume(self) -> None: - return await self.start_training() + async def _resume(self) -> None: + return await self._start_training_from_base_model() async def _detect(self, model_information: ModelInformation, images: List[str], model_folder: str) -> List[Detections]: detections: List[Detections] = [] @@ -92,5 +92,5 @@ async def _detect(self, model_information: ModelInformation, images: List[str], async def _clear_training_data(self, training_folder: str) -> None: return - def get_executor_error_from_log(self) -> Optional[str]: + def _get_executor_error_from_log(self) -> Optional[str]: return self.error_msg diff --git 
a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 35587a97..985479b8 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -26,15 +26,21 @@ def __init__(self, model_format: str) -> None: self._executor: Optional[Executor] = None self.start_training_task: Optional[Coroutine] = None + # ---------------------------------------- IMPLEMENTED ABSTRACT PROPERTIES ---------------------------------------- + @property def detection_progress(self) -> Optional[float]: return self._detection_progress + # ---------------------------------------- PROPERTIES ---------------------------------------- + @property def executor(self) -> Executor: assert self._executor is not None, 'executor must be set, call `run_training` first' return self._executor + # ---------------------------------------- IMPLEMENTED ABSTRACT MEHTODS ---------------------------------------- + async def _train(self) -> None: previous_state = TrainerState.TrainModelDownloaded error_key = 'run_training' @@ -46,11 +52,11 @@ async def _train(self) -> None: last_sync_time = datetime.now() while True: - if not self.executor.is_process_running(): + if not self.executor.is_running(): break if (datetime.now() - last_sync_time).total_seconds() > 5: last_sync_time = datetime.now() - if self.get_executor_error_from_log(): + if self._get_executor_error_from_log(): break self.errors.reset(error_key) try: @@ -63,34 +69,20 @@ async def _train(self) -> None: else: await asyncio.sleep(0.1) - error = self.get_executor_error_from_log() + error = self._get_executor_error_from_log() if error: raise TrainingError(cause=error) - # TODO check if this works to catch errors from the executor: - # if self.executor.return_code != 0: - # self.errors.set(error_key, f'Executor return code was {self.executor.return_code}') - # raise TrainingError(cause=f'Executor return code was {self.executor.return_code}') + if self.executor.return_code != 
0: # TODO check if this works to catch errors from the executor: + raise TrainingError(cause=f'Executor returned with error code: {self.executor.return_code}') except TrainingError: logging.exception('Error in TrainingProcess') - if self.executor.is_process_running(): + if self.executor.is_running(): self.executor.stop() self.training.training_state = previous_state raise - async def _start_training(self): - self.start_training_task = None # NOTE: this is used i.e. by tests - if self.can_resume(): - self.start_training_task = self.resume() - else: - base_model_id = self.training.base_model_id - if not is_valid_uuid4(base_model_id): # TODO this check was done earlier! - self.start_training_task = self.start_training_from_scratch() - else: - self.start_training_task = self.start_training() - await self.start_training_task - async def _do_detections(self) -> None: context = self.training.context model_id = self.training.model_id_for_detecting @@ -131,11 +123,27 @@ async def _do_detections(self) -> None: batch_detections = await self._detect(model_information, batch_images, tmp_folder) self.active_training_io.save_detections(batch_detections, idx) + # ---------------------------------------- METHODS ---------------------------------------- + + async def _start_training(self): + self.start_training_task = None # NOTE: this is used i.e. by tests + if self._can_resume(): + self.start_training_task = self._resume() + else: + base_model_id = self.training.base_model_id + if not is_valid_uuid4(base_model_id): + self.start_training_task = self._start_training_from_scratch() + else: + self.start_training_task = self._start_training_from_base_model() + await self.start_training_task + + # ---------------------------------------- OVERWRITTEN METHODS ---------------------------------------- + async def stop(self) -> None: """If executor is running, stop it. 
Else cancel training task.""" if not self.training_active: return - if self._executor and self._executor.is_process_running(): + if self._executor and self._executor.is_running(): self.executor.stop() elif self.training_task: logging.info('cancelling training task') @@ -147,32 +155,29 @@ async def stop(self) -> None: logging.info('cancelled training task') self._may_restart() - def get_log(self) -> str: - return self.executor.get_log() - # ---------------------------------------- ABSTRACT METHODS ---------------------------------------- @abstractmethod - async def start_training(self) -> None: + async def _start_training_from_base_model(self) -> None: '''Should be used to start a training on executer, e.g. self.executor.start(cmd).''' @abstractmethod - async def start_training_from_scratch(self) -> None: + async def _start_training_from_scratch(self) -> None: '''Should be used to start a training from scratch on executer, e.g. self.executor.start(cmd). NOTE base_model_id is now accessible via self.training.base_model_id the id of a pretrained model provided by self.provided_pretrained_models.''' @abstractmethod - def can_resume(self) -> bool: + def _can_resume(self) -> bool: '''Override this method to return True if the trainer can resume training.''' @abstractmethod - async def resume(self) -> None: + async def _resume(self) -> None: '''Is called when self.can_resume() returns True. 
One may resume the training on a previously trained model stored by self.on_model_published(basic_model).''' @abstractmethod - def get_executor_error_from_log(self) -> Optional[str]: + def _get_executor_error_from_log(self) -> Optional[str]: '''Should be used to provide error informations to the Learning Loop by extracting data from self.executor.get_log().''' @abstractmethod diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index 0eada30b..e1798c46 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -293,17 +293,17 @@ async def _download_model(self) -> None: """If training is continued, the model is downloaded from the Learning Loop to the training_folder. The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training. """ - model_id = self.training.base_model_id + base_model_id = self.training.base_model_id # TODO this checks if we continue a training -> make more explicit - if model_id and is_valid_uuid4(self.training.base_model_id): + if base_model_id and is_valid_uuid4(self.training.base_model_id): logging.info('loading model from Learning Loop') - logging.info(f'downloading model {model_id} as {self.model_format}') - await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, model_id, self.model_format) + logging.info(f'downloading model {base_model_id} as {self.model_format}') + await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_id, self.model_format) shutil.move(f'{self.training.training_folder}/model.json', f'{self.training.training_folder}/base_model.json') else: logging.info( - f'base_model_id {model_id} is not a valid uuid4 (or no base model was not provided), skipping download') + f'base_model_id {base_model_id} is not a valid uuid4 (or no base model was not 
provided), skipping download') async def _sync_confusion_matrix(self) -> None: """Syncronizes the confusion matrix with the Learning Loop via the update_training endpoint. @@ -311,7 +311,7 @@ async def _sync_confusion_matrix(self) -> None: """ error_key = 'sync_confusion_matrix' try: - new_best_model = self._get_new_best_model() + new_best_model = self._get_new_best_training_state() if new_best_model and self.training.data: new_training = TrainingOut(trainer_id=self.node.uuid, confusion_matrix=new_best_model.confusion_matrix, @@ -441,15 +441,15 @@ async def _train(self) -> None: @abstractmethod async def _do_detections(self) -> None: - """Should be used to execute detections. + """Should be used to infer detections of all images and save them to drive. active_training_io.save_detections(...) should be used to store the detections. asyncio.CancelledError should be catched and re-raised. """ raise NotImplementedError @abstractmethod - def _get_new_best_model(self) -> Optional[TrainingStateData]: - """Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe. + def _get_new_best_training_state(self) -> Optional[TrainingStateData]: + """Is called frequently by `_sync_confusion_matrix` to check if a new "best" model is availabe. Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information). `confusion_matrix` contains a dict of all classes: - The classes must be identified by their id, not their name. @@ -461,6 +461,8 @@ def _get_new_best_model(self) -> Optional[TrainingStateData]: @abstractmethod def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: """Called after the metrics corresponding to TrainingStateData have been successfully send to the Learning Loop. + Receives the TrainingStateData object which was returned by self._get_new_best_training_state. + If above function returns None, this function is not called. 
The respective files for this model should be stored so they can be later uploaded in get_latest_model_files. """ raise NotImplementedError @@ -468,7 +470,7 @@ def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: @abstractmethod def _get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]: """Called when the Learning Loop requests to backup the latest model for the training. - This function is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop. + This function is used to generate and gather all files needed for transfering the actual data from the trainer node to the Learning Loop. In the simplest implementation this method just renames the weight file (e.g. stored in TrainingStateData.meta_information) into a file name like latest_published_model Should return a list of file paths which describe the model. These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.) 
diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py index 4f0d2708..f4fb3fc8 100644 --- a/mock_trainer/app_code/mock_trainer_logic.py +++ b/mock_trainer/app_code/mock_trainer_logic.py @@ -23,23 +23,23 @@ def __init__(self, model_format: str) -> None: self.current_iteration = 0 self.provide_new_model = True - def can_resume(self) -> bool: + def _can_resume(self) -> bool: return False - async def resume(self) -> None: + async def _resume(self) -> None: pass - async def start_training(self) -> None: + async def _start_training_from_base_model(self) -> None: self.current_iteration = 0 if self.error_configuration.begin_training: raise Exception('Could not start training') self.executor.start('while true; do sleep 1; done') - async def start_training_from_scratch(self) -> None: + async def _start_training_from_scratch(self) -> None: self.current_iteration = 0 self.executor.start('while true; do sleep 1; done') - def get_executor_error_from_log(self) -> Optional[str]: + def _get_executor_error_from_log(self) -> Optional[str]: if self.error_configuration.crash_training: return 'mocked crash' return None @@ -108,7 +108,7 @@ def training_progress(self) -> float: print(f'prog. 
is {self.current_iteration} / {self.max_iterations} = {self.current_iteration / self.max_iterations}') return self.current_iteration / self.max_iterations - def _get_new_best_model(self) -> Optional[TrainingStateData]: + def _get_new_best_training_state(self) -> Optional[TrainingStateData]: logging.warning('get_new_model called') if self.error_configuration.get_new_model: raise Exception('Could not get new model') diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py index 60029db2..0946f991 100644 --- a/mock_trainer/app_code/tests/test_mock_trainer.py +++ b/mock_trainer/app_code/tests/test_mock_trainer.py @@ -30,7 +30,7 @@ async def test_get_model_files(setup_test_project2): async def test_get_new_model(setup_test_project2): mock_trainer = await create_mock_trainer() - await mock_trainer.start_training() + await mock_trainer._start_training_from_base_model() model = Model(uuid=(str(uuid4()))) context = Context(organization="", project="") @@ -41,5 +41,5 @@ async def test_get_new_model(setup_test_project2): images_folder="", training_folder="",) mock_trainer.training.data = TrainingData(image_data=[], categories=[]) - model = mock_trainer._get_new_best_model() + model = mock_trainer._get_new_best_training_state() assert model is not None From 44466e956ba2cb41396dccbaecec84a3fdcd5585 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Mon, 25 Mar 2024 18:45:38 +0100 Subject: [PATCH 24/62] Further refactoring and API improvements --- learning_loop_node/data_classes/training.py | 2 +- learning_loop_node/trainer/executor.py | 1 + .../trainer/trainer_logic_generic.py | 29 +++++++++++-------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index c192ba41..f88a7a00 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -107,7 +107,7 @@ class Training(): training_folder: str # f'{project_folder}/trainings/{trainings_id}' start_time: float = field(default_factory=time.time) - # model uuid to download (to continue training) | is '' when training from scratch + # model uuid to download (to continue training) | is not a uuid when training from scratch (blank or pt-name ?!) base_model_id: Optional[str] = None data: Optional[TrainingData] = None training_number: Optional[int] = None diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py index e8dc66ae..e823c0d4 100644 --- a/learning_loop_node/trainer/executor.py +++ b/learning_loop_node/trainer/executor.py @@ -31,6 +31,7 @@ def __init__(self, base_path: str) -> None: self._process: Optional[subprocess.Popen[bytes]] = None def start(self, cmd: str) -> None: + logging.info(f'Starting executor with command: {cmd}') with open(f'{self.path}/last_training.log', 'a') as f: f.write(f'\nStarting executor with command: {cmd}\n') diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index e1798c46..34535bbc 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -25,6 +25,7 @@ class TrainerLogicGeneric(ABC): def __init__(self, model_format: str) -> None: # NOTE: model_format is used in the file path for the model on the 
server: + # It acts as a key for list of files (cf. _get_latest_model_files) # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' self.model_format: str = model_format self.errors = Errors() @@ -294,16 +295,17 @@ async def _download_model(self) -> None: The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training. """ base_model_id = self.training.base_model_id + # TODO this checks if we continue a training -> make more explicit - if base_model_id and is_valid_uuid4(self.training.base_model_id): - logging.info('loading model from Learning Loop') - logging.info(f'downloading model {base_model_id} as {self.model_format}') - await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_id, self.model_format) - shutil.move(f'{self.training.training_folder}/model.json', - f'{self.training.training_folder}/base_model.json') - else: - logging.info( - f'base_model_id {base_model_id} is not a valid uuid4 (or no base model was not provided), skipping download') + if not base_model_id or not is_valid_uuid4(base_model_id): + logging.info(f'skipping model download. No base model id provided: {base_model_id}') + return + + logging.info('loading model from Learning Loop') + logging.info(f'downloading model {base_model_id} as {self.model_format}') + await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_id, self.model_format) + shutil.move(f'{self.training.training_folder}/model.json', + f'{self.training.training_folder}/base_model.json') async def _sync_confusion_matrix(self) -> None: """Syncronizes the confusion matrix with the Learning Loop via the update_training endpoint. 
@@ -468,11 +470,12 @@ def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: raise NotImplementedError @abstractmethod - def _get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]: + def _get_latest_model_files(self) -> Dict[str, List[str]]: """Called when the Learning Loop requests to backup the latest model for the training. - This function is used to generate and gather all files needed for transfering the actual data from the trainer node to the Learning Loop. + This function is used to __generate and gather__ all files needed for transfering the actual data from the trainer node to the Learning Loop. In the simplest implementation this method just renames the weight file (e.g. stored in TrainingStateData.meta_information) into a file name like latest_published_model - Should return a list of file paths which describe the model. + + The function should return a list of file paths which describe the model per format. These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.) and will be stored in the Learning Loop unter the format of this trainer. Note: by convention the weightfile should be named "model." where extension is the file format of the weightfile. @@ -480,6 +483,8 @@ def _get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[st If a trainer can also generate other formats (for example for an detector), a dictionary mapping format -> list of files can be returned. + + If the function returns an empty dict, something went wrong and the model upload will be skipped. """ raise NotImplementedError From 793e8bbff7fd3085a01f1874a11e48bbb67b4d6f Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Tue, 26 Mar 2024 09:19:01 +0100 Subject: [PATCH 25/62] Further refactoring and API improvements --- learning_loop_node/data_classes/training.py | 2 +- learning_loop_node/data_exchanger.py | 6 ++---- .../tests/states/test_state_detecting.py | 4 ++-- .../tests/states/test_state_upload_model.py | 4 ++-- learning_loop_node/trainer/trainer_logic.py | 20 +++++++++---------- .../trainer/trainer_logic_generic.py | 8 ++++---- .../app_code/tests/test_detections.py | 2 +- 7 files changed, 21 insertions(+), 25 deletions(-) diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index f88a7a00..f65503b5 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -112,7 +112,7 @@ class Training(): data: Optional[TrainingData] = None training_number: Optional[int] = None training_state: Optional[str] = None - model_id_for_detecting: Optional[str] = None + model_uuid_for_detecting: Optional[str] = None hyperparameters: Optional[Dict] = None @property diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index 8a5633d4..0d4d2add 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -157,12 +157,10 @@ async def upload_model_get_uuid(self, context: Context, files: List[str], traini """Used by the trainers. Function returns the new model uuid to use for detection.""" response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/trainings/{training_number}/models/latest/{mformat}/file', files=files) if response.status_code != 200: - logging.error( - f'---- could not upload model for training {training_number} and format {mformat}. 
Details: {response.text}') + logging.error(f'Could not upload model for training {training_number}, format {mformat}: {response.text}') response.raise_for_status() return None uploaded_model = response.json() - logging.info( - f'---- uploaded model for training {training_number} and format {mformat}. Model id is {uploaded_model}') + logging.info(f'Uploaded model for training {training_number}, format {mformat}. Response is: {uploaded_model}') return uploaded_model['id'] diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py index 40a62e63..770429c8 100644 --- a/learning_loop_node/trainer/tests/states/test_state_detecting.py +++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py @@ -37,7 +37,7 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded) trainer._init_from_last_training() - trainer.training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242' + trainer.training.model_uuid_for_detecting = '12345678-bobo-7e92-f95f-424242424242' _ = asyncio.get_running_loop().create_task(trainer._run()) @@ -64,7 +64,7 @@ async def test_model_not_downloadable_error(test_initialized_trainer: TestingTra assert trainer_has_error(trainer) assert trainer.training.training_state == TrainerState.TrainModelUploaded - assert trainer.training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000' + assert trainer.training.model_uuid_for_detecting == '00000000-0000-0000-0000-000000000000' assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py b/learning_loop_node/trainer/tests/states/test_state_upload_model.py index 36c625f4..b2bfa4c7 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py +++ 
b/learning_loop_node/trainer/tests/states/test_state_upload_model.py @@ -30,7 +30,7 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer assert trainer_has_error(trainer) is False assert trainer.training.training_state == TrainerState.TrainModelUploaded - assert trainer.training.model_id_for_detecting is not None + assert trainer.training.model_uuid_for_detecting is not None assert trainer.node.last_training_io.load() == trainer.training @@ -68,7 +68,7 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai assert trainer_has_error(trainer) assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced - assert trainer.training.model_id_for_detecting is None + assert trainer.training.model_uuid_for_detecting is None assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 985479b8..32377b01 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -18,10 +18,10 @@ class TrainerLogic(TrainerLogicGeneric): def __init__(self, model_format: str) -> None: + """This class is the base class for all trainers that use an executor to run training processes. 
+ The executor is used to run the training process in a separate process.""" + super().__init__(model_format) - self.model_format: str = model_format - # NOTE: String to be used in the file path for the model on the server: - # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' self._detection_progress: Optional[float] = None self._executor: Optional[Executor] = None self.start_training_task: Optional[Coroutine] = None @@ -49,9 +49,10 @@ async def _train(self) -> None: try: await self._start_training() - last_sync_time = datetime.now() + while True: + await asyncio.sleep(0.1) if not self.executor.is_running(): break if (datetime.now() - last_sync_time).total_seconds() > 5: @@ -65,19 +66,16 @@ async def _train(self) -> None: logging.warning('CancelledError in run_training') raise except Exception: - pass - else: - await asyncio.sleep(0.1) + logging.error('Error in sync_confusion_matrix (this error is ignored)') - error = self._get_executor_error_from_log() - if error: + if error := self._get_executor_error_from_log(): raise TrainingError(cause=error) if self.executor.return_code != 0: # TODO check if this works to catch errors from the executor: raise TrainingError(cause=f'Executor returned with error code: {self.executor.return_code}') except TrainingError: - logging.exception('Error in TrainingProcess') + logging.exception('Exception in trainer_logic._train') if self.executor.is_running(): self.executor.stop() self.training.training_state = previous_state @@ -85,7 +83,7 @@ async def _train(self) -> None: async def _do_detections(self) -> None: context = self.training.context - model_id = self.training.model_id_for_detecting + model_id = self.training.model_uuid_for_detecting assert model_id, 'model_id must be set' tmp_folder = f'/tmp/model_for_auto_detections_{model_id}_{self.model_format}' diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index 
34535bbc..44cc7463 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -340,12 +340,12 @@ async def _sync_confusion_matrix(self) -> None: async def _upload_model(self) -> None: """Uploads the latest model to the Learning Loop. """ - new_model_id = await self._upload_model_return_new_model_uuid(self.training.context) - if new_model_id is None: + new_model_uuid = await self._upload_model_return_new_model_uuid(self.training.context) + if new_model_uuid is None: self.training.training_state = TrainerState.ReadyForCleanup logging.error('could not upload model - maybe training failed.. cleaning up') - logging.info(f'Successfully uploaded model and received new model id: {new_model_id}') - self.training.model_id_for_detecting = new_model_id + logging.info(f'Successfully uploaded model and received new model id: {new_model_uuid}') + self.training.model_uuid_for_detecting = new_model_uuid async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]: """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file. diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py index 7b909db3..a1e3b471 100644 --- a/mock_trainer/app_code/tests/test_detections.py +++ b/mock_trainer/app_code/tests/test_detections.py @@ -35,7 +35,7 @@ async def test_all(setup_test_project1, glc: LoopCommunicator): project_folder = create_project_folder(context) training = generate_training(project_folder, context) - training.model_id_for_detecting = latest_model_id + training.model_uuid_for_detecting = latest_model_id trainer._training = training await trainer._do_detections() detections = trainer.active_training_io.load_detections() From 161dde5b4bf810a9d7c91c74ce55ba01a74ec8f7 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Tue, 26 Mar 2024 09:40:03 +0100 Subject: [PATCH 26/62] fix tests --- .../trainer/tests/states/test_state_detecting.py | 10 ++++------ .../trainer/tests/states/test_state_train.py | 9 +++------ .../trainer/tests/testing_trainer_logic.py | 6 +++--- learning_loop_node/trainer/trainer_logic.py | 7 +++++-- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py index 770429c8..5492f8dc 100644 --- a/learning_loop_node/trainer/tests/states/test_state_detecting.py +++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py @@ -17,12 +17,10 @@ def trainer_has_error(trainer: TrainerLogic): async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic): # NOTE was a flaky test trainer = test_initialized_trainer create_active_training_file(trainer, training_state='train_model_uploaded', - model_id_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a') + model_uuid_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a') # trainer.load_active_training() _ = asyncio.get_running_loop().create_task( - trainer._perform_state('do_detections', TrainerState.Detecting, - TrainerState.Detected, trainer._do_detections) - ) + trainer._perform_state('do_detections', TrainerState.Detecting, TrainerState.Detected, trainer._do_detections)) await assert_training_state(trainer.training, TrainerState.Detecting, timeout=1, interval=0.001) await assert_training_state(trainer.training, TrainerState.Detected, timeout=10, interval=0.001) @@ -45,7 +43,7 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer await trainer.stop() await asyncio.sleep(0.1) - assert trainer._training is None # pylint: disable=protected-access + assert trainer._training is None assert trainer.active_training_io.detections_exist() is False assert trainer.node.last_training_io.exists() is False @@ -53,7 +51,7 @@ 
async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer async def test_model_not_downloadable_error(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded, - model_id_for_detecting='00000000-0000-0000-0000-000000000000') # bad model id + model_uuid_for_detecting='00000000-0000-0000-0000-000000000000') # bad model id trainer._init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer._run()) diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py index 3c6b579a..f5ac282f 100644 --- a/learning_loop_node/trainer/tests/states/test_state_train.py +++ b/learning_loop_node/trainer/tests/states/test_state_train.py @@ -19,7 +19,6 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) await asyncio.sleep(0.1) # give tests a bit time to to check for the state assert trainer.start_training_task is not None - assert trainer.start_training_task.__name__ == 'start_training' assert trainer._executor is not None trainer._executor.stop() # NOTE normally a training terminates itself @@ -38,12 +37,11 @@ async def test_stop_running_training(test_initialized_trainer: TestingTrainerLog _ = asyncio.get_running_loop().create_task(trainer._run()) await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01) - await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.01) assert trainer.start_training_task is not None - assert trainer.start_training_task.__name__ == 'start_training' await trainer.stop() - await assert_training_state(trainer.training, 
TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=2, interval=0.01) assert trainer.training.training_state == TrainerState.TrainingFinished assert trainer.node.last_training_io.load() == trainer.training @@ -55,14 +53,13 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain # NOTE e.g. when a node-computer is restarted create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) trainer._init_from_last_training() - trainer._can_resume = True + trainer._can_resume_flag = True _ = asyncio.get_running_loop().create_task(trainer._run()) await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01) await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None - assert trainer.start_training_task.__name__ == 'resume' assert trainer._executor is not None trainer._executor.stop() # NOTE normally a training terminates itself e.g diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index d4daaff1..73b1b5a9 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -11,7 +11,7 @@ class TestingTrainerLogic(TrainerLogic): def __init__(self, can_resume: bool = False) -> None: super().__init__('mocked') - self._can_resume: bool = can_resume + self._can_resume_flag: bool = can_resume self.has_new_model: bool = False self.error_msg: Optional[str] = None @@ -68,7 +68,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona assert isinstance(result, str) return result - def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: + def _get_latest_model_files(self) -> Dict[str, List[str]]: time.sleep(1) # NOTE 
reduce flakyness in Backend tests du to wrong order of events. fake_weight_file = '/tmp/weightfile.weights' with open(fake_weight_file, 'wb') as f: @@ -80,7 +80,7 @@ def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: return {'mocked': [fake_weight_file, more_data_file], 'mocked_2': [fake_weight_file, more_data_file]} def _can_resume(self) -> bool: - return self._can_resume + return self._can_resume_flag async def _resume(self) -> None: return await self._start_training_from_base_model() diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 32377b01..c2e73eb1 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -71,8 +71,9 @@ async def _train(self) -> None: if error := self._get_executor_error_from_log(): raise TrainingError(cause=error) - if self.executor.return_code != 0: # TODO check if this works to catch errors from the executor: - raise TrainingError(cause=f'Executor returned with error code: {self.executor.return_code}') + # NOTE: This is problematic, because the return code is not 0 when executor was stoppen e.g. via self.stop() + # if self.executor.return_code != 0: + # raise TrainingError(cause=f'Executor returned with error code: {self.executor.return_code}') except TrainingError: logging.exception('Exception in trainer_logic._train') @@ -139,6 +140,8 @@ async def _start_training(self): async def stop(self) -> None: """If executor is running, stop it. Else cancel training task.""" + print('===============> stop received in trainer_logic.', flush=True) + if not self.training_active: return if self._executor and self._executor.is_running(): From 875f184a84870d82200076f9f9ddc0a821bf09e0 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Tue, 26 Mar 2024 10:47:00 +0100 Subject: [PATCH 27/62] Refactoring --- learning_loop_node/data_classes/training.py | 7 ++++--- learning_loop_node/trainer/tests/testing_trainer_logic.py | 2 +- learning_loop_node/trainer/trainer_logic.py | 2 +- learning_loop_node/trainer/trainer_logic_generic.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index f65503b5..ecb3025c 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -107,8 +107,9 @@ class Training(): training_folder: str # f'{project_folder}/trainings/{trainings_id}' start_time: float = field(default_factory=time.time) - # model uuid to download (to continue training) | is not a uuid when training from scratch (blank or pt-name ?!) - base_model_id: Optional[str] = None + # model uuid to download (to continue training) | is not a uuid when training from scratch (blank or pt-name from provided_pretrained_models->name) + base_model_uuid_or_name: Optional[str] = None + data: Optional[TrainingData] = None training_number: Optional[int] = None training_state: Optional[str] = None @@ -123,7 +124,7 @@ def set_values_from_data(self, data: Dict) -> None: self.data = TrainingData(categories=Category.from_list(data['categories'])) self.data.hyperparameter = Hyperparameter.from_data(data=data) self.training_number = data['training_number'] - self.base_model_id = data['id'] + self.base_model_uuid_or_name = data['id'] self.training_state = TrainerState.Initialized diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index 73b1b5a9..02ffa2d4 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -36,7 +36,7 @@ async def _start_training_from_base_model(self, model: str = 'model.model') 
-> N self._executor.start('while true; do sleep 1; done') async def _start_training_from_scratch(self) -> None: - base_model_id = self.training.base_model_id + base_model_id = self.training.base_model_uuid_or_name assert base_model_id is not None await self._start_training_from_base_model(model=f'model_{base_model_id}.pt') diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index c2e73eb1..0e67d992 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -129,7 +129,7 @@ async def _start_training(self): if self._can_resume(): self.start_training_task = self._resume() else: - base_model_id = self.training.base_model_id + base_model_id = self.training.base_model_uuid_or_name if not is_valid_uuid4(base_model_id): self.start_training_task = self._start_training_from_scratch() else: diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index 44cc7463..d329e488 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -294,7 +294,7 @@ async def _download_model(self) -> None: """If training is continued, the model is downloaded from the Learning Loop to the training_folder. The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training. """ - base_model_id = self.training.base_model_id + base_model_id = self.training.base_model_uuid_or_name # TODO this checks if we continue a training -> make more explicit if not base_model_id or not is_valid_uuid4(base_model_id): From 101c7faf084358bc1d5f2296d91b44840f9c8b90 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Tue, 26 Mar 2024 10:56:55 +0100 Subject: [PATCH 28/62] Refactoring, fix tests --- learning_loop_node/helpers/misc.py | 2 ++ .../states/test_state_download_train_model.py | 2 +- .../trainer/tests/testing_trainer_logic.py | 5 ++--- .../trainer/trainer_logic_generic.py | 14 ++++++++------ 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py index 0f75509b..aea20e60 100644 --- a/learning_loop_node/helpers/misc.py +++ b/learning_loop_node/helpers/misc.py @@ -152,6 +152,8 @@ async def wrapper_ensure_socket_response(*args, **kwargs): def is_valid_uuid4(val): + if not val: + return False try: _ = UUID(str(val)).version return True diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py index 1679f70b..282a2288 100644 --- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py @@ -50,7 +50,7 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogi async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.DataDownloaded, - base_model_id='00000000-0000-0000-0000-000000000000') # bad model id) + base_model_uuid_or_name='00000000-0000-0000-0000-000000000000') # bad model id) trainer._init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer._run()) diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index 02ffa2d4..dacfd2b6 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -36,9 +36,8 @@ async def _start_training_from_base_model(self, 
model: str = 'model.model') -> N self._executor.start('while true; do sleep 1; done') async def _start_training_from_scratch(self) -> None: - base_model_id = self.training.base_model_uuid_or_name - assert base_model_id is not None - await self._start_training_from_base_model(model=f'model_{base_model_id}.pt') + assert self.training.base_model_uuid_or_name is not None, 'base_model_uuid_or_name must be set' + await self._start_training_from_base_model(model=f'model_{self.training.base_model_uuid_or_name}.pt') def _get_new_best_training_state(self) -> Optional[TrainingStateData]: if self.has_new_model: diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index d329e488..e9abdc1a 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -157,7 +157,9 @@ def model_architecture(self) -> Optional[str]: @property @abstractmethod def provided_pretrained_models(self) -> List[PretrainedModel]: - """Returns the list of provided pretrained models""" + """Returns the list of provided pretrained models. + The names of the models will come back as model_uuid_or_name in the training details. + """ raise NotImplementedError # ---------------------------------------- METHODS ---------------------------------------- @@ -294,16 +296,16 @@ async def _download_model(self) -> None: """If training is continued, the model is downloaded from the Learning Loop to the training_folder. The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training. """ - base_model_id = self.training.base_model_uuid_or_name + base_model_uuid = self.training.base_model_uuid_or_name # TODO this checks if we continue a training -> make more explicit - if not base_model_id or not is_valid_uuid4(base_model_id): - logging.info(f'skipping model download. 
No base model id provided: {base_model_id}') + if not is_valid_uuid4(base_model_uuid): + logging.info(f'skipping model download. No base model provided (in form of uuid): {base_model_uuid}') return logging.info('loading model from Learning Loop') - logging.info(f'downloading model {base_model_id} as {self.model_format}') - await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_id, self.model_format) + logging.info(f'downloading model {base_model_uuid} as {self.model_format}') + await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_uuid, self.model_format) shutil.move(f'{self.training.training_folder}/model.json', f'{self.training.training_folder}/base_model.json') From 7b64f9cd14659ac8c898a83f9708bdb920b6a223 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Tue, 26 Mar 2024 13:00:09 +0100 Subject: [PATCH 29/62] Minor fixes --- learning_loop_node/__init__.py | 4 ++-- learning_loop_node/trainer/trainer_logic_generic.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/learning_loop_node/__init__.py b/learning_loop_node/__init__.py index 5f4433bc..2fa5362e 100644 --- a/learning_loop_node/__init__.py +++ b/learning_loop_node/__init__.py @@ -1,6 +1,4 @@ import logging -import os -import sys # from . 
import log_conf from .detector.detector_logic import DetectorLogic @@ -8,4 +6,6 @@ from .globals import GLOBALS from .trainer.trainer_node import TrainerNode +__all__ = ['TrainerNode', 'DetectorNode', 'DetectorLogic', 'GLOBALS'] + logging.info('>>>>>>>>>>>>>>>>>> LOOP INITIALIZED <<<<<<<<<<<<<<<<<<<<<<<') diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index e9abdc1a..6334e82e 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -456,7 +456,7 @@ def _get_new_best_training_state(self) -> Optional[TrainingStateData]: """Is called frequently by `_sync_confusion_matrix` to check if a new "best" model is availabe. Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information). `confusion_matrix` contains a dict of all classes: - - The classes must be identified by their id, not their name. + - The classes must be identified by their uuid, not their name. - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives). `meta_information` can hold any data which is helpful for self._on_metrics_published to store weight file etc for later upload via self.get_model_files """ From 676484f9bf2921bcdf79a42d40012693a5fcdafd Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Tue, 26 Mar 2024 18:06:35 +0100 Subject: [PATCH 30/62] Prevent deadlock when training is stopped before a valid model was created --- learning_loop_node/trainer/io_helpers.py | 3 ++- learning_loop_node/trainer/trainer_logic.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/learning_loop_node/trainer/io_helpers.py b/learning_loop_node/trainer/io_helpers.py index 1ae3bd43..4849d67a 100644 --- a/learning_loop_node/trainer/io_helpers.py +++ b/learning_loop_node/trainer/io_helpers.py @@ -147,7 +147,8 @@ async def upload_detetions(self): num_files = self.get_number_of_detection_files() print(f'num_files: {num_files}', flush=True) if not num_files: - raise Exception('no detection files found') + logging.error('no detection files found') + return current_json_file_index = self.load_detections_upload_file_index() for i in range(current_json_file_index, num_files): detections = self.load_detections(i) diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 0e67d992..8cbba5e2 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -85,7 +85,9 @@ async def _train(self) -> None: async def _do_detections(self) -> None: context = self.training.context model_id = self.training.model_uuid_for_detecting - assert model_id, 'model_id must be set' + if not model_id: + logging.error('model_id is not set! Cannot do detections.') + return tmp_folder = f'/tmp/model_for_auto_detections_{model_id}_{self.model_format}' shutil.rmtree(tmp_folder, ignore_errors=True) From 4a6636f921efe65059e28221e3ad0e16f0cceba5 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 3 Apr 2024 15:59:37 +0200 Subject: [PATCH 31/62] make _get_latest_model_files async and don't run it on threadpool --- .../trainer/trainer_logic_generic.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index 6334e82e..27819b2b 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -6,7 +6,7 @@ import time from abc import ABC, abstractmethod from dataclasses import asdict -from typing import TYPE_CHECKING, Callable, Coroutine, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Callable, Coroutine, Dict, List, Optional from fastapi.encoders import jsonable_encoder @@ -327,12 +327,10 @@ async def _sync_confusion_matrix(self) -> None: result = await self.node.sio_client.call('update_training', ( self.training.context.organization, self.training.context.project, jsonable_encoder(new_training))) if isinstance(result, dict) and result['success']: - logging.info( - f'successfully updated training {asdict(new_training)}') + logging.info(f'successfully updated training {asdict(new_training)}') self._on_metrics_published(new_best_model) else: - raise Exception( - f'Error for update_training: Response from loop was : {result}') + raise Exception(f'Error for update_training: Response from loop was : {result}') except Exception as e: logging.exception('Error during confusion matrix syncronization') self.errors.set(error_key, str(e)) @@ -353,10 +351,9 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file. Note that with the latest trainers the conversion to (.wts) is done by the trainer. The conversion from .wts to .engine is done by the detector (needs to be done on target hardware). 
- Note that trainer may train with different classes, which is why we send an initial model.json file. - """ - # NOTE: I guess this is in executor because originally the conversion happened here.. - files = await asyncio.get_running_loop().run_in_executor(None, self._get_latest_model_files) + Note that trainer may train with different classes, which is why we send an initial model.json file.""" + + files = await self._get_latest_model_files() if files is None: return None @@ -472,7 +469,7 @@ def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: raise NotImplementedError @abstractmethod - def _get_latest_model_files(self) -> Dict[str, List[str]]: + async def _get_latest_model_files(self) -> Dict[str, List[str]]: """Called when the Learning Loop requests to backup the latest model for the training. This function is used to __generate and gather__ all files needed for transfering the actual data from the trainer node to the Learning Loop. In the simplest implementation this method just renames the weight file (e.g. stored in TrainingStateData.meta_information) into a file name like latest_published_model From 9cddd10092bf88fc55b020ad713b37881ae0e6ee Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 3 Apr 2024 16:08:44 +0200 Subject: [PATCH 32/62] make sure no old cookies are used --- learning_loop_node/loop_communication.py | 1 + 1 file changed, 1 insertion(+) diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index 99d9f70b..57feaf4b 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -24,6 +24,7 @@ def __init__(self) -> None: self.project: str = environment_reader.project() # used by mock_detector self.base_url: str = f'http{"s" if "learning-loop.ai" in host else ""}://' + host self.async_client: httpx.AsyncClient = httpx.AsyncClient(base_url=self.base_url, timeout=Timeout(60.0)) + self.async_client.cookies.clear() logging.info(f'Loop interface initialized with base_url: {self.base_url} / user: {self.username}') From ccde1c34ad5fe0f9997a277e3c6935e43d084f5c Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Wed, 3 Apr 2024 16:09:00 +0200 Subject: [PATCH 33/62] refactoring --- learning_loop_node/detector/detector_node.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py index db657698..92b5fa21 100644 --- a/learning_loop_node/detector/detector_node.py +++ b/learning_loop_node/detector/detector_node.py @@ -223,8 +223,7 @@ async def _check_for_update(self) -> None: await self.data_exchanger.download_model(target_model_folder, Context(organization=self.organization, project=self.project), - update_to_model_id, - self.detector_logic.model_format) + update_to_model_id, self.detector_logic.model_format) try: os.unlink(model_symlink) os.remove(model_symlink) From a4e2167e8bfe3dabb78cb112364449b7e25f6d1d Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 3 Apr 2024 16:09:37 +0200 Subject: [PATCH 34/62] simplify process executor and use async process api --- learning_loop_node/trainer/executor.py | 149 +++++++++++-------------- 1 file changed, 67 insertions(+), 82 deletions(-) diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py index e823c0d4..2e50e498 100644 --- a/learning_loop_node/trainer/executor.py +++ b/learning_loop_node/trainer/executor.py @@ -1,106 +1,91 @@ - -import ctypes +import asyncio import logging import os -import signal -import subprocess -from sys import platform +import shlex +from io import BufferedWriter from typing import List, Optional -import psutil - - -def create_signal_handler(sig=signal.SIGTERM): - if platform in ('linux', 'linux2'): - # "The system will send a signal to the child once the parent exits for any reason (even sigkill)." - # https://stackoverflow.com/a/19448096 - libc = ctypes.CDLL("libc.so.6") - - def callable_(): - os.setsid() - return libc.prctl(1, sig) - - return callable_ - return os.setsid - class Executor: - def __init__(self, base_path: str) -> None: + def __init__(self, base_path: str, log_name='last_training.log') -> None: + """An executor that runs a command in a separate async subprocess. + The log of the process is written to 'last_training.log' in the base_path. + Tthe process is executed in the base_path directory. 
+ The process should be awaited to finish using `wait` or stopped using `stop` to + avoid zombie processes and close the log file.""" + self.path = base_path + self.log_file_path = f'{self.path}/{log_name}' + self.log_file: None | BufferedWriter = None + self._process: Optional[asyncio.subprocess.Process] = None # pylint: disable=no-member os.makedirs(self.path, exist_ok=True) - self._process: Optional[subprocess.Popen[bytes]] = None - - def start(self, cmd: str) -> None: - logging.info(f'Starting executor with command: {cmd}') - with open(f'{self.path}/last_training.log', 'a') as f: - f.write(f'\nStarting executor with command: {cmd}\n') - - # pylint: disable=subprocess-popen-preexec-fn - self._process = subprocess.Popen( - f'cd {self.path}; {cmd} >> last_training.log 2>&1', - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - executable='/bin/bash', - preexec_fn=create_signal_handler()) - def is_running(self) -> bool: - if self._process is None: - return False + async def start(self, cmd: str, env: Optional[dict[str, str]] = None) -> None: + """Start the process with the given command and environment variables.""" - if self._process.poll() is not None: - return False + full_env = os.environ.copy() + if env is not None: + full_env.update(env) - try: - psutil.Process(self._process.pid) - except psutil.NoSuchProcess: - # self.process.terminate() # TODO does this make sense? 
- self._process = None - return False + logging.info(f'Starting executor with command: {cmd} in {self.path} - logging to {self.log_file_path}') + self.log_file = open(self.log_file_path, 'ab') - return True + self._process = await asyncio.create_subprocess_exec( + *shlex.split(cmd), + cwd=self.path, + stdout=self.log_file, + stderr=asyncio.subprocess.STDOUT, # Merge stderr with stdout + env=full_env + ) + + def is_running(self) -> bool: + """Check if the process is still running.""" + return self._process is not None and self._process.returncode is None def get_log(self) -> str: - try: - with open(f'{self.path}/last_training.log') as f: - return f.read() - except Exception: + """Get the log of the process as a string.""" + if not os.path.exists(self.log_file_path): return '' + with open(self.log_file_path, 'r') as f: + return f.read() - def get_log_by_lines(self, since_last_start=False) -> List[str]: # TODO do not read whole log again - try: - with open(f'{self.path}/last_training.log') as f: - lines = f.readlines() - if since_last_start: - lines_since_last_start = [] - for line in reversed(lines): - lines_since_last_start.append(line) - if line.startswith('Starting executor with command:'): - break - return list(reversed(lines_since_last_start)) - return lines - except Exception: + def get_log_by_lines(self, tail: Optional[int] = None) -> List[str]: + """Get the log of the process as a list of lines.""" + if not os.path.exists(self.log_file_path): return [] + with open(self.log_file_path) as f: + lines = f.readlines() + if tail is not None: + lines = lines[-tail:] + return lines - def stop(self): - if self._process is None: - logging.info('no process running ... nothing to stop') - return + def close_log(self): + """Close the log file.""" + if self.log_file is not None: + self.log_file.close() + self.log_file = None - logging.info('terminating process') + async def wait(self) -> Optional[int]: + """Wait for the process to finish. 
Returns the return code of the process.""" - try: - os.killpg(os.getpgid(self._process.pid), signal.SIGTERM) - except ProcessLookupError: - pass + if not self._process: + logging.info('No process started... nothing to wait for') + return None + return_code = await self._process.wait() + self.close_log() + return return_code - self._process.terminate() - _, _ = self._process.communicate(timeout=3) + async def stop(self) -> Optional[int]: + """Stop the process and wait for it to finish. Returns the return code of the process.""" - @property - def return_code(self): - if not self._process: + if self._process is None: + logging.info('No process started... nothing to stop') return None - if self.is_running(): + + try: + self._process.terminate() + except ProcessLookupError: + logging.info('Process not found... nothing to stop') return None - return self._process.poll() + + return await self.wait() From 4c254bb5050d94dc8eae6153299b3311be063cec Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Wed, 3 Apr 2024 16:34:07 +0200 Subject: [PATCH 35/62] Refactor executor --- learning_loop_node/trainer/executor.py | 69 ++++++++++++++++---------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py index 2e50e498..0ffa4da6 100644 --- a/learning_loop_node/trainer/executor.py +++ b/learning_loop_node/trainer/executor.py @@ -19,6 +19,13 @@ def __init__(self, base_path: str, log_name='last_training.log') -> None: self.log_file: None | BufferedWriter = None self._process: Optional[asyncio.subprocess.Process] = None # pylint: disable=no-member os.makedirs(self.path, exist_ok=True) + return None + + def _get_running_process(self) -> Optional[asyncio.subprocess.Process]: # pylint: disable=no-member + """Get the running process if available.""" + if self._process is not None and self._process.returncode is None: + return self._process + return None async def start(self, cmd: str, env: 
Optional[dict[str, str]] = None) -> None: """Start the process with the given command and environment variables.""" @@ -42,6 +49,43 @@ def is_running(self) -> bool: """Check if the process is still running.""" return self._process is not None and self._process.returncode is None + def terminate(self) -> None: + """Terminate the process.""" + + if process := self._get_running_process(): + try: + process.terminate() + return + except ProcessLookupError: + logging.error('No process to terminate') + self._process = None + + async def wait(self) -> Optional[int]: + """Wait for the process to finish. Returns the return code of the process or None if no process is running.""" + + if not self._process: + logging.info('No process to wait for') + return None + + return_code = await self._process.wait() + + self.close_log() + self._process = None + + return return_code + + async def stop_and_wait(self) -> Optional[int]: + """Terminate the process and wait for it to finish. Returns the return code of the process.""" + + if not self.is_running(): + logging.info('No process to stop') + return None + + self.terminate() + return await self.wait() + + # -------------------------------------------------------------------------------------------- LOGGING + def get_log(self) -> str: """Get the log of the process as a string.""" if not os.path.exists(self.log_file_path): @@ -64,28 +108,3 @@ def close_log(self): if self.log_file is not None: self.log_file.close() self.log_file = None - - async def wait(self) -> Optional[int]: - """Wait for the process to finish. Returns the return code of the process.""" - - if not self._process: - logging.info('No process started... nothing to wait for') - return None - return_code = await self._process.wait() - self.close_log() - return return_code - - async def stop(self) -> Optional[int]: - """Stop the process and wait for it to finish. Returns the return code of the process.""" - - if self._process is None: - logging.info('No process started... 
nothing to stop') - return None - - try: - self._process.terminate() - except ProcessLookupError: - logging.info('Process not found... nothing to stop') - return None - - return await self.wait() From c8d64afdb74df8b22bc7f98120cf4b2b2d9a2120 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Wed, 3 Apr 2024 16:34:59 +0200 Subject: [PATCH 36/62] adapt backdoor controls to async api --- learning_loop_node/trainer/rest/backdoor_controls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py index e3b17ed3..e2dafc26 100644 --- a/learning_loop_node/trainer/rest/backdoor_controls.py +++ b/learning_loop_node/trainer/rest/backdoor_controls.py @@ -128,7 +128,7 @@ async def kill_process(request: Request): assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic' if not trainer_logic._executor or not trainer_logic._executor.is_running(): raise HTTPException(status_code=409, detail="trainer is not running") - trainer_logic._executor.stop() + await trainer_logic._executor.stop_and_wait() @router.post("/force_status_update") From 25966bd0cc515c4ea5634a3da34ab20a1da5959c Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 3 Apr 2024 16:36:21 +0200 Subject: [PATCH 37/62] adapt trainer_logic to async executor api --- learning_loop_node/trainer/trainer_logic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 8cbba5e2..286f05b3 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -77,8 +77,7 @@ async def _train(self) -> None: except TrainingError: logging.exception('Exception in trainer_logic._train') - if self.executor.is_running(): - self.executor.stop() + await self.executor.stop_and_wait() self.training.training_state = previous_state raise @@ -123,6 +122,7 @@ async def _do_detections(self) -> None: batch_images = images[i:i+batch_size] batch_detections = await self._detect(model_information, batch_images, tmp_folder) self.active_training_io.save_detections(batch_detections, idx) + break # ---------------------------------------- METHODS ---------------------------------------- @@ -147,7 +147,7 @@ async def stop(self) -> None: if not self.training_active: return if self._executor and self._executor.is_running(): - self.executor.stop() + await self.executor.stop_and_wait() elif self.training_task: logging.info('cancelling training task') if self.training_task.cancel(): From 5d99f839d49ecef9412624f5398dd6aebf87c1e4 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 3 Apr 2024 16:37:07 +0200 Subject: [PATCH 38/62] adapt tests to async executor api --- learning_loop_node/tests/test_executor.py | 9 +++++---- .../trainer/tests/states/test_state_train.py | 4 ++-- .../trainer/tests/testing_trainer_logic.py | 9 ++++----- mock_trainer/app_code/mock_trainer_logic.py | 4 ++-- mock_trainer/app_code/tests/test_mock_trainer.py | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py index 1079ea1c..ab359c3c 100644 --- a/learning_loop_node/tests/test_executor.py +++ b/learning_loop_node/tests/test_executor.py @@ -21,16 +21,17 @@ def cleanup(): cleanup_process.communicate() -def test_executor_lifecycle(): +@pytest.mark.asyncio +async def test_executor_lifecycle(): assert_process_is_running('some_executable.sh', False) executor = Executor('/tmp/test_executor/' + str(uuid4())) cmd = executor.path + '/some_executable.sh' with open(cmd, 'w') as f: - f.write('while true; do echo "some output"; sleep 1; done') + f.write('/bin/bash -c "while true; do sleep 1; done"') os.chmod(cmd, 0o755) - executor.start(cmd) + await executor.start(cmd) assert executor.is_running() assert_process_is_running('some_executable.sh') @@ -38,7 +39,7 @@ def test_executor_lifecycle(): sleep(1) assert 'some output' in executor.get_log() - executor.stop() + await executor.stop_and_wait() assert not executor.is_running() sleep(1) diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py index f5ac282f..66fa2639 100644 --- a/learning_loop_node/trainer/tests/states/test_state_train.py +++ b/learning_loop_node/trainer/tests/states/test_state_train.py @@ -21,7 +21,7 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic assert trainer.start_training_task is not None assert trainer._executor is not None - trainer._executor.stop() # NOTE normally a 
training terminates itself + await trainer._executor.stop_and_wait() # NOTE normally a training terminates itself await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer.training.training_state == TrainerState.TrainingFinished @@ -62,7 +62,7 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain assert trainer.start_training_task is not None assert trainer._executor is not None - trainer._executor.stop() # NOTE normally a training terminates itself e.g + await trainer._executor.stop_and_wait() # NOTE normally a training terminates itself e.g await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer.training.training_state == TrainerState.TrainingFinished diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index dacfd2b6..62eba09a 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -25,15 +25,14 @@ def model_architecture(self) -> str: @property def provided_pretrained_models(self) -> List[PretrainedModel]: - return [ - PretrainedModel(name='small', label='Small', description='a small model'), - PretrainedModel(name='medium', label='Medium', description='a medium model'), - PretrainedModel(name='large', label='Large', description='a large model')] + return [PretrainedModel(name='small', label='Small', description='a small model'), + PretrainedModel(name='medium', label='Medium', description='a medium model'), + PretrainedModel(name='large', label='Large', description='a large model')] # pylint: disable=unused-argument async def _start_training_from_base_model(self, model: str = 'model.model') -> None: assert self._executor is not None - self._executor.start('while true; do sleep 1; done') + await self._executor.start('/bin/bash -c "while true; do sleep 1; 
done"') async def _start_training_from_scratch(self) -> None: assert self.training.base_model_uuid_or_name is not None, 'base_model_uuid_or_name must be set' diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py index f4fb3fc8..51840904 100644 --- a/mock_trainer/app_code/mock_trainer_logic.py +++ b/mock_trainer/app_code/mock_trainer_logic.py @@ -33,11 +33,11 @@ async def _start_training_from_base_model(self) -> None: self.current_iteration = 0 if self.error_configuration.begin_training: raise Exception('Could not start training') - self.executor.start('while true; do sleep 1; done') + await self.executor.start('/bin/bash -c "while true; do sleep 1; done"') async def _start_training_from_scratch(self) -> None: self.current_iteration = 0 - self.executor.start('while true; do sleep 1; done') + await self.executor.start('/bin/bash -c "while true; do sleep 1; done"') def _get_executor_error_from_log(self) -> Optional[str]: if self.error_configuration.crash_training: diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py index 0946f991..a5d397f5 100644 --- a/mock_trainer/app_code/tests/test_mock_trainer.py +++ b/mock_trainer/app_code/tests/test_mock_trainer.py @@ -13,7 +13,7 @@ async def create_mock_trainer() -> MockTrainerLogic: mock_trainer = MockTrainerLogic(model_format='mocked') - mock_trainer._executor = Executor(GLOBALS.data_folder) # pylint: disable=protected-access + mock_trainer._executor = Executor(GLOBALS.data_folder) return mock_trainer From 1f699f90edc762e10846b1d2c7f807a5ff6b43c3 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 3 Apr 2024 16:38:48 +0200 Subject: [PATCH 39/62] adapt mock_trainer_logic to new api --- mock_trainer/app_code/mock_trainer_logic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py index 51840904..d293758e 100644 --- a/mock_trainer/app_code/mock_trainer_logic.py +++ b/mock_trainer/app_code/mock_trainer_logic.py @@ -2,7 +2,7 @@ import asyncio import logging import time -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional from learning_loop_node.data_classes import (BoxDetection, CategoryType, ClassificationDetection, Detections, ErrorConfiguration, ModelInformation, Point, PointDetection, @@ -44,7 +44,7 @@ def _get_executor_error_from_log(self) -> Optional[str]: return 'mocked crash' return None - def _get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: + async def _get_latest_model_files(self) -> Dict[str, List[str]]: if self.error_configuration.save_model: raise Exception() From ed5cb3a9889840015dd9ebb09ca1bd56c3fa6b21 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 3 Apr 2024 16:39:37 +0200 Subject: [PATCH 40/62] adapt testing_trainer_logic to new api --- learning_loop_node/trainer/tests/testing_trainer_logic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index 62eba09a..fc71f277 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -66,7 +66,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona assert isinstance(result, str) return result - def _get_latest_model_files(self) -> Dict[str, List[str]]: + async def _get_latest_model_files(self) -> Dict[str, List[str]]: time.sleep(1) # NOTE reduce flakyness in Backend tests du to wrong order of events. fake_weight_file = '/tmp/weightfile.weights' with open(fake_weight_file, 'wb') as f: From 8266fa0bb1148486455dcfcfed6dc22216a54b84 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Wed, 3 Apr 2024 16:40:30 +0200 Subject: [PATCH 41/62] fix typing error --- learning_loop_node/trainer/trainer_logic_generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index 27819b2b..f790bbd9 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -299,7 +299,7 @@ async def _download_model(self) -> None: base_model_uuid = self.training.base_model_uuid_or_name # TODO this checks if we continue a training -> make more explicit - if not is_valid_uuid4(base_model_uuid): + if not base_model_uuid or not is_valid_uuid4(base_model_uuid): logging.info(f'skipping model download. No base model provided (in form of uuid): {base_model_uuid}') return From f740e910b40c8fa1bc9d8b06e8bf334315f7684d Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 3 Apr 2024 17:06:40 +0200 Subject: [PATCH 42/62] minor refactoring, fix of tests --- learning_loop_node/trainer/executor.py | 1 - learning_loop_node/trainer/tests/test_errors.py | 5 ++++- learning_loop_node/trainer/tests/testing_trainer_logic.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py index 0ffa4da6..082407ad 100644 --- a/learning_loop_node/trainer/executor.py +++ b/learning_loop_node/trainer/executor.py @@ -19,7 +19,6 @@ def __init__(self, base_path: str, log_name='last_training.log') -> None: self.log_file: None | BufferedWriter = None self._process: Optional[asyncio.subprocess.Process] = None # pylint: disable=no-member os.makedirs(self.path, exist_ok=True) - return None def _get_running_process(self) -> Optional[asyncio.subprocess.Process]: # pylint: disable=no-member """Get the running process if available.""" diff --git a/learning_loop_node/trainer/tests/test_errors.py b/learning_loop_node/trainer/tests/test_errors.py index 507c494a..9a9c1cd8 100644 --- a/learning_loop_node/trainer/tests/test_errors.py +++ b/learning_loop_node/trainer/tests/test_errors.py @@ -1,6 +1,8 @@ import asyncio import re +import pytest + from learning_loop_node.data_classes import TrainerState from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic @@ -19,6 +21,7 @@ async def test_training_process_is_stopped_when_trainer_reports_error(test_initi await assert_training_state(trainer.training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) +@pytest.mark.skip(reason='The since_last_start flag is deprecated.') async def test_log_can_provide_only_data_for_current_run(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, 
training_state=TrainerState.TrainModelDownloaded) @@ -39,4 +42,4 @@ async def test_log_can_provide_only_data_for_current_run(test_initialized_traine assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) > 1 # Here only the current run is provided - assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines(since_last_start=True)))) == 1 + # assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines(since_last_start=True)))) == 1 diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index fc71f277..50171e08 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -1,6 +1,6 @@ import asyncio import time -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional from learning_loop_node.data_classes import Context, Detections, ModelInformation, PretrainedModel, TrainingStateData from learning_loop_node.trainer.trainer_logic import TrainerLogic From 0ed1673188708878f26289765ce5d883f6b41842 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 3 Apr 2024 17:47:40 +0200 Subject: [PATCH 43/62] simplify state string and do all detections --- learning_loop_node/data_classes/training.py | 2 +- learning_loop_node/trainer/trainer_logic.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index ecb3025c..d530ae7a 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -94,7 +94,7 @@ def short_str(self) -> str: cntxt = f'{self.context.organization}/{self.context.project}' if self.context else '' hyps = f'({self.hyperparameters})' if self.hyperparameters else '' arch = f'.{self.architecture} - ' if self.architecture else '' - return f'[{str(self.state)} {prgr}. {self.name}({self.id}). Tr/Ts/Tsk: {trtesk} {cntxt}{arch}{hyps}]' + return f'[{str(self.state).rsplit(".", maxsplit=1)[-1]} {prgr}. {self.name}({self.id}). Tr/Ts/Tsk: {trtesk} {cntxt}{arch}{hyps}]' @dataclass(**KWONLY_SLOTS) diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 286f05b3..93128b4b 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -122,7 +122,6 @@ async def _do_detections(self) -> None: batch_images = images[i:i+batch_size] batch_detections = await self._detect(model_information, batch_images, tmp_folder) self.active_training_io.save_detections(batch_detections, idx) - break # ---------------------------------------- METHODS ---------------------------------------- From dd6b998d4f58936a8bdf52ab0642e0ea35f4c43f Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Thu, 4 Apr 2024 13:18:47 +0200 Subject: [PATCH 44/62] fix test test_executor_lifecycle --- learning_loop_node/tests/test_executor.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py index ab359c3c..38d50b4b 100644 --- a/learning_loop_node/tests/test_executor.py +++ b/learning_loop_node/tests/test_executor.py @@ -25,10 +25,10 @@ def cleanup(): async def test_executor_lifecycle(): assert_process_is_running('some_executable.sh', False) - executor = Executor('/tmp/test_executor/' + str(uuid4())) - cmd = executor.path + '/some_executable.sh' - with open(cmd, 'w') as f: - f.write('/bin/bash -c "while true; do sleep 1; done"') + executor = Executor('/tmp/test_executor/' + str(uuid4())+'/') + cmd = 'bash some_executable.sh' + with open(executor.path+'some_executable.sh', 'w') as f: + f.write('/bin/bash -c "while true; do sleep 1; echo some output; done"') os.chmod(cmd, 0o755) await executor.start(cmd) @@ -49,6 +49,7 @@ async def test_executor_lifecycle(): def assert_process_is_running(process_name, running=True): if running: for process in psutil.process_iter(): + print(process.name(), process.cmdline()) process_name_match = process_name in process.name() process_cmd_match = process_name in str(process.cmdline()) if process_name_match or process_cmd_match: From c156d0b6434630ed45670b63f906550a0779a069 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Thu, 4 Apr 2024 13:23:10 +0200 Subject: [PATCH 45/62] fix test test_go_to_cleanup_if_no_detections_exist --- .../tests/states/test_state_upload_detections.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py index 8918eece..e2784514 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py @@ -133,7 +133,9 @@ async def test_bad_status_from_LearningLoop(test_initialized_trainer: TestingTra assert trainer.node.last_training_io.load() == trainer.training -async def test_other_errors(test_initialized_trainer: TestingTrainerLogic): +async def test_go_to_cleanup_if_no_detections_exist(test_initialized_trainer: TestingTrainerLogic): + """This test simulates a situation where the detection file is missing. + In this case, the trainer should report an error and move to the ReadyForCleanup state.""" trainer = test_initialized_trainer # e.g. missing detection file @@ -141,12 +143,7 @@ async def test_other_errors(test_initialized_trainer: TestingTrainerLogic): trainer._init_from_last_training() _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) - await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001) - - assert trainer_has_error(trainer) - assert trainer.training.training_state == TrainerState.Detected - assert trainer.node.last_training_io.load() == trainer.training + await assert_training_state(trainer.training, TrainerState.ReadyForCleanup, timeout=1, interval=0.001) async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic): From 185b9edcad44513075e43806f00957c5f315dee4 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Thu, 4 Apr 2024 18:56:15 +0200 Subject: [PATCH 46/62] deactivate asyncio warnings --- learning_loop_node/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py index 5424c110..9418123e 100644 --- a/learning_loop_node/node.py +++ b/learning_loop_node/node.py @@ -78,7 +78,7 @@ async def lifespan(self, app: FastAPI): # pylint: disable=unused-argument async def _on_startup(self): self.log.info('received "startup" lifecycle-event') - activate_asyncio_warnings() + # activate_asyncio_warnings() if self.needs_login: await self.loop_communicator.backend_ready() self.log.info('ensuring login') From 6510f88503a791e13cdfdd0e8341901fce7f3dd0 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Fri, 5 Apr 2024 11:06:26 +0200 Subject: [PATCH 47/62] set request throttle to avoid Error 429 --- learning_loop_node/data_exchanger.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index 0d4d2add..c9815e24 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -6,6 +6,7 @@ from glob import glob from http import HTTPStatus from io import BytesIO +from time import time from typing import Dict, List, Optional import aiofiles # type: ignore @@ -108,13 +109,15 @@ async def download_images(self, image_uuids: List[str], image_folder: str, chunk chunk_ids = image_uuids[i:i+chunk_size] tasks = [] for j, chunk_j in enumerate(chunk_paths): + start = time() tasks.append(create_task(self._download_one_image(chunk_j, chunk_ids[j], image_folder))) + await asyncio.sleep(max(0, 0.02 - (time() - start))) # prevent too many requests at once await asyncio.gather(*tasks) async def _download_one_image(self, path: str, image_id: str, image_folder: str) -> None: response = await self.loop_communicator.get(path) if response.status_code != HTTPStatus.OK: - logging.error(f'bad 
status code {response.status_code} for {path}') + logging.error(f'bad status code {response.status_code} for {path}. Details: {response.text}') return filename = f'{image_folder}/{image_id}.jpg' async with aiofiles.open(filename, 'wb') as f: From d811b53d47365f6e94399019b9ef29c75b2274f1 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Fri, 5 Apr 2024 11:07:47 +0200 Subject: [PATCH 48/62] make inference_batch_size an attribute that can be overwritten by children --- learning_loop_node/trainer/trainer_logic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 93128b4b..108349ad 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -25,6 +25,7 @@ def __init__(self, model_format: str) -> None: self._detection_progress: Optional[float] = None self._executor: Optional[Executor] = None self.start_training_task: Optional[Coroutine] = None + self.inference_batch_size = 10 # ---------------------------------------- IMPLEMENTED ABSTRACT PROPERTIES ---------------------------------------- @@ -116,10 +117,9 @@ async def _do_detections(self) -> None: self.active_training_io.save_detections([], 0) num_images = len(images) - batch_size = 200 - for idx, i in enumerate(range(0, num_images, batch_size)): + for idx, i in enumerate(range(0, num_images, self.inference_batch_size)): self._detection_progress = 0.5 + (i/num_images)*0.5 - batch_images = images[i:i+batch_size] + batch_images = images[i:i+self.inference_batch_size] batch_detections = await self._detect(model_information, batch_images, tmp_folder) self.active_training_io.save_detections(batch_detections, idx) From a55788303ade5f44caad512763281b231c8eb7c5 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Fri, 5 Apr 2024 18:44:22 +0200 Subject: [PATCH 49/62] add 'model_size' to ModelInformation. 
Required when continuing training Fix tests --- learning_loop_node/data_classes/general.py | 1 + learning_loop_node/tests/test_executor.py | 2 +- learning_loop_node/trainer/tests/states/test_state_train.py | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py index 5c616841..3ef5e412 100644 --- a/learning_loop_node/data_classes/general.py +++ b/learning_loop_node/data_classes/general.py @@ -53,6 +53,7 @@ class ModelInformation(): categories: List[Category] resolution: Optional[int] = None model_root_path: Optional[str] = None + model_size: Optional[str] = None @property def context(self): diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py index 38d50b4b..7a69dca4 100644 --- a/learning_loop_node/tests/test_executor.py +++ b/learning_loop_node/tests/test_executor.py @@ -29,7 +29,7 @@ async def test_executor_lifecycle(): cmd = 'bash some_executable.sh' with open(executor.path+'some_executable.sh', 'w') as f: f.write('/bin/bash -c "while true; do sleep 1; echo some output; done"') - os.chmod(cmd, 0o755) + os.chmod(executor.path+'some_executable.sh', 0o755) await executor.start(cmd) diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py index 66fa2639..4e1d200c 100644 --- a/learning_loop_node/trainer/tests/states/test_state_train.py +++ b/learning_loop_node/trainer/tests/states/test_state_train.py @@ -16,12 +16,12 @@ async def test_successful_training(test_initialized_trainer: TestingTrainerLogic _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) - await asyncio.sleep(0.1) # give tests a bit time to to check for the state + await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01) + 
await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.01) assert trainer.start_training_task is not None assert trainer._executor is not None - await trainer._executor.stop_and_wait() # NOTE normally a training terminates itself + await trainer.stop() # NOTE normally a training terminates itself await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer.training.training_state == TrainerState.TrainingFinished From d4551f33471cfc190da4186cd26243c96192afc1 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Mon, 8 Apr 2024 11:25:49 +0200 Subject: [PATCH 50/62] Improve logs. Cleanup tmp-dir after model download. --- learning_loop_node/data_exchanger.py | 9 ++++++--- learning_loop_node/trainer/trainer_logic.py | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index c9815e24..92b899b4 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -126,8 +126,10 @@ async def _download_one_image(self, path: str, image_id: str, image_folder: str) os.remove(filename) async def download_model(self, target_folder: str, context: Context, model_uuid: str, model_format: str) -> List[str]: - """Downloads a model and returns the paths of the downloaded files.""" - logging.info(f'Downloading model {model_uuid} to {target_folder}..') + """Downloads a model (and additional meta data like model.json) and returns the paths of the downloaded files. + Used before training a model (in case of resuming a training) or before detecting images. 
+ """ + logging.info(f'Downloading model data for uuid {model_uuid} from the loop to {target_folder}..') path = f'/{context.organization}/projects/{context.project}/models/{model_uuid}/{model_format}/file' response = await self.loop_communicator.get(path, requires_login=False) @@ -153,7 +155,8 @@ async def download_model(self, target_folder: str, context: Context, model_uuid: new_file = shutil.move(file, target_folder) created_files.append(new_file) - logging.info(f'---- downloaded model {model_uuid}/{model_format} to {tmp_path}. Moved to {target_folder}.') + shutil.rmtree(tmp_path, ignore_errors=True) + logging.info(f'Downloaded model {model_uuid}({model_format}) to {target_folder}.') return created_files async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]: diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 108349ad..ea32b6dc 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -130,8 +130,8 @@ async def _start_training(self): if self._can_resume(): self.start_training_task = self._resume() else: - base_model_id = self.training.base_model_uuid_or_name - if not is_valid_uuid4(base_model_id): + base_model_uuid_or_name = self.training.base_model_uuid_or_name + if not is_valid_uuid4(base_model_uuid_or_name): self.start_training_task = self._start_training_from_scratch() else: self.start_training_task = self._start_training_from_base_model() From 26c9a33531eabed0a820d31b959aa33ee9559660 Mon Sep 17 00:00:00 2001 From: Niklas Neugebauer <68709968+NiklasNeugebauer@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:01:55 +0200 Subject: [PATCH 51/62] Update data_exchanger.py clarify docstring --- learning_loop_node/data_exchanger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index 
92b899b4..9e8ffdb8 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -127,7 +127,7 @@ async def _download_one_image(self, path: str, image_id: str, image_folder: str) async def download_model(self, target_folder: str, context: Context, model_uuid: str, model_format: str) -> List[str]: """Downloads a model (and additional meta data like model.json) and returns the paths of the downloaded files. - Used before training a model (in case of resuming a training) or before detecting images. + Used before training a model (when continuing a finished training) or before detecting images. """ logging.info(f'Downloading model data for uuid {model_uuid} from the loop to {target_folder}..') From 1fa6c4ebae6c5025db3dcd97a1cbe9ca91f41f07 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Tue, 9 Apr 2024 11:29:03 +0200 Subject: [PATCH 52/62] Handle file opening error --- .syncignore | 5 +++++ learning_loop_node/loop_communication.py | 10 +++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 .syncignore diff --git a/.syncignore b/.syncignore new file mode 100644 index 00000000..7bf361fc --- /dev/null +++ b/.syncignore @@ -0,0 +1,5 @@ +.git/ +__pycache__/ +.DS_Store +*.tmp +.env \ No newline at end of file diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index 57feaf4b..901532fd 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -81,7 +81,15 @@ async def put(self, path, files: Optional[List[str]] = None, requires_login=True if files is None: return await self.async_client.put(api_prefix+path, **kwargs) - file_handles = [open(f, 'rb') for f in files] # Open files and store handles + file_handles = [] + for f in files: + try: + file_handles.append(open(f, 'rb')) + except FileNotFoundError: + for fh in file_handles: + fh.close() # Ensure all files are closed + return httpx.Response(404, content=b'File not 
found') + try: file_list = [('files', fh) for fh in file_handles] # Use file handles response = await self.async_client.put(api_prefix+path, files=file_list) From 7cd73e1a0125e974aa2899564ba985853553fff8 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Tue, 9 Apr 2024 11:54:32 +0200 Subject: [PATCH 53/62] (re)enable todo/fixme warnings in mypy --- .vscode/settings.json | 1 - 1 file changed, 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index ff950a35..aec19884 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -9,7 +9,6 @@ "--disable=C0111", // Missing docstring (in function/class/method) "--disable=C0114", // Missing module docstring "--disable=C0301", // Line too long (exceeds character limit) - "--disable=W0511", // TODO/FIXME not being used "--disable=W0718", // Catching too general exception "--disable=W0719", // Raising too general exception "--disable=W1203", // Use % formatting in logging functions and pass the % parameters as arguments From 20adbc7b90e9201940ea3fd2d935a2a166d228bc Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Wed, 10 Apr 2024 10:21:25 +0200 Subject: [PATCH 54/62] improve handling of authorization in rest communication --- learning_loop_node/loop_communication.py | 29 +++++++++++++++++++----- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index 901532fd..62ecccd9 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -31,14 +31,14 @@ def __init__(self) -> None: def websocket_url(self) -> str: return f'ws{"s" if "learning-loop.ai" in self.host else ""}://' + self.host - async def ensure_login(self) -> None: + async def ensure_login(self, relogin=False) -> None: """aiohttp client session needs to be created on the event loop""" assert not self.async_client.is_closed, 'async client must not be used after shutdown' - if not self.async_client.cookies.keys(): + if not self.async_client.cookies.keys() or relogin: + self.async_client.cookies.clear() response = await self.async_client.post('/api/login', data={'username': self.username, 'password': self.password}) if response.status_code != 200: - self.async_client.cookies.clear() logging.info(f'Login failed with response: {response}') raise LoopCommunicationException('Login failed with response: ' + str(response)) self.async_client.cookies.update(response.cookies) @@ -50,6 +50,7 @@ async def logout(self) -> None: if response.status_code != 200: logging.info(f'Logout failed with response: {response}') raise LoopCommunicationException('Logout failed with response: ' + str(response)) + self.async_client.cookies.clear() def get_cookies(self) -> Cookies: return self.async_client.cookies @@ -73,7 +74,12 @@ async def backend_ready(self) -> bool: async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/api') -> httpx.Response: if requires_login: await self.ensure_login() - return await self.async_client.get(api_prefix+path) + + response = await 
self.async_client.get(api_prefix+path) + + if response.status_code == 401: + await self.ensure_login(relogin=True) + return response async def put(self, path, files: Optional[List[str]] = None, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response: if requires_login: @@ -97,14 +103,25 @@ async def put(self, path, files: Optional[List[str]] = None, requires_login=True for fh in file_handles: fh.close() # Ensure all files are closed + if response.status_code == 401: + await self.ensure_login(relogin=True) return response async def post(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response: if requires_login: await self.ensure_login() - return await self.async_client.post(api_prefix+path, **kwargs) + response = await self.async_client.post(api_prefix+path, **kwargs) + + if response.status_code == 401: + await self.ensure_login(relogin=True) + return response async def delete(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response: if requires_login: await self.ensure_login() - return await self.async_client.delete(api_prefix+path, **kwargs) + + response = await self.async_client.delete(api_prefix+path, **kwargs) + + if response.status_code == 401: + await self.ensure_login(relogin=True) + return response From f2e41a525da801c785206c82cd3dd4bca4593ac7 Mon Sep 17 00:00:00 2001 From: Niklas Neugebauer <68709968+NiklasNeugebauer@users.noreply.github.com> Date: Thu, 11 Apr 2024 10:27:37 +0200 Subject: [PATCH 55/62] create executable_path variable to avoid redundancy --- learning_loop_node/tests/test_executor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py index 7a69dca4..1842f71e 100644 --- a/learning_loop_node/tests/test_executor.py +++ b/learning_loop_node/tests/test_executor.py @@ -27,9 +27,10 @@ async def test_executor_lifecycle(): executor = Executor('/tmp/test_executor/' + str(uuid4())+'/') cmd 
= 'bash some_executable.sh' - with open(executor.path+'some_executable.sh', 'w') as f: + executable_path = executor.path+'some_executable.sh' + with open(executable_path, 'w') as f: f.write('/bin/bash -c "while true; do sleep 1; echo some output; done"') - os.chmod(executor.path+'some_executable.sh', 0o755) + os.chmod(executable_path, 0o755) await executor.start(cmd) From 719c664d0f248f6de83629b5106ad738a11839d6 Mon Sep 17 00:00:00 2001 From: Niklas Neugebauer Date: Thu, 11 Apr 2024 11:37:17 +0200 Subject: [PATCH 56/62] Retry http requests if login required and 401 was thrown --- learning_loop_node/loop_communication.py | 56 +++++++++++++++++++----- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index 62ecccd9..0642c3c1 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -1,6 +1,6 @@ import asyncio import logging -from typing import List, Optional +from typing import Awaitable, Callable, List, Optional import httpx from httpx import Cookies, Timeout @@ -71,19 +71,40 @@ async def backend_ready(self) -> bool: logging.info(f'backend not ready: {e}') await asyncio.sleep(10) + async def retry_on_401(self, func: Callable[..., Awaitable[httpx.Response]], *args, **kwargs) -> httpx.Response: + response = await func(*args, **kwargs) + if response.status_code == 401: + await self.ensure_login(relogin=True) + response = await func(*args, **kwargs) + return response + async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/api') -> httpx.Response: if requires_login: await self.ensure_login() + # retry on 401 if required + if requires_login: + return await self.retry_on_401(self._get, path, api_prefix) + else: + return await self._get(path, api_prefix) + + async def _get(self, path: str, api_prefix: str = '/api') -> httpx.Response: + response = await self.async_client.get(api_prefix+path) - if 
response.status_code == 401: - await self.ensure_login(relogin=True) return response - async def put(self, path, files: Optional[List[str]] = None, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response: + async def put(self, path: str, files: Optional[List[str]] = None, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response: if requires_login: await self.ensure_login() + + # retry on 401 if required + if requires_login: + return await self.retry_on_401(self._put, path, api_prefix, **kwargs) + else: + return await self._put(path, files, api_prefix, **kwargs) + + async def _put(self, path, files: Optional[List[str]] = None, api_prefix='/api', **kwargs) -> httpx.Response: if files is None: return await self.async_client.put(api_prefix+path, **kwargs) @@ -103,25 +124,36 @@ async def put(self, path, files: Optional[List[str]] = None, requires_login=True for fh in file_handles: fh.close() # Ensure all files are closed - if response.status_code == 401: - await self.ensure_login(relogin=True) return response - async def post(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response: + async def post(self, path: str, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response: if requires_login: await self.ensure_login() + + # retry on 401 if required + if requires_login: + return await self.retry_on_401(self._post, path, api_prefix, **kwargs) + else: + return await self._post(path, api_prefix, **kwargs) + + async def _post(self, path, api_prefix='/api', **kwargs) -> httpx.Response: + response = await self.async_client.post(api_prefix+path, **kwargs) - if response.status_code == 401: - await self.ensure_login(relogin=True) return response - async def delete(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response: + async def delete(self, path: str, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response: if requires_login: await 
self.ensure_login() + # retry on 401 if required + if requires_login: + return await self.retry_on_401(self._delete, path, api_prefix, **kwargs) + else: + return await self._delete(path, api_prefix, **kwargs) + + async def _delete(self, path, api_prefix='/api', **kwargs) -> httpx.Response: + response = await self.async_client.delete(api_prefix+path, **kwargs) - if response.status_code == 401: - await self.ensure_login(relogin=True) return response From 9bde621711004c3b7ad22a5b026165db8feb3aa4 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Thu, 11 Apr 2024 11:45:54 +0200 Subject: [PATCH 57/62] Major rerfactoring and Api changes. Resolve linting hints (#14) * improve code documentation, abstraction layers and api * fix all mypi and linting issues * solve all linting errors in mock nodes * further improvements of documentation and refactoring * Further refactoring and API improvements * Further refactoring and API improvements * Further refactoring and API improvements * fix tests * Refactoring * Refactoring, fix tests * Minor fixes * Prevent deadlock when training is stopped before a valid model was created * make _get_latest_model_files async and don't run it on threadpool * make sure no old cookies are used * refactoring * simplify process executor and use async process api * Refactor executor * adapt backdoor controls to async api * adapt trainer_logic to async executor api * adapt tests to async executor api * adapt mock_trainer_logic to new api * adapt testing_trainer_logic to new api * fix typing error * minor refactoring, fix of tests * simplify state string and do all detections * fix test test_executor_lifecycle * fix test test_go_to_cleanup_if_no_detections_exist * deactivate asyncio warnings * set request throttle to avoid Error 429 * make inference_batch_size an attribute that can be overwritten by children * add 'model_size' to ModelInformation. Required when continuing training Fix tests * Improve logs. Cleanup tmp-dir after model download. 
* Update data_exchanger.py clarify docstring * Handle file opening error * (re)enable todo/fixme warnings in mypy * create executable_path variable to avoid redundancy --------- Co-authored-by: Niklas Neugebauer <68709968+NiklasNeugebauer@users.noreply.github.com> --- .syncignore | 5 + .vscode/settings.json | 10 +- learning_loop_node/__init__.py | 4 +- .../annotation/annotator_logic.py | 4 +- learning_loop_node/data_classes/__init__.py | 15 +- learning_loop_node/data_classes/detections.py | 9 +- learning_loop_node/data_classes/general.py | 7 +- learning_loop_node/data_classes/training.py | 54 ++- learning_loop_node/data_exchanger.py | 30 +- learning_loop_node/detector/__init__.py | 1 - learning_loop_node/detector/detector_node.py | 7 +- .../inbox_filter/cam_observation_history.py | 11 +- learning_loop_node/detector/outbox.py | 1 - learning_loop_node/detector/tests/conftest.py | 1 - .../tests/test_client_communication.py | 4 +- .../detector/tests/test_outbox.py | 2 + learning_loop_node/globals.py | 4 +- .../helpers/gdrive_downloader.py | 2 +- learning_loop_node/helpers/misc.py | 31 +- learning_loop_node/loop_communication.py | 31 +- learning_loop_node/node.py | 4 +- learning_loop_node/py.typed | 0 learning_loop_node/tests/test_executor.py | 23 +- learning_loop_node/tests/test_helper.py | 1 - learning_loop_node/trainer/executor.py | 170 +++---- learning_loop_node/trainer/io_helpers.py | 23 +- .../trainer/rest/backdoor_controls.py | 7 +- learning_loop_node/trainer/rest/controls.py | 2 + learning_loop_node/trainer/tests/conftest.py | 47 +- .../trainer/tests/state_helper.py | 2 +- .../tests/states/test_state_cleanup.py | 8 +- .../tests/states/test_state_detecting.py | 45 +- .../states/test_state_download_train_model.py | 44 +- .../tests/states/test_state_prepare.py | 29 +- .../test_state_sync_confusion_matrix.py | 40 +- .../trainer/tests/states/test_state_train.py | 55 +-- .../states/test_state_upload_detections.py | 52 +- .../tests/states/test_state_upload_model.py | 
35 +- .../trainer/tests/test_errors.py | 25 +- .../trainer/tests/testing_trainer_logic.py | 46 +- learning_loop_node/trainer/trainer_logic.py | 145 +++--- .../trainer/trainer_logic_abstraction.py | 146 ------ .../trainer/trainer_logic_generic.py | 456 ++++++++++++------ learning_loop_node/trainer/trainer_node.py | 6 +- .../trainer/training_syncronizer.py | 53 -- mock_detector/app_code/tests/test_detector.py | 2 + mock_trainer/app_code/mock_trainer_logic.py | 69 ++- mock_trainer/app_code/progress_simulator.py | 10 +- mock_trainer/app_code/tests/conftest.py | 3 +- .../app_code/tests/test_detections.py | 14 +- .../app_code/tests/test_mock_trainer.py | 13 +- 51 files changed, 922 insertions(+), 886 deletions(-) create mode 100644 .syncignore create mode 100644 learning_loop_node/py.typed delete mode 100644 learning_loop_node/trainer/trainer_logic_abstraction.py delete mode 100644 learning_loop_node/trainer/training_syncronizer.py diff --git a/.syncignore b/.syncignore new file mode 100644 index 00000000..7bf361fc --- /dev/null +++ b/.syncignore @@ -0,0 +1,5 @@ +.git/ +__pycache__/ +.DS_Store +*.tmp +.env \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 45eb6e46..aec19884 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -12,7 +12,15 @@ "--disable=W0718", // Catching too general exception "--disable=W0719", // Raising too general exception "--disable=W1203", // Use % formatting in logging functions and pass the % parameters as arguments - "--disable=W1514" // Using open without explicitly specifying an encoding + "--disable=W1514", // Using open without explicitly specifying an encoding + "--disable=R0902", // Too many instance attributes + "--disable=R0903", // Too few public methods + "--disable=R0912", // Too many branches + "--disable=R0913", // Too many arguments + "--disable=R0914", // Too many local variables + "--disable=R0915", // Too many statements + "--disable=R1732", // Consider using with for 
resource-allocating operations + "--disable=R0801" // Similar lines in 2 files ], "[python]": { "editor.defaultFormatter": "ms-python.autopep8", diff --git a/learning_loop_node/__init__.py b/learning_loop_node/__init__.py index 5f4433bc..2fa5362e 100644 --- a/learning_loop_node/__init__.py +++ b/learning_loop_node/__init__.py @@ -1,6 +1,4 @@ import logging -import os -import sys # from . import log_conf from .detector.detector_logic import DetectorLogic @@ -8,4 +6,6 @@ from .globals import GLOBALS from .trainer.trainer_node import TrainerNode +__all__ = ['TrainerNode', 'DetectorNode', 'DetectorLogic', 'GLOBALS'] + logging.info('>>>>>>>>>>>>>>>>>> LOOP INITIALIZED <<<<<<<<<<<<<<<<<<<<<<<') diff --git a/learning_loop_node/annotation/annotator_logic.py b/learning_loop_node/annotation/annotator_logic.py index 932abce9..a80cc13b 100644 --- a/learning_loop_node/annotation/annotator_logic.py +++ b/learning_loop_node/annotation/annotator_logic.py @@ -7,10 +7,10 @@ class AnnotatorLogic(): - def __init__(self): + def __init__(self) -> None: self._node: Optional[Node] = None - def init(self, node: Node): + def init(self, node: Node) -> None: self._node = node @abstractmethod diff --git a/learning_loop_node/data_classes/__init__.py b/learning_loop_node/data_classes/__init__.py index 0e0a10e9..524cb8bb 100644 --- a/learning_loop_node/data_classes/__init__.py +++ b/learning_loop_node/data_classes/__init__.py @@ -4,5 +4,16 @@ from .general import (AnnotationNodeStatus, Category, CategoryType, Context, DetectionStatus, ErrorConfiguration, ModelInformation, NodeState, NodeStatus) from .socket_response import SocketResponse -from .training import (BasicModel, Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData, - TrainingError, TrainingOut, TrainingStatus) +from .training import (Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData, + TrainingError, TrainingOut, TrainingStateData, TrainingStatus) + +__all__ = [ + 
'AnnotationData', 'AnnotationEventType', 'SegmentationAnnotation', 'ToolOutput', 'UserInput', + 'BoxDetection', 'ClassificationDetection', 'Detections', 'Observation', 'Point', 'PointDetection', + 'SegmentationDetection', 'Shape', + 'AnnotationNodeStatus', 'Category', 'CategoryType', 'Context', 'DetectionStatus', 'ErrorConfiguration', + 'ModelInformation', 'NodeState', 'NodeStatus', + 'SocketResponse', + 'Errors', 'Hyperparameter', 'Model', 'PretrainedModel', 'TrainerState', 'Training', 'TrainingData', + 'TrainingError', 'TrainingOut', 'TrainingStateData', 'TrainingStatus', +] diff --git a/learning_loop_node/data_classes/detections.py b/learning_loop_node/data_classes/detections.py index 21924720..0872b256 100644 --- a/learning_loop_node/data_classes/detections.py +++ b/learning_loop_node/data_classes/detections.py @@ -13,8 +13,11 @@ @dataclass(**KWONLY_SLOTS) class BoxDetection(): + """Coordinates according to COCO format. x,y is the top left corner of the box. + x increases to the right, y increases downwards. + """ category_name: str - x: int # TODO add definition of x,y,w,h + x: int y: int width: int height: int @@ -47,6 +50,8 @@ def __str__(self): @dataclass(**KWONLY_SLOTS) class PointDetection(): + """Coordinates according to COCO format. x,y is the center of the point. 
+ x increases to the right, y increases downwards.""" category_name: str x: float y: float @@ -111,7 +116,7 @@ class Detections(): point_detections: List[PointDetection] = field(default_factory=list) segmentation_detections: List[SegmentationDetection] = field(default_factory=list) classification_detections: List[ClassificationDetection] = field(default_factory=list) - tags: Optional[List[str]] = field(default_factory=list) + tags: List[str] = field(default_factory=list) date: Optional[str] = field(default_factory=current_datetime) image_id: Optional[str] = None # used for detection of trainers diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py index 9d5c893e..3ef5e412 100644 --- a/learning_loop_node/data_classes/general.py +++ b/learning_loop_node/data_classes/general.py @@ -34,10 +34,6 @@ def from_list(values: List[dict]) -> List['Category']: return [from_dict(data_class=Category, data=value) for value in values] -def create_category(identifier: str, name: str, ctype: Union[CategoryType, str]): # TODO: This is probably unused - return Category(id=identifier, name=name, description='', hotkey='', color='', type=ctype, point_size=None) - - @dataclass(**KWONLY_SLOTS) class Context(): organization: str @@ -57,6 +53,7 @@ class ModelInformation(): categories: List[Category] resolution: Optional[int] = None model_root_path: Optional[str] = None + model_size: Optional[str] = None @property def context(self): @@ -64,6 +61,8 @@ def context(self): @staticmethod def load_from_disk(model_root_path: str) -> Optional['ModelInformation']: + """Load model.json from model_root_path and return ModelInformation object. 
+ """ model_info_file_path = f'{model_root_path}/model.json' if not os.path.exists(model_info_file_path): logging.warning(f"could not find model information file '{model_info_file_path}'") diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index 449cc85b..d530ae7a 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -3,6 +3,7 @@ import time from dataclasses import dataclass, field from enum import Enum +from pathlib import Path from typing import Dict, List, Optional # pylint: disable=no-name-in-module @@ -17,6 +18,14 @@ class Hyperparameter(): flip_rl: bool flip_ud: bool + @staticmethod + def from_data(data: Dict): + return Hyperparameter( + resolution=data['resolution'], + flip_rl=data.get('flip_rl', False), + flip_ud=data.get('flip_ud', False) + ) + @dataclass(**KWONLY_SLOTS) class TrainingData(): @@ -64,7 +73,7 @@ class TrainerState(str, Enum): @dataclass(**KWONLY_SLOTS) class TrainingStatus(): - id: str # TODO this must not be changed, but tests wont detect it -> update tests! + id: str # NOTE this must not be changed, but tests wont detect a change -> update tests! name: str state: Optional[str] errors: Optional[Dict] @@ -79,13 +88,13 @@ class TrainingStatus(): architecture: Optional[str] = None context: Optional[Context] = None - def short_str(self): + def short_str(self) -> str: prgr = f'{self.progress * 100:.0f}%' if self.progress else '' trtesk = f'{self.train_image_count}/{self.test_image_count}/{self.skipped_image_count}' if self.train_image_count else 'n.a.' cntxt = f'{self.context.organization}/{self.context.project}' if self.context else '' hyps = f'({self.hyperparameters})' if self.hyperparameters else '' arch = f'.{self.architecture} - ' if self.architecture else '' - return f'[{str(self.state)} {prgr}. {self.name}({self.id}). Tr/Ts/Tsk: {trtesk} {cntxt}{arch}{hyps}]' + return f'[{str(self.state).rsplit(".", maxsplit=1)[-1]} {prgr}. 
{self.name}({self.id}). Tr/Ts/Tsk: {trtesk} {cntxt}{arch}{hyps}]' @dataclass(**KWONLY_SLOTS) @@ -93,22 +102,35 @@ class Training(): id: str context: Context - project_folder: str - images_folder: str - training_folder: str + project_folder: str # f'{GLOBALS.data_folder}/{context.organization}/{context.project}' + images_folder: str # f'{project_folder}/images' + training_folder: str # f'{project_folder}/trainings/{trainings_id}' start_time: float = field(default_factory=time.time) - base_model_id: Optional[str] = None + # model uuid to download (to continue training) | is not a uuid when training from scratch (blank or pt-name from provided_pretrained_models->name) + base_model_uuid_or_name: Optional[str] = None + data: Optional[TrainingData] = None training_number: Optional[int] = None training_state: Optional[str] = None - model_id_for_detecting: Optional[str] = None + model_uuid_for_detecting: Optional[str] = None hyperparameters: Optional[Dict] = None + @property + def training_folder_path(self) -> Path: + return Path(self.training_folder) + + def set_values_from_data(self, data: Dict) -> None: + self.data = TrainingData(categories=Category.from_list(data['categories'])) + self.data.hyperparameter = Hyperparameter.from_data(data=data) + self.training_number = data['training_number'] + self.base_model_uuid_or_name = data['id'] + self.training_state = TrainerState.Initialized + @dataclass(**KWONLY_SLOTS) class TrainingOut(): - confusion_matrix: Optional[Dict] = None + confusion_matrix: Optional[Dict] = None # This is actually just class-wise metrics train_image_count: Optional[int] = None test_image_count: Optional[int] = None trainer_id: Optional[str] = None @@ -116,9 +138,9 @@ class TrainingOut(): @dataclass(**KWONLY_SLOTS) -class BasicModel(): - confusion_matrix: Optional[Dict] = None - meta_information: Optional[Dict] = None +class TrainingStateData(): + confusion_matrix: Dict = field(default_factory=dict) + meta_information: Dict = 
field(default_factory=dict) @dataclass(**KWONLY_SLOTS) @@ -133,8 +155,8 @@ class Model(): class Errors(): - def __init__(self): - self._errors: Dict = {} + def __init__(self) -> None: + self._errors: Dict[str, str] = {} def set(self, key: str, value: str): self._errors[key] = value @@ -143,7 +165,7 @@ def set(self, key: str, value: str): def errors(self) -> Dict: return self._errors - def reset(self, key: str): + def reset(self, key: str) -> None: try: del self._errors[key] except AttributeError: @@ -151,7 +173,7 @@ def reset(self, key: str): except KeyError: pass - def reset_all(self): + def reset_all(self) -> None: self._errors = {} def has_error_for(self, key: str) -> bool: diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index ab53b243..9e8ffdb8 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -6,6 +6,7 @@ from glob import glob from http import HTTPStatus from io import BytesIO +from time import time from typing import Dict, List, Optional import aiofiles # type: ignore @@ -108,13 +109,15 @@ async def download_images(self, image_uuids: List[str], image_folder: str, chunk chunk_ids = image_uuids[i:i+chunk_size] tasks = [] for j, chunk_j in enumerate(chunk_paths): + start = time() tasks.append(create_task(self._download_one_image(chunk_j, chunk_ids[j], image_folder))) + await asyncio.sleep(max(0, 0.02 - (time() - start))) # prevent too many requests at once await asyncio.gather(*tasks) async def _download_one_image(self, path: str, image_id: str, image_folder: str) -> None: response = await self.loop_communicator.get(path) if response.status_code != HTTPStatus.OK: - logging.error(f'bad status code {response.status_code} for {path}') + logging.error(f'bad status code {response.status_code} for {path}. 
Details: {response.text}') return filename = f'{image_folder}/{image_id}.jpg' async with aiofiles.open(filename, 'wb') as f: @@ -122,11 +125,13 @@ async def _download_one_image(self, path: str, image_id: str, image_folder: str) if not await is_valid_image(filename, self.check_jpeg): os.remove(filename) - async def download_model(self, target_folder: str, context: Context, model_id: str, model_format: str) -> List[str]: - """Downloads a model and returns the paths of the downloaded files.""" - logging.info(f'Downloading model {model_id} to {target_folder}..') + async def download_model(self, target_folder: str, context: Context, model_uuid: str, model_format: str) -> List[str]: + """Downloads a model (and additional meta data like model.json) and returns the paths of the downloaded files. + Used before training a model (when continuing a finished training) or before detecting images. + """ + logging.info(f'Downloading model data for uuid {model_uuid} from the loop to {target_folder}..') - path = f'/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' + path = f'/{context.organization}/projects/{context.project}/models/{model_uuid}/{model_format}/file' response = await self.loop_communicator.get(path, requires_login=False) if response.status_code != 200: content = response.json() @@ -150,19 +155,18 @@ async def download_model(self, target_folder: str, context: Context, model_id: s new_file = shutil.move(file, target_folder) created_files.append(new_file) - logging.info(f'---- downloaded model {model_id}/{model_format} to {tmp_path}. Moved to {target_folder}.') + shutil.rmtree(tmp_path, ignore_errors=True) + logging.info(f'Downloaded model {model_uuid}({model_format}) to {target_folder}.') return created_files async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]: """Used by the trainers. 
Function returns the new model uuid to use for detection.""" response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/trainings/{training_number}/models/latest/{mformat}/file', files=files) if response.status_code != 200: - logging.error( - f'---- could not upload model for training {training_number} and format {mformat}. Details: {response.text}') + logging.error(f'Could not upload model for training {training_number}, format {mformat}: {response.text}') response.raise_for_status() return None - else: - uploaded_model = response.json() - logging.info( - f'---- uploaded model for training {training_number} and format {mformat}. Model id is {uploaded_model}') - return uploaded_model['id'] + + uploaded_model = response.json() + logging.info(f'Uploaded model for training {training_number}, format {mformat}. Response is: {uploaded_model}') + return uploaded_model['id'] diff --git a/learning_loop_node/detector/__init__.py b/learning_loop_node/detector/__init__.py index 8b137891..e69de29b 100644 --- a/learning_loop_node/detector/__init__.py +++ b/learning_loop_node/detector/__init__.py @@ -1 +0,0 @@ - diff --git a/learning_loop_node/detector/detector_node.py b/learning_loop_node/detector/detector_node.py index 18b8ab6c..92b5fa21 100644 --- a/learning_loop_node/detector/detector_node.py +++ b/learning_loop_node/detector/detector_node.py @@ -186,7 +186,9 @@ async def _check_for_update(self) -> None: if not update_to_model_id: self.log.info('could not check for updates') return - if self.detector_logic.is_initialized: # TODO: solve race condition !!! + + # TODO: solve race condition (it should not be required to recheck if model_info is not None, but it is!) 
+ if self.detector_logic.is_initialized: model_info = self.detector_logic._model_info # pylint: disable=protected-access if model_info is not None: self.log.info(f'Current model: {model_info.version} with id {model_info.id}') @@ -221,8 +223,7 @@ async def _check_for_update(self) -> None: await self.data_exchanger.download_model(target_model_folder, Context(organization=self.organization, project=self.project), - update_to_model_id, - self.detector_logic.model_format) + update_to_model_id, self.detector_logic.model_format) try: os.unlink(model_symlink) os.remove(model_symlink) diff --git a/learning_loop_node/detector/inbox_filter/cam_observation_history.py b/learning_loop_node/detector/inbox_filter/cam_observation_history.py index 88bbe881..a87c72ee 100644 --- a/learning_loop_node/detector/inbox_filter/cam_observation_history.py +++ b/learning_loop_node/detector/inbox_filter/cam_observation_history.py @@ -1,20 +1,17 @@ import os from typing import List, Union -from learning_loop_node.data_classes import (BoxDetection, - ClassificationDetection, - Detections, Observation, - PointDetection, - SegmentationDetection) +from learning_loop_node.data_classes import (BoxDetection, ClassificationDetection, Detections, Observation, + PointDetection, SegmentationDetection) class CamObservationHistory: - def __init__(self): + def __init__(self) -> None: self.reset_time = 3600 self.recent_observations: List[Observation] = [] self.iou_threshold = 0.5 - def forget_old_detections(self): + def forget_old_detections(self) -> None: self.recent_observations = [detection for detection in self.recent_observations if not detection.is_older_than(self.reset_time)] diff --git a/learning_loop_node/detector/outbox.py b/learning_loop_node/detector/outbox.py index 23138c85..ca1a200d 100644 --- a/learning_loop_node/detector/outbox.py +++ b/learning_loop_node/detector/outbox.py @@ -53,7 +53,6 @@ def save(self, image: bytes, detections: Optional[Detections] = None, tags: Opti with open(tmp + 
'/image.json', 'w') as f: json.dump(jsonable_encoder(asdict(detections)), f) - # TODO sometimes No such file or directory: '/tmp/learning_loop_lib_data/tmp/2023-09-07_13:27:38.399/image.jpg' with open(tmp + '/image.jpg', 'wb') as f: f.write(image) diff --git a/learning_loop_node/detector/tests/conftest.py b/learning_loop_node/detector/tests/conftest.py index ad183fe2..1611f265 100644 --- a/learning_loop_node/detector/tests/conftest.py +++ b/learning_loop_node/detector/tests/conftest.py @@ -12,7 +12,6 @@ import uvicorn from learning_loop_node import DetectorNode -from learning_loop_node.data_classes.general import Category, ModelInformation from learning_loop_node.detector.outbox import Outbox from learning_loop_node.globals import GLOBALS diff --git a/learning_loop_node/detector/tests/test_client_communication.py b/learning_loop_node/detector/tests/test_client_communication.py index 97e3f074..24fbd095 100644 --- a/learning_loop_node/detector/tests/test_client_communication.py +++ b/learning_loop_node/detector/tests/test_client_communication.py @@ -2,7 +2,7 @@ import json import pytest -import requests # type: ignore +import requests from learning_loop_node import DetectorNode from learning_loop_node.data_classes import ModelInformation @@ -101,4 +101,4 @@ async def test_about_endpoint(test_detector_node: DetectorNode): assert response_dict['operation_mode'] == 'idle' assert response_dict['state'] == 'online' assert response_dict['target_model'] == '1.1' - assert any([c.name == 'purple point' for c in model_information.categories]) + assert any(c.name == 'purple point' for c in model_information.categories) diff --git a/learning_loop_node/detector/tests/test_outbox.py b/learning_loop_node/detector/tests/test_outbox.py index 9db7dd09..adf56744 100644 --- a/learning_loop_node/detector/tests/test_outbox.py +++ b/learning_loop_node/detector/tests/test_outbox.py @@ -9,6 +9,8 @@ from learning_loop_node.detector.detector_node import DetectorNode from 
learning_loop_node.detector.outbox import Outbox +# pylint: disable=redefined-outer-name + @pytest.fixture() def test_outbox(): diff --git a/learning_loop_node/globals.py b/learning_loop_node/globals.py index eee9511a..336df3fa 100644 --- a/learning_loop_node/globals.py +++ b/learning_loop_node/globals.py @@ -1,8 +1,8 @@ class Globals(): - def __init__(self): + def __init__(self) -> None: self.data_folder: str = '/data' - self.detector_port: int = 5004 # TODO move to tests + self.detector_port: int = 5004 # NOTE used for tests GLOBALS = Globals() diff --git a/learning_loop_node/helpers/gdrive_downloader.py b/learning_loop_node/helpers/gdrive_downloader.py index 8e5b3120..deefed68 100755 --- a/learning_loop_node/helpers/gdrive_downloader.py +++ b/learning_loop_node/helpers/gdrive_downloader.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -import requests +import requests # type: ignore # https://stackoverflow.com/a/39225272/4082686 diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py index 1f2e297d..aea20e60 100644 --- a/learning_loop_node/helpers/misc.py +++ b/learning_loop_node/helpers/misc.py @@ -55,7 +55,7 @@ def _handle_task_result(task: asyncio.Task, logger.exception(message, *message_args) -def get_free_memory_mb() -> float: # TODO check if this is used +def get_free_memory_mb() -> float: # NOTE used by yolov5 pynvml.nvmlInit() h = pynvml.nvmlDeviceGetHandleByIndex(0) info = pynvml.nvmlDeviceGetMemoryInfo(h) @@ -76,7 +76,6 @@ async def is_valid_image(filename: str, check_jpeg: bool) -> bool: return "OK" in out.decode() -@staticmethod async def delete_corrupt_images(image_folder: str, check_jpeg: bool = False) -> None: logging.info('deleting corrupt images') n_deleted = 0 @@ -90,15 +89,7 @@ async def delete_corrupt_images(image_folder: str, check_jpeg: bool = False) -> def create_resource_paths(organization_name: str, project_name: str, image_ids: List[str]) -> Tuple[List[str], List[str]]: - # TODO: experimental: return 
[f'/{organization_name}/projects/{project_name}/images/{id}/main' for id in image_ids], image_ids - # if not image_ids: - # return [], [] - # url_ids: List[Tuple(str, str)] = [(f'/{organization_name}/projects/{project_name}/images/{id}/main', id) - # for id in image_ids] - # urls, ids = list(map(list, zip(*url_ids))) - - # return urls, ids def create_image_folder(project_folder: str) -> str: @@ -141,17 +132,17 @@ async def wrapper_ensure_socket_response(*args, **kwargs): if isinstance(value, str): return asdict(SocketResponse.for_success(value)) - elif isinstance(value, bool): + if isinstance(value, bool): return asdict(SocketResponse.from_bool(value)) - elif isinstance(value, SocketResponse): + if isinstance(value, SocketResponse): return value - elif (args[0] in ['connect', 'disconnect', 'connect_error']): + if (args[0] in ['connect', 'disconnect', 'connect_error']): return value - elif value is None: + if value is None: return None - else: - raise Exception( - f"Return type for sio must be str, bool, SocketResponse or None', but was {type(value)}'") + + raise Exception( + f"Return type for sio must be str, bool, SocketResponse or None', but was {type(value)}'") except Exception as e: logging.exception(f'An error occured for {args[0]}') @@ -161,6 +152,8 @@ async def wrapper_ensure_socket_response(*args, **kwargs): def is_valid_uuid4(val): + if not val: + return False try: _ = UUID(str(val)).version return True @@ -189,7 +182,6 @@ def activate_asyncio_warnings() -> None: logging.exception('could not activate asyncio warnings. 
Exception:') -@staticmethod def images_for_ids(image_ids, image_folder) -> List[str]: logging.info(f'### Going to get images for {len(image_ids)} images ids') start = perf_counter() @@ -200,7 +192,6 @@ def images_for_ids(image_ids, image_folder) -> List[str]: return images -@staticmethod def generate_training(project_folder: str, context: Context) -> Training: training_uuid = str(uuid4()) return Training( @@ -212,7 +203,6 @@ def generate_training(project_folder: str, context: Context) -> Training: ) -@staticmethod def delete_all_training_folders(project_folder: str): if not os.path.exists(f'{project_folder}/trainings'): return @@ -220,7 +210,6 @@ def delete_all_training_folders(project_folder: str): shutil.rmtree(f'{project_folder}/trainings/{uuid}', ignore_errors=True) -@staticmethod def create_training_folder(project_folder: str, trainings_id: str) -> str: training_folder = f'{project_folder}/trainings/{trainings_id}' os.makedirs(training_folder, exist_ok=True) diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index 75c57189..901532fd 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -24,6 +24,7 @@ def __init__(self) -> None: self.project: str = environment_reader.project() # used by mock_detector self.base_url: str = f'http{"s" if "learning-loop.ai" in host else ""}://' + host self.async_client: httpx.AsyncClient = httpx.AsyncClient(base_url=self.base_url, timeout=Timeout(60.0)) + self.async_client.cookies.clear() logging.info(f'Loop interface initialized with base_url: {self.base_url} / user: {self.username}') @@ -80,8 +81,23 @@ async def put(self, path, files: Optional[List[str]] = None, requires_login=True if files is None: return await self.async_client.put(api_prefix+path, **kwargs) - file_list = [('files', open(f, 'rb')) for f in files] # TODO: does this properly close the files after upload? 
- return await self.async_client.put(api_prefix+path, files=file_list) + file_handles = [] + for f in files: + try: + file_handles.append(open(f, 'rb')) + except FileNotFoundError: + for fh in file_handles: + fh.close() # Ensure all files are closed + return httpx.Response(404, content=b'File not found') + + try: + file_list = [('files', fh) for fh in file_handles] # Use file handles + response = await self.async_client.put(api_prefix+path, files=file_list) + finally: + for fh in file_handles: + fh.close() # Ensure all files are closed + + return response async def post(self, path, requires_login=True, api_prefix='/api', **kwargs) -> httpx.Response: if requires_login: @@ -92,14 +108,3 @@ async def delete(self, path, requires_login=True, api_prefix='/api', **kwargs) - if requires_login: await self.ensure_login() return await self.async_client.delete(api_prefix+path, **kwargs) - - # --------------------------------- unused?! --------------------------------- #TODO remove? - - # def get_data(self, path): - # return asyncio.get_event_loop().run_until_complete(self._get_data_async(path)) - - # async def _get_data_async(self, path) -> bytes: - # response = await self.get(f'{self.project_path}{path}') - # if response.status_code != 200: - # raise LoopCommunicationException('bad response: ' + str(response)) - # return response.content diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py index 38742fa4..9418123e 100644 --- a/learning_loop_node/node.py +++ b/learning_loop_node/node.py @@ -62,7 +62,7 @@ def sio_client(self) -> AsyncClient: # --------------------------------------------------- APPLICATION LIFECYCLE --------------------------------------------------- @asynccontextmanager - async def lifespan(self, app: FastAPI): + async def lifespan(self, app: FastAPI): # pylint: disable=unused-argument try: await self._on_startup() self.repeat_task = asyncio.create_task(self.repeat_loop()) @@ -78,7 +78,7 @@ async def lifespan(self, app: FastAPI): async def 
_on_startup(self): self.log.info('received "startup" lifecycle-event') - activate_asyncio_warnings() + # activate_asyncio_warnings() if self.needs_login: await self.loop_communicator.backend_ready() self.log.info('ensuring login') diff --git a/learning_loop_node/py.typed b/learning_loop_node/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py index b661c818..1842f71e 100644 --- a/learning_loop_node/tests/test_executor.py +++ b/learning_loop_node/tests/test_executor.py @@ -21,26 +21,28 @@ def cleanup(): cleanup_process.communicate() -def test_executor_lifecycle(): +@pytest.mark.asyncio +async def test_executor_lifecycle(): assert_process_is_running('some_executable.sh', False) - executor = Executor('/tmp/test_executor/' + str(uuid4())) - cmd = executor.path + '/some_executable.sh' - with open(cmd, 'w') as f: - f.write('while true; do echo "some output"; sleep 1; done') - os.chmod(cmd, 0o755) + executor = Executor('/tmp/test_executor/' + str(uuid4())+'/') + cmd = 'bash some_executable.sh' + executable_path = executor.path+'some_executable.sh' + with open(executable_path, 'w') as f: + f.write('/bin/bash -c "while true; do sleep 1; echo some output; done"') + os.chmod(executable_path, 0o755) - executor.start(cmd) + await executor.start(cmd) - assert executor.is_process_running() + assert executor.is_running() assert_process_is_running('some_executable.sh') sleep(1) assert 'some output' in executor.get_log() - executor.stop() + await executor.stop_and_wait() - assert not executor.is_process_running() + assert not executor.is_running() sleep(1) assert_process_is_running('some_executable.sh', False) @@ -48,6 +50,7 @@ def test_executor_lifecycle(): def assert_process_is_running(process_name, running=True): if running: for process in psutil.process_iter(): + print(process.name(), process.cmdline()) process_name_match = process_name in process.name() process_cmd_match = 
process_name in str(process.cmdline()) if process_name_match or process_cmd_match: diff --git a/learning_loop_node/tests/test_helper.py b/learning_loop_node/tests/test_helper.py index e802c7a0..c52037ed 100644 --- a/learning_loop_node/tests/test_helper.py +++ b/learning_loop_node/tests/test_helper.py @@ -9,7 +9,6 @@ from learning_loop_node.data_classes import Context from learning_loop_node.helpers.misc import create_image_folder, create_project_folder, create_training_folder from learning_loop_node.loop_communication import LoopCommunicator -from learning_loop_node.trainer.trainer_logic import TrainerLogic def get_files_in_folder(folder: str): diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py index c768332c..082407ad 100644 --- a/learning_loop_node/trainer/executor.py +++ b/learning_loop_node/trainer/executor.py @@ -1,105 +1,109 @@ - -import ctypes +import asyncio import logging import os -import signal -import subprocess -from sys import platform +import shlex +from io import BufferedWriter from typing import List, Optional -import psutil +class Executor: + def __init__(self, base_path: str, log_name='last_training.log') -> None: + """An executor that runs a command in a separate async subprocess. + The log of the process is written to 'last_training.log' in the base_path. + Tthe process is executed in the base_path directory. + The process should be awaited to finish using `wait` or stopped using `stop` to + avoid zombie processes and close the log file.""" -def create_signal_handler(sig=signal.SIGTERM): - if platform == "linux" or platform == "linux2": - # "The system will send a signal to the child once the parent exits for any reason (even sigkill)." 
- # https://stackoverflow.com/a/19448096 - libc = ctypes.CDLL("libc.so.6") + self.path = base_path + self.log_file_path = f'{self.path}/{log_name}' + self.log_file: None | BufferedWriter = None + self._process: Optional[asyncio.subprocess.Process] = None # pylint: disable=no-member + os.makedirs(self.path, exist_ok=True) - def callable_(): - os.setsid() - return libc.prctl(1, sig) + def _get_running_process(self) -> Optional[asyncio.subprocess.Process]: # pylint: disable=no-member + """Get the running process if available.""" + if self._process is not None and self._process.returncode is None: + return self._process + return None - return callable_ - return os.setsid + async def start(self, cmd: str, env: Optional[dict[str, str]] = None) -> None: + """Start the process with the given command and environment variables.""" + full_env = os.environ.copy() + if env is not None: + full_env.update(env) -class Executor: - def __init__(self, base_path: str) -> None: - self.path = base_path - os.makedirs(self.path, exist_ok=True) - self.process: Optional[subprocess.Popen[bytes]] = None - - def start(self, cmd: str): - with open(f'{self.path}/last_training.log', 'a') as f: - f.write(f'\nStarting executor with command: {cmd}\n') - # pylint: disable=subprocess-popen-preexec-fn - self.process = subprocess.Popen( - f'cd {self.path}; {cmd} >> last_training.log 2>&1', - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - executable='/bin/bash', - preexec_fn=create_signal_handler(), - ) + logging.info(f'Starting executor with command: {cmd} in {self.path} - logging to {self.log_file_path}') + self.log_file = open(self.log_file_path, 'ab') - def is_process_running(self): - if self.process is None: - return False + self._process = await asyncio.create_subprocess_exec( + *shlex.split(cmd), + cwd=self.path, + stdout=self.log_file, + stderr=asyncio.subprocess.STDOUT, # Merge stderr with stdout + env=full_env + ) - if self.process.poll() is not None: - return False + def 
is_running(self) -> bool: + """Check if the process is still running.""" + return self._process is not None and self._process.returncode is None - try: - psutil.Process(self.process.pid) - except psutil.NoSuchProcess: - # self.process.terminate() # TODO does this make sense? - # self.process = None - return False + def terminate(self) -> None: + """Terminate the process.""" - return True + if process := self._get_running_process(): + try: + process.terminate() + return + except ProcessLookupError: + logging.error('No process to terminate') + self._process = None - def get_log(self) -> str: - try: - with open(f'{self.path}/last_training.log') as f: - return f.read() - except Exception: - return '' + async def wait(self) -> Optional[int]: + """Wait for the process to finish. Returns the return code of the process or None if no process is running.""" - def get_log_by_lines(self, since_last_start=False) -> List[str]: # TODO do not read whole log again - try: - with open(f'{self.path}/last_training.log') as f: - lines = f.readlines() - if since_last_start: - lines_since_last_start = [] - for line in reversed(lines): - lines_since_last_start.append(line) - if line.startswith('Starting executor with command:'): - break - return list(reversed(lines_since_last_start)) - return lines - except Exception: - return [] + if not self._process: + logging.info('No process to wait for') + return None - def stop(self): - if self.process is None: - logging.info('no process running ... nothing to stop') - return + return_code = await self._process.wait() - logging.info('terminating process') + self.close_log() + self._process = None - try: - os.killpg(os.getpgid(self.process.pid), signal.SIGTERM) - except ProcessLookupError: - pass + return return_code - self.process.terminate() - _, _ = self.process.communicate(timeout=3) + async def stop_and_wait(self) -> Optional[int]: + """Terminate the process and wait for it to finish. 
Returns the return code of the process.""" - @property - def return_code(self): - if not self.process: - return None - if self.is_process_running(): + if not self.is_running(): + logging.info('No process to stop') return None - return self.process.poll() + + self.terminate() + return await self.wait() + + # -------------------------------------------------------------------------------------------- LOGGING + + def get_log(self) -> str: + """Get the log of the process as a string.""" + if not os.path.exists(self.log_file_path): + return '' + with open(self.log_file_path, 'r') as f: + return f.read() + + def get_log_by_lines(self, tail: Optional[int] = None) -> List[str]: + """Get the log of the process as a list of lines.""" + if not os.path.exists(self.log_file_path): + return [] + with open(self.log_file_path) as f: + lines = f.readlines() + if tail is not None: + lines = lines[-tail:] + return lines + + def close_log(self): + """Close the log file.""" + if self.log_file is not None: + self.log_file.close() + self.log_file = None diff --git a/learning_loop_node/trainer/io_helpers.py b/learning_loop_node/trainer/io_helpers.py index 6ec7a5c3..4849d67a 100644 --- a/learning_loop_node/trainer/io_helpers.py +++ b/learning_loop_node/trainer/io_helpers.py @@ -14,6 +14,16 @@ from ..loop_communication import LoopCommunicator +class EnvironmentVars: + def __init__(self) -> None: + self.restart_after_training = os.environ.get( + 'RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1'] + self.keep_old_trainings = os.environ.get( + 'KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1'] + self.inference_batch_size = int( + os.environ.get('INFERENCE_BATCH_SIZE', '10')) + + class LastTrainingIO: def __init__(self, node_uuid: str) -> None: @@ -137,7 +147,8 @@ async def upload_detetions(self): num_files = self.get_number_of_detection_files() print(f'num_files: {num_files}', flush=True) if not num_files: - raise Exception('no detection files found') + logging.error('no detection 
files found') + return current_json_file_index = self.load_detections_upload_file_index() for i in range(current_json_file_index, num_files): detections = self.load_detections(i) @@ -164,9 +175,9 @@ async def _upload_detections(self, context: Context, batch_detections: List[Dete msg = f'could not upload detections. {str(response)}' logging.error(msg) raise Exception(msg) + + logging.info('successfully uploaded detections') + if up_progress > len(batch_detections): + self.save_detection_upload_progress(0) else: - logging.info('successfully uploaded detections') - if up_progress > len(batch_detections): - self.save_detection_upload_progress(0) - else: - self.save_detection_upload_progress(up_progress) + self.save_detection_upload_progress(up_progress) diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py index a796fc4d..e2dafc26 100644 --- a/learning_loop_node/trainer/rest/backdoor_controls.py +++ b/learning_loop_node/trainer/rest/backdoor_controls.py @@ -5,7 +5,6 @@ from dataclasses import asdict from typing import TYPE_CHECKING, Dict -from dacite import from_dict from fastapi import APIRouter, HTTPException, Request from ...data_classes import ErrorConfiguration, NodeState @@ -98,7 +97,7 @@ async def add_steps(request: Request): assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic' - if not trainer_logic._executor or not trainer_logic._executor.is_process_running(): # pylint: disable=protected-access + if not trainer_logic._executor or not trainer_logic._executor.is_running(): # pylint: disable=protected-access training = trainer_logic._training # pylint: disable=protected-access logging.error(f'cannot add steps when not running, state: {training.training_state if training else "None"}') raise HTTPException(status_code=409, detail="trainer is not running") @@ -127,9 +126,9 @@ async def kill_process(request: Request): trainer_node = trainer_node_from_request(request) 
trainer_logic = trainer_node.trainer_logic assert isinstance(trainer_logic, TrainerLogic), 'trainer_logic is not TrainerLogic' - if not trainer_logic._executor or not trainer_logic._executor.is_process_running(): + if not trainer_logic._executor or not trainer_logic._executor.is_running(): raise HTTPException(status_code=409, detail="trainer is not running") - trainer_logic._executor.stop() + await trainer_logic._executor.stop_and_wait() @router.post("/force_status_update") diff --git a/learning_loop_node/trainer/rest/controls.py b/learning_loop_node/trainer/rest/controls.py index b8fbbec8..6c92d9a8 100644 --- a/learning_loop_node/trainer/rest/controls.py +++ b/learning_loop_node/trainer/rest/controls.py @@ -7,6 +7,8 @@ router = APIRouter() +# pylint: disable=protected-access + @router.post("/controls/detect/{organization}/{project}/{version}") async def operation_mode(organization: str, project: str, version: str, request: Request): diff --git a/learning_loop_node/trainer/tests/conftest.py b/learning_loop_node/trainer/tests/conftest.py index 75937920..aca1919c 100644 --- a/learning_loop_node/trainer/tests/conftest.py +++ b/learning_loop_node/trainer/tests/conftest.py @@ -10,6 +10,8 @@ from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_node import TrainerNode +# pylint: disable=protected-access + logging.basicConfig(level=logging.INFO) # show ouptut from uvicorn server https://stackoverflow.com/a/66132186/364388 log_to_stderr(logging.INFO) @@ -24,16 +26,14 @@ async def test_initialized_trainer_node(): trainer = TestingTrainerLogic() node = TrainerNode(name='test', trainer_logic=trainer, uuid='NOD30000-0000-0000-0000-000000000000') - trainer._node = node # pylint: disable=protected-access - trainer.init_new_training(context=Context(organization='zauberzeug', project='demo'), - details={'categories': [], - 'id': '917d5c7f-403d-7e92-f95f-577f79c2273a', # version 1.2 of demo project - 
'training_number': 0, - 'resolution': 800, - 'flip_rl': False, - 'flip_ud': False}) - - # pylint: disable=protected-access + trainer._node = node + trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'), + details={'categories': [], + 'id': '917d5c7f-403d-7e92-f95f-577f79c2273a', # version 1.2 of demo project + 'training_number': 0, + 'resolution': 800, + 'flip_rl': False, + 'flip_ud': False}) await node._on_startup() yield node await node._on_shutdown() @@ -44,19 +44,17 @@ async def test_initialized_trainer(): trainer = TestingTrainerLogic() node = TrainerNode(name='test', trainer_logic=trainer, uuid='NODE-000-0000-0000-0000-000000000000') - # pylint: disable=protected-access - await node._on_startup() - trainer._node = node # pylint: disable=protected-access - trainer.init_new_training(context=Context(organization='zauberzeug', project='demo'), - details={'categories': [], - 'id': '917d5c7f-403d-7e92-f95f-577f79c2273a', # version 1.2 of demo project - 'training_number': 0, - 'resolution': 800, - 'flip_rl': False, - 'flip_ud': False}) + await node._on_startup() + trainer._node = node + trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'), + details={'categories': [], + 'id': '917d5c7f-403d-7e92-f95f-577f79c2273a', # version 1.2 of demo project + 'training_number': 0, + 'resolution': 800, + 'flip_rl': False, + 'flip_ud': False}) yield trainer - # await node._on_shutdown() try: await node._on_shutdown() except Exception: @@ -66,10 +64,3 @@ async def test_initialized_trainer(): def is_port_in_use(port): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: return s.connect_ex(('localhost', port)) == 0 - - -# @pytest.fixture(autouse=True, scope='session') -# def initialize_active_training(): -# from learning_loop_node.trainer import active_training_module -# active_training_module.init('00000000-0000-0000-0000-000000000000') -# yield diff --git a/learning_loop_node/trainer/tests/state_helper.py 
b/learning_loop_node/trainer/tests/state_helper.py index a5b982ec..01c9001d 100644 --- a/learning_loop_node/trainer/tests/state_helper.py +++ b/learning_loop_node/trainer/tests/state_helper.py @@ -7,7 +7,7 @@ def create_active_training_file(trainer: TrainerLogic, **kwargs) -> None: update_attributes(trainer._training, **kwargs) # pylint: disable=protected-access - trainer.node.last_training_io.save(training=trainer.active_training) + trainer.node.last_training_io.save(training=trainer.training) async def assert_training_state(training: Training, state: str, timeout: float, interval: float) -> None: diff --git a/learning_loop_node/trainer/tests/states/test_state_cleanup.py b/learning_loop_node/trainer/tests/states/test_state_cleanup.py index 3326d156..f3911a54 100644 --- a/learning_loop_node/trainer/tests/states/test_state_cleanup.py +++ b/learning_loop_node/trainer/tests/states/test_state_cleanup.py @@ -1,11 +1,13 @@ from learning_loop_node.trainer.tests.state_helper import create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic +# pylint: disable=protected-access + async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state='ready_for_cleanup') - trainer.init_from_last_training() + trainer._init_from_last_training() trainer.active_training_io.save_detections(detections=[]) trainer.active_training_io.save_detection_upload_progress(count=42) @@ -16,9 +18,9 @@ async def test_cleanup_successfull(test_initialized_trainer: TestingTrainerLogic assert trainer.active_training_io.detection_upload_progress_exist() is True assert trainer.active_training_io.detections_upload_file_index_exists() is True - await trainer.clear_training() + await trainer._clear_training() - assert trainer._training is None # pylint: disable=protected-access + assert trainer._training is None assert 
trainer.node.last_training_io.exists() is False assert trainer.active_training_io.detections_exist() is False assert trainer.active_training_io.detection_upload_progress_exist() is False diff --git a/learning_loop_node/trainer/tests/states/test_state_detecting.py b/learning_loop_node/trainer/tests/states/test_state_detecting.py index fbb8e9c0..5492f8dc 100644 --- a/learning_loop_node/trainer/tests/states/test_state_detecting.py +++ b/learning_loop_node/trainer/tests/states/test_state_detecting.py @@ -6,6 +6,7 @@ from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic +# pylint: disable=protected-access error_key = 'detecting' @@ -13,38 +14,36 @@ def trainer_has_error(trainer: TrainerLogic): return trainer.errors.has_error_for(error_key) -async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic): # TODO Flaky test +async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic): # NOTE was a flaky test trainer = test_initialized_trainer create_active_training_file(trainer, training_state='train_model_uploaded', - model_id_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a') + model_uuid_for_detecting='917d5c7f-403d-7e92-f95f-577f79c2273a') # trainer.load_active_training() _ = asyncio.get_running_loop().create_task( - trainer.perform_state('do_detections', TrainerState.Detecting, - TrainerState.Detected, trainer._do_detections) - ) + trainer._perform_state('do_detections', TrainerState.Detecting, TrainerState.Detected, trainer._do_detections)) - await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=10, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detecting, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detected, 
timeout=10, interval=0.001) assert trainer_has_error(trainer) is False - assert trainer.active_training.training_state == TrainerState.Detected - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.Detected + assert trainer.node.last_training_io.load() == trainer.training assert trainer.active_training_io.detections_exist() async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded) - trainer.init_from_last_training() - trainer.active_training.model_id_for_detecting = '12345678-bobo-7e92-f95f-424242424242' + trainer._init_from_last_training() + trainer.training.model_uuid_for_detecting = '12345678-bobo-7e92-f95f-424242424242' - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.Detecting, timeout=5, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detecting, timeout=5, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) - assert trainer._training is None # pylint: disable=protected-access + assert trainer._training is None assert trainer.active_training_io.detections_exist() is False assert trainer.node.last_training_io.exists() is False @@ -52,25 +51,25 @@ async def test_detecting_can_be_aborted(test_initialized_trainer: TestingTrainer async def test_model_not_downloadable_error(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelUploaded, - model_id_for_detecting='00000000-0000-0000-0000-000000000000') # bad model id - trainer.init_from_last_training() + model_uuid_for_detecting='00000000-0000-0000-0000-000000000000') # bad model id + trainer._init_from_last_training() 
- _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, 'detecting', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, 'train_model_uploaded', timeout=1, interval=0.001) + await assert_training_state(trainer.training, 'detecting', timeout=1, interval=0.001) + await assert_training_state(trainer.training, 'train_model_uploaded', timeout=1, interval=0.001) await asyncio.sleep(0.1) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.TrainModelUploaded - assert trainer.active_training.model_id_for_detecting == '00000000-0000-0000-0000-000000000000' - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainModelUploaded + assert trainer.training.model_uuid_for_detecting == '00000000-0000-0000-0000-000000000000' + assert trainer.node.last_training_io.load() == trainer.training def test_save_load_detections(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer) - trainer.init_from_last_training() + trainer._init_from_last_training() detections = [get_dummy_detections(), get_dummy_detections()] diff --git a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py index 12e9b745..282a2288 100644 --- a/learning_loop_node/trainer/tests/states/test_state_download_train_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_download_train_model.py @@ -6,37 +6,39 @@ from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic +# pylint: disable=protected-access + async def 
test_downloading_is_successful(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.DataDownloaded) trainer.model_format = 'mocked' - trainer.init_from_last_training() + trainer._init_from_last_training() asyncio.get_running_loop().create_task( - trainer.perform_state('download_model', - TrainerState.TrainModelDownloading, - TrainerState.TrainModelDownloaded, trainer._download_model)) - await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, 'train_model_downloaded', timeout=1, interval=0.001) + trainer._perform_state('download_model', + TrainerState.TrainModelDownloading, + TrainerState.TrainModelDownloaded, trainer._download_model)) + await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001) + await assert_training_state(trainer.training, 'train_model_downloaded', timeout=1, interval=0.001) - assert trainer.active_training.training_state == TrainerState.TrainModelDownloaded - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainModelDownloaded + assert trainer.node.last_training_io.load() == trainer.training # file on disk - assert os.path.exists(f'{trainer.active_training.training_folder}/base_model.json') - assert os.path.exists(f'{trainer.active_training.training_folder}/file_1.txt') - assert os.path.exists(f'{trainer.active_training.training_folder}/file_2.txt') + assert os.path.exists(f'{trainer.training.training_folder}/base_model.json') + assert os.path.exists(f'{trainer.training.training_folder}/file_1.txt') + assert os.path.exists(f'{trainer.training.training_folder}/file_2.txt') async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, 
training_state='data_downloaded') - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -48,14 +50,14 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogi async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.DataDownloaded, - base_model_id='00000000-0000-0000-0000-000000000000') # bad model id) - trainer.init_from_last_training() + base_model_uuid_or_name='00000000-0000-0000-0000-000000000000') # bad model id) + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, 'train_model_downloading', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.DataDownloaded, timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, 'train_model_downloading', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.DataDownloaded, timeout=1, interval=0.001) assert trainer.errors.has_error_for('download_model') assert trainer._training is not None # pylint: disable=protected-access - assert trainer.active_training.training_state == TrainerState.DataDownloaded - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.DataDownloaded + assert trainer.node.last_training_io.load() == trainer.training diff --git 
a/learning_loop_node/trainer/tests/states/test_state_prepare.py b/learning_loop_node/trainer/tests/states/test_state_prepare.py index 8c490c92..d3222f9a 100644 --- a/learning_loop_node/trainer/tests/states/test_state_prepare.py +++ b/learning_loop_node/trainer/tests/states/test_state_prepare.py @@ -5,6 +5,7 @@ from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic +# pylint: disable=protected-access error_key = 'prepare' @@ -15,22 +16,22 @@ def trainer_has_error(trainer: TrainerLogic): async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer) - trainer.init_from_last_training() + trainer._init_from_last_training() - await trainer.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare) + await trainer._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare) assert trainer_has_error(trainer) is False - assert trainer.active_training.training_state == TrainerState.DataDownloaded - assert trainer.active_training.data is not None - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.DataDownloaded + assert trainer.training.data is not None + assert trainer.node.last_training_io.load() == trainer.training async def test_abort_preparing(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, 
TrainerState.DataDownloading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -43,13 +44,13 @@ async def test_request_error(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, context=Context( organization='zauberzeug', project='some_bad_project')) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, TrainerState.DataDownloading, timeout=3, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.Initialized, timeout=3, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, TrainerState.DataDownloading, timeout=3, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Initialized, timeout=3, interval=0.001) assert trainer_has_error(trainer) assert trainer._training is not None # pylint: disable=protected-access - assert trainer.active_training.training_state == TrainerState.Initialized - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.Initialized + assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py index cc145233..6a292be5 100644 --- a/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py +++ b/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py @@ -10,6 +10,8 @@ from ..state_helper import assert_training_state, create_active_training_file from ..testing_trainer_logic import TestingTrainerLogic +# pylint: disable=protected-access + error_key = 'sync_confusion_matrix' @@ -23,14 +25,14 @@ async def 
test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic): # TODO this requires trainer to have _training # trainer.load_active_training() create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) assert trainer_has_error(trainer) is False - assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced + assert trainer.node.last_training_io.load() == trainer.training async def test_unsynced_model_available__sync_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture): @@ -40,15 +42,15 @@ async def test_unsynced_model_available__sync_successful(test_initialized_traine await mock_socket_io_call(mocker, test_initialized_trainer_node, {'success': True}) create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) - trainer.init_from_last_training() + trainer._init_from_last_training() trainer.has_new_model = True - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=1, interval=0.001) assert trainer_has_error(trainer) is False # assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced - assert 
trainer.node.last_training_io.load() == trainer.active_training + assert trainer.node.last_training_io.load() == trainer.training async def test_unsynced_model_available__sio_not_connected(test_initialized_trainer_node: TrainerNode): @@ -60,14 +62,14 @@ async def test_unsynced_model_available__sio_not_connected(test_initialized_trai assert test_initialized_trainer_node.sio_client.connected is False trainer.has_new_model = True - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.training async def test_unsynced_model_available__request_is_not_successful(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture): @@ -79,14 +81,14 @@ async def test_unsynced_model_available__request_is_not_successful(test_initiali create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) trainer.has_new_model = True - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, 'confusion_matrix_syncing', timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, 
interval=0.001) + await assert_training_state(trainer.training, 'confusion_matrix_syncing', timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.training async def test_basic_mock(test_initialized_trainer_node: TrainerNode, mocker: MockerFixture): diff --git a/learning_loop_node/trainer/tests/states/test_state_train.py b/learning_loop_node/trainer/tests/states/test_state_train.py index 46a7f953..4e1d200c 100644 --- a/learning_loop_node/trainer/tests/states/test_state_train.py +++ b/learning_loop_node/trainer/tests/states/test_state_train.py @@ -5,47 +5,46 @@ from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic +# pylint: disable=protected-access + async def test_successful_training(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) - await asyncio.sleep(0.1) # give tests a bit time to to check for the state + await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.01) 
assert trainer.start_training_task is not None - assert trainer.start_training_task.__name__ == 'start_training' - # pylint: disable=protected-access assert trainer._executor is not None - trainer._executor.stop() # NOTE normally a training terminates itself - await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await trainer.stop() # NOTE normally a training terminates itself + await assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.active_training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.training async def test_stop_running_training(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) # pylint: disable=protected-access - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.01) assert trainer.start_training_task is not None - assert trainer.start_training_task.__name__ == 'start_training' await trainer.stop() - await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await 
assert_training_state(trainer.training, TrainerState.TrainingFinished, timeout=2, interval=0.01) - assert trainer.active_training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.training async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrainerLogic): @@ -53,20 +52,18 @@ async def test_training_can_maybe_resumed(test_initialized_trainer: TestingTrain # NOTE e.g. when a node-computer is restarted create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) - trainer.init_from_last_training() - trainer._can_resume = True # pylint: disable=protected-access + trainer._init_from_last_training() + trainer._can_resume_flag = True - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await condition(lambda: trainer._executor and trainer._executor.is_process_running(), timeout=1, interval=0.01) # pylint: disable=protected-access - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await condition(lambda: trainer._executor and trainer._executor.is_running(), timeout=1, interval=0.01) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) assert trainer.start_training_task is not None - assert trainer.start_training_task.__name__ == 'resume' - # pylint: disable=protected-access assert trainer._executor is not None - trainer._executor.stop() # NOTE normally a training terminates itself e.g - await assert_training_state(trainer.active_training, TrainerState.TrainingFinished, timeout=1, interval=0.001) + await trainer._executor.stop_and_wait() # NOTE normally a training terminates itself e.g + await assert_training_state(trainer.training, 
TrainerState.TrainingFinished, timeout=1, interval=0.001) - assert trainer.active_training.training_state == TrainerState.TrainingFinished - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainingFinished + assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py index 757cf968..e2784514 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_detections.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_detections.py @@ -10,6 +10,7 @@ from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic +# pylint: disable=protected-access error_key = 'upload_detections' @@ -44,14 +45,14 @@ async def create_valid_detection_file(trainer: TrainerLogic, number_of_entries: async def test_upload_successful(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.Detected) - trainer.init_from_last_training() + trainer._init_from_last_training() await create_valid_detection_file(trainer) await asyncio.get_running_loop().create_task( - trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions)) + trainer._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions)) - assert trainer.active_training.training_state == TrainerState.ReadyForCleanup - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.ReadyForCleanup + assert trainer.node.last_training_io.load() == trainer.training 
@pytest.mark.asyncio @@ -59,14 +60,14 @@ async def test_detection_upload_progress_is_stored(test_initialized_trainer: Tes trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.Detected) - trainer.init_from_last_training() + trainer._init_from_last_training() await create_valid_detection_file(trainer) assert trainer.active_training_io.load_detections_upload_file_index() == 0 # await trainer.upload_detections() await asyncio.get_running_loop().create_task( - trainer.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions)) + trainer._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, trainer.active_training_io.upload_detetions)) assert trainer.active_training_io.load_detection_upload_progress() == 0 # Progress is reset for every file assert trainer.active_training_io.load_detections_upload_file_index() == 1 @@ -77,7 +78,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.Detected) - trainer.init_from_last_training() + trainer._init_from_last_training() await create_valid_detection_file(trainer, 2, 0) await create_valid_detection_file(trainer, 2, 1) @@ -91,7 +92,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test for i in range(skip_detections, len(detections), batch_size): batch_detections = detections[i:i+batch_size] # pylint: disable=protected-access - await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size) + await trainer.active_training_io._upload_detections(trainer.training.context, batch_detections, i + batch_size) expected_value = i + batch_size if i + batch_size < len(detections) else 0 # Progress is reset for every file assert 
trainer.active_training_io.load_detection_upload_progress() == expected_value @@ -107,7 +108,7 @@ async def test_ensure_all_detections_are_uploaded(test_initialized_trainer: Test for i in range(skip_detections, len(detections), batch_size): batch_detections = detections[i:i+batch_size] # pylint: disable=protected-access - await trainer.active_training_io._upload_detections(trainer.active_training.context, batch_detections, i + batch_size) + await trainer.active_training_io._upload_detections(trainer.training.context, batch_detections, i + batch_size) expected_value = i + batch_size if i + batch_size < len(detections) else 0 # Progress is reset for every file assert trainer.active_training_io.load_detection_upload_progress() == expected_value @@ -120,44 +121,41 @@ async def test_bad_status_from_LearningLoop(test_initialized_trainer: TestingTra create_active_training_file(trainer, training_state=TrainerState.Detected, context=Context( organization='zauberzeug', project='some_bad_project')) - trainer.init_from_last_training() + trainer._init_from_last_training() trainer.active_training_io.save_detections([get_dummy_detections()]) - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001) + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.Detected, timeout=1, interval=0.001) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.Detected - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.Detected + assert trainer.node.last_training_io.load() == 
trainer.training -async def test_other_errors(test_initialized_trainer: TestingTrainerLogic): +async def test_go_to_cleanup_if_no_detections_exist(test_initialized_trainer: TestingTrainerLogic): + """This test simulates a situation where the detection file is missing. + In this case, the trainer should report an error and move to the ReadyForCleanup state.""" trainer = test_initialized_trainer # e.g. missing detection file create_active_training_file(trainer, training_state=TrainerState.Detected) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) - await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001) - await assert_training_state(trainer.active_training, TrainerState.Detected, timeout=1, interval=0.001) - - assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.Detected - assert trainer.node.last_training_io.load() == trainer.active_training + _ = asyncio.get_running_loop().create_task(trainer._run()) + await assert_training_state(trainer.training, TrainerState.ReadyForCleanup, timeout=1, interval=0.001) async def test_abort_uploading(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.Detected) - trainer.init_from_last_training() + trainer._init_from_last_training() await create_valid_detection_file(trainer) - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.DetectionUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.DetectionUploading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) diff --git a/learning_loop_node/trainer/tests/states/test_state_upload_model.py 
b/learning_loop_node/trainer/tests/states/test_state_upload_model.py index 9faa656f..b2bfa4c7 100644 --- a/learning_loop_node/trainer/tests/states/test_state_upload_model.py +++ b/learning_loop_node/trainer/tests/states/test_state_upload_model.py @@ -7,6 +7,7 @@ from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic from learning_loop_node.trainer.trainer_logic import TrainerLogic +# pylint: disable=protected-access error_key = 'upload_model' @@ -19,29 +20,29 @@ async def test_successful_upload(mocker: MockerFixture, test_initialized_trainer mock_upload_model_for_training(mocker, 'new_model_id') create_active_training_file(trainer) - trainer.init_from_last_training() + trainer._init_from_last_training() train_task = asyncio.get_running_loop().create_task( - trainer.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, trainer._upload_model)) + trainer._perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, trainer._upload_model)) - await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) await train_task assert trainer_has_error(trainer) is False - assert trainer.active_training.training_state == TrainerState.TrainModelUploaded - assert trainer.active_training.model_id_for_detecting is not None - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.TrainModelUploaded + assert trainer.training.model_uuid_for_detecting is not None + assert trainer.node.last_training_io.load() == trainer.training async def test_abort_upload_model(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced) - 
trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) await trainer.stop() await asyncio.sleep(0.1) @@ -57,18 +58,18 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced) - trainer.init_from_last_training() + trainer._init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001) # TODO goes to finished because of the error - await assert_training_state(trainer.active_training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001) + await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001) assert trainer_has_error(trainer) - assert trainer.active_training.training_state == TrainerState.ConfusionMatrixSynced - assert trainer.active_training.model_id_for_detecting is None - assert trainer.node.last_training_io.load() == trainer.active_training + assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced + assert trainer.training.model_uuid_for_detecting is None + assert trainer.node.last_training_io.load() == trainer.training async def test_mock_loop_response_example(mocker: MockerFixture, test_initialized_trainer: TestingTrainerLogic): @@ -77,7 +78,7 @@ async def 
test_mock_loop_response_example(mocker: MockerFixture, test_initialize mock_upload_model_for_training(mocker, 'new_model_id') create_active_training_file(trainer) - trainer.init_from_last_training() + trainer._init_from_last_training() # pylint: disable=protected-access result = await trainer._upload_model_return_new_model_uuid(Context(organization='zauberzeug', project='demo')) diff --git a/learning_loop_node/trainer/tests/test_errors.py b/learning_loop_node/trainer/tests/test_errors.py index 1ba85572..9a9c1cd8 100644 --- a/learning_loop_node/trainer/tests/test_errors.py +++ b/learning_loop_node/trainer/tests/test_errors.py @@ -1,40 +1,45 @@ import asyncio import re +import pytest + from learning_loop_node.data_classes import TrainerState from learning_loop_node.trainer.tests.state_helper import assert_training_state, create_active_training_file from learning_loop_node.trainer.tests.testing_trainer_logic import TestingTrainerLogic +# pylint: disable=protected-access + async def test_training_process_is_stopped_when_trainer_reports_error(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) - trainer.init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + trainer._init_from_last_training() + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) trainer.error_msg = 'some_error' - await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) +@pytest.mark.skip(reason='The since_last_start flag is deprecated.') async def 
test_log_can_provide_only_data_for_current_run(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.TrainModelDownloaded) - trainer.init_from_last_training() - _ = asyncio.get_running_loop().create_task(trainer.run()) + trainer._init_from_last_training() + _ = asyncio.get_running_loop().create_task(trainer._run()) - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) await asyncio.sleep(0.1) # give tests a bit time to to check for the state assert trainer._executor is not None assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) == 1 trainer.error_msg = 'some_error' - await assert_training_state(trainer.active_training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainModelDownloaded, timeout=6, interval=0.001) trainer.error_msg = None - await assert_training_state(trainer.active_training, TrainerState.TrainingRunning, timeout=1, interval=0.001) + await assert_training_state(trainer.training, TrainerState.TrainingRunning, timeout=1, interval=0.001) await asyncio.sleep(1) assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines()))) > 1 # Here only the current run is provided - assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines(since_last_start=True)))) == 1 + # assert len(re.findall('Starting executor', str(trainer._executor.get_log_by_lines(since_last_start=True)))) == 1 diff --git a/learning_loop_node/trainer/tests/testing_trainer_logic.py b/learning_loop_node/trainer/tests/testing_trainer_logic.py index c7faeca8..50171e08 100644 --- a/learning_loop_node/trainer/tests/testing_trainer_logic.py +++ 
b/learning_loop_node/trainer/tests/testing_trainer_logic.py @@ -1,8 +1,8 @@ import asyncio import time -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional -from learning_loop_node.data_classes import BasicModel, Context, Detections, ModelInformation, PretrainedModel +from learning_loop_node.data_classes import Context, Detections, ModelInformation, PretrainedModel, TrainingStateData from learning_loop_node.trainer.trainer_logic import TrainerLogic @@ -11,7 +11,7 @@ class TestingTrainerLogic(TrainerLogic): def __init__(self, can_resume: bool = False) -> None: super().__init__('mocked') - self._can_resume: bool = can_resume + self._can_resume_flag: bool = can_resume self.has_new_model: bool = False self.error_msg: Optional[str] = None @@ -25,25 +25,25 @@ def model_architecture(self) -> str: @property def provided_pretrained_models(self) -> List[PretrainedModel]: - return [ - PretrainedModel(name='small', label='Small', description='a small model'), - PretrainedModel(name='medium', label='Medium', description='a medium model'), - PretrainedModel(name='large', label='Large', description='a large model')] + return [PretrainedModel(name='small', label='Small', description='a small model'), + PretrainedModel(name='medium', label='Medium', description='a medium model'), + PretrainedModel(name='large', label='Large', description='a large model')] # pylint: disable=unused-argument - async def start_training(self, model: str = 'model.model') -> None: + async def _start_training_from_base_model(self, model: str = 'model.model') -> None: assert self._executor is not None - self._executor.start('while true; do sleep 1; done') + await self._executor.start('/bin/bash -c "while true; do sleep 1; done"') - async def start_training_from_scratch(self, base_model_id: str) -> None: - await self.start_training(model=f'model_{base_model_id}.pt') + async def _start_training_from_scratch(self) -> None: + assert self.training.base_model_uuid_or_name is 
not None, 'base_model_uuid_or_name must be set' + await self._start_training_from_base_model(model=f'model_{self.training.base_model_uuid_or_name}.pt') - def get_new_best_model(self) -> Optional[BasicModel]: + def _get_new_best_training_state(self) -> Optional[TrainingStateData]: if self.has_new_model: - return BasicModel(confusion_matrix={}) + return TrainingStateData(confusion_matrix={}) return None - def on_model_published(self, basic_model: BasicModel) -> None: + def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: pass async def _prepare(self) -> None: @@ -54,9 +54,9 @@ async def _download_model(self) -> None: await super()._download_model() await asyncio.sleep(0.1) # give tests a bit time to to check for the state - async def upload_model(self) -> None: + async def _upload_model(self) -> None: await asyncio.sleep(0.1) # give tests a bit time to to check for the state - await super().upload_model() + await super()._upload_model() await asyncio.sleep(0.1) # give tests a bit time to to check for the state async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]: @@ -66,7 +66,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona assert isinstance(result, str) return result - def get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: + async def _get_latest_model_files(self) -> Dict[str, List[str]]: time.sleep(1) # NOTE reduce flakyness in Backend tests du to wrong order of events. 
fake_weight_file = '/tmp/weightfile.weights' with open(fake_weight_file, 'wb') as f: @@ -77,18 +77,18 @@ def get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: f.write('zweiundvierzig') return {'mocked': [fake_weight_file, more_data_file], 'mocked_2': [fake_weight_file, more_data_file]} - def can_resume(self) -> bool: - return self._can_resume + def _can_resume(self) -> bool: + return self._can_resume_flag - async def resume(self) -> None: - return await self.start_training() + async def _resume(self) -> None: + return await self._start_training_from_base_model() async def _detect(self, model_information: ModelInformation, images: List[str], model_folder: str) -> List[Detections]: detections: List[Detections] = [] return detections - async def clear_training_data(self, training_folder: str) -> None: + async def _clear_training_data(self, training_folder: str) -> None: return - def get_executor_error_from_log(self) -> Optional[str]: + def _get_executor_error_from_log(self) -> Optional[str]: return self.error_msg diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 82fd8aad..ea32b6dc 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -5,7 +5,7 @@ import shutil from abc import abstractmethod from datetime import datetime -from typing import Coroutine, Dict, List, Optional +from typing import Coroutine, List, Optional from dacite import from_dict @@ -18,45 +18,47 @@ class TrainerLogic(TrainerLogicGeneric): def __init__(self, model_format: str) -> None: - super().__init__(model_format) - self.model_format: str = model_format - # NOTE: String to be used in the file path for the model on the server: - # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' + """This class is the base class for all trainers that use an executor to run training processes. 
+ The executor is used to run the training process in a separate process.""" + super().__init__(model_format) + self._detection_progress: Optional[float] = None self._executor: Optional[Executor] = None self.start_training_task: Optional[Coroutine] = None + self.inference_batch_size = 10 + + # ---------------------------------------- IMPLEMENTED ABSTRACT PROPERTIES ---------------------------------------- + + @property + def detection_progress(self) -> Optional[float]: + return self._detection_progress + + # ---------------------------------------- PROPERTIES ---------------------------------------- @property def executor(self) -> Executor: assert self._executor is not None, 'executor must be set, call `run_training` first' return self._executor - @property - def hyperparameters(self) -> Optional[Dict]: - if self._training and self._training.data and self._training.data.hyperparameter: - information = {} - information['resolution'] = self._training.data.hyperparameter.resolution - information['flipRl'] = self._training.data.hyperparameter.flip_rl - information['flipUd'] = self._training.data.hyperparameter.flip_ud - return information - return None + # ---------------------------------------- IMPLEMENTED ABSTRACT MEHTODS ---------------------------------------- async def _train(self) -> None: previous_state = TrainerState.TrainModelDownloaded error_key = 'run_training' - self._executor = Executor(self.active_training.training_folder) - self.active_training.training_state = TrainerState.TrainingRunning + self._executor = Executor(self.training.training_folder) + self.training.training_state = TrainerState.TrainingRunning try: await self._start_training() - last_sync_time = datetime.now() + while True: - if not self.executor.is_process_running(): + await asyncio.sleep(0.1) + if not self.executor.is_running(): break if (datetime.now() - last_sync_time).total_seconds() > 5: last_sync_time = datetime.now() - if self.get_executor_error_from_log(): + if 
self._get_executor_error_from_log(): break self.errors.reset(error_key) try: @@ -65,65 +67,49 @@ async def _train(self) -> None: logging.warning('CancelledError in run_training') raise except Exception: - pass - else: - await asyncio.sleep(0.1) + logging.error('Error in sync_confusion_matrix (this error is ignored)') - error = self.get_executor_error_from_log() - if error: + if error := self._get_executor_error_from_log(): raise TrainingError(cause=error) - # TODO check if this works: + + # NOTE: This is problematic, because the return code is not 0 when executor was stoppen e.g. via self.stop() # if self.executor.return_code != 0: - # self.errors.set(error_key, f'Executor return code was {self.executor.return_code}') - # raise TrainingError(cause=f'Executor return code was {self.executor.return_code}') + # raise TrainingError(cause=f'Executor returned with error code: {self.executor.return_code}') except TrainingError: - logging.exception('Error in TrainingProcess') - if self.executor.is_process_running(): - self.executor.stop() - self.active_training.training_state = previous_state + logging.exception('Exception in trainer_logic._train') + await self.executor.stop_and_wait() + self.training.training_state = previous_state raise - async def _start_training(self): - self.start_training_task = None # NOTE: this is used i.e. by tests - if self.can_resume(): - self.start_training_task = self.resume() - else: - base_model_id = self.active_training.base_model_id - if not is_valid_uuid4(base_model_id): # TODO this check was done earlier! 
- assert isinstance(base_model_id, str) - # TODO this could be removed here and accessed via self.training.base_model_id - self.start_training_task = self.start_training_from_scratch(base_model_id) - else: - self.start_training_task = self.start_training() - await self.start_training_task - async def _do_detections(self) -> None: - context = self.active_training.context - model_id = self.active_training.model_id_for_detecting - assert model_id, 'model_id must be set' + context = self.training.context + model_id = self.training.model_uuid_for_detecting + if not model_id: + logging.error('model_id is not set! Cannot do detections.') + return tmp_folder = f'/tmp/model_for_auto_detections_{model_id}_{self.model_format}' shutil.rmtree(tmp_folder, ignore_errors=True) os.makedirs(tmp_folder) logging.info(f'downloading detection model to {tmp_folder}') - await self.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format) + await self.node.data_exchanger.download_model(tmp_folder, context, model_id, self.model_format) with open(f'{tmp_folder}/model.json', 'r') as f: model_information = from_dict(data_class=ModelInformation, data=json.load(f)) project_folder = create_project_folder(context) image_folder = create_image_folder(project_folder) - self.data_exchanger.set_context(context) + self.node.data_exchanger.set_context(context) image_ids = [] for state, p in zip(['inbox', 'annotate', 'review', 'complete'], [0.1, 0.2, 0.3, 0.4]): - self.detection_progress = p + self._detection_progress = p logging.info(f'fetching image ids of {state}') - new_ids = await self.data_exchanger.fetch_image_uuids(query_params=f'state={state}') + new_ids = await self.node.data_exchanger.fetch_image_uuids(query_params=f'state={state}') image_ids += new_ids logging.info(f'downloading {len(new_ids)} images') - await self.data_exchanger.download_images(new_ids, image_folder) - self.detection_progress = 0.42 + await self.node.data_exchanger.download_images(new_ids, image_folder) 
+ self._detection_progress = 0.42 # await delete_corrupt_images(image_folder) images = await asyncio.get_event_loop().run_in_executor(None, images_for_ids, image_ids, image_folder) @@ -131,19 +117,36 @@ async def _do_detections(self) -> None: self.active_training_io.save_detections([], 0) num_images = len(images) - batch_size = 200 - for idx, i in enumerate(range(0, num_images, batch_size)): - self.detection_progress = 0.5 + (i/num_images)*0.5 - batch_images = images[i:i+batch_size] + for idx, i in enumerate(range(0, num_images, self.inference_batch_size)): + self._detection_progress = 0.5 + (i/num_images)*0.5 + batch_images = images[i:i+self.inference_batch_size] batch_detections = await self._detect(model_information, batch_images, tmp_folder) self.active_training_io.save_detections(batch_detections, idx) + # ---------------------------------------- METHODS ---------------------------------------- + + async def _start_training(self): + self.start_training_task = None # NOTE: this is used i.e. by tests + if self._can_resume(): + self.start_training_task = self._resume() + else: + base_model_uuid_or_name = self.training.base_model_uuid_or_name + if not is_valid_uuid4(base_model_uuid_or_name): + self.start_training_task = self._start_training_from_scratch() + else: + self.start_training_task = self._start_training_from_base_model() + await self.start_training_task + + # ---------------------------------------- OVERWRITTEN METHODS ---------------------------------------- + async def stop(self) -> None: """If executor is running, stop it. 
Else cancel training task.""" + print('===============> stop received in trainer_logic.', flush=True) + if not self.training_active: return - if self._executor and self._executor.is_process_running(): - self.executor.stop() + if self._executor and self._executor.is_running(): + await self.executor.stop_and_wait() elif self.training_task: logging.info('cancelling training task') if self.training_task.cancel(): @@ -152,33 +155,31 @@ async def stop(self) -> None: except asyncio.CancelledError: pass logging.info('cancelled training task') - self.may_restart() - - def get_log(self) -> str: - return self.executor.get_log() + self._may_restart() # ---------------------------------------- ABSTRACT METHODS ---------------------------------------- @abstractmethod - async def start_training(self) -> None: - '''Should be used to start a training.''' + async def _start_training_from_base_model(self) -> None: + '''Should be used to start a training on executer, e.g. self.executor.start(cmd).''' @abstractmethod - async def start_training_from_scratch(self, base_model_id: str) -> None: - '''Should be used to start a training from scratch. - base_model_id is the id of a pretrained model provided by self.provided_pretrained_models.''' + async def _start_training_from_scratch(self) -> None: + '''Should be used to start a training from scratch on executer, e.g. self.executor.start(cmd). + NOTE base_model_id is now accessible via self.training.base_model_id + the id of a pretrained model provided by self.provided_pretrained_models.''' @abstractmethod - def can_resume(self) -> bool: + def _can_resume(self) -> bool: '''Override this method to return True if the trainer can resume training.''' @abstractmethod - async def resume(self) -> None: + async def _resume(self) -> None: '''Is called when self.can_resume() returns True. 
One may resume the training on a previously trained model stored by self.on_model_published(basic_model).''' @abstractmethod - def get_executor_error_from_log(self) -> Optional[str]: + def _get_executor_error_from_log(self) -> Optional[str]: '''Should be used to provide error informations to the Learning Loop by extracting data from self.executor.get_log().''' @abstractmethod diff --git a/learning_loop_node/trainer/trainer_logic_abstraction.py b/learning_loop_node/trainer/trainer_logic_abstraction.py deleted file mode 100644 index 64349e3d..00000000 --- a/learning_loop_node/trainer/trainer_logic_abstraction.py +++ /dev/null @@ -1,146 +0,0 @@ -import os -import time -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Dict, List, Optional - -from socketio import AsyncClient - -from ..data_classes import Context, Errors, PretrainedModel, TrainerState, Training, TrainingData -from ..data_exchanger import DataExchanger -from ..loop_communication import LoopCommunicator -from .io_helpers import ActiveTrainingIO, LastTrainingIO - -if TYPE_CHECKING: - from .trainer_node import TrainerNode - - -class TrainerLogicAbstraction(ABC): - - def __init__(self, model_format: str): - - # NOTE: String to be used in the file path for the model on the server: - # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' - self.model_format: str = model_format - - self._node: Optional['TrainerNode'] = None # type: ignore - self._last_training_io: Optional[LastTrainingIO] = None # type: ignore - self.errors = Errors() - - self._training: Optional[Training] = None - self._active_training_io: Optional[ActiveTrainingIO] = None - - self.restart_after_training = os.environ.get('RESTART_AFTER_TRAINING', 'FALSE').lower() in ['true', '1'] - self.keep_old_trainings = os.environ.get('KEEP_OLD_TRAININGS', 'FALSE').lower() in ['true', '1'] - self.inference_batch_size = int(os.environ.get('INFERENCE_BATCH_SIZE', '10')) - - @property - def node(self) 
-> 'TrainerNode': - assert self._node is not None, 'node should be set by TrainerNode before initialization' - return self._node - - @property - def last_training_io(self) -> LastTrainingIO: - assert self._last_training_io is not None, 'last_training_io should be set by TrainerNode before initialization' - return self._last_training_io - - @property - def data_exchanger(self) -> DataExchanger: - return self.node.data_exchanger - - @property - def loop_communicator(self) -> LoopCommunicator: - return self.node.loop_communicator - - @property - def node_uuid(self) -> str: - return self.node.uuid - - @property - def sio_client(self) -> AsyncClient: - return self.node.sio_client - - @property - def active_training_io(self) -> ActiveTrainingIO: - assert self._active_training_io is not None, 'active_training_io must be set, call `init` first' - return self._active_training_io - - @property - def training_active(self) -> bool: - """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'""" - return self._training is not None and self._active_training_io is not None - - @property - def state(self) -> str: - if (not self.training_active) or (self.active_training.training_state is None): - return TrainerState.Idle.value - else: - return self.active_training.training_state - - @property - def active_training(self) -> Training: - assert self._training is not None, 'training must be initialized, call `init` first' - return self._training - - @property - def training_uptime(self) -> Optional[float]: - if self.training_active: - return time.time() - self.active_training.start_time - return None - - @property - def training_data(self) -> Optional[TrainingData]: - if self.training_active and self.active_training.data: - return self.active_training.data - return None - - @property - def training_context(self) -> Optional[Context]: - if self.training_active: - return self.active_training.context - return None - - # --- ABSTRACT PROPERTIES - # 
--------- implemented in TrainerLogicGeneric - - @property - @abstractmethod - def general_progress(self) -> Optional[float]: - """Returns the general progress of the training per state or None if idle""" - - # --------- implemented in TrainerLogic(with Executor) - @property - @abstractmethod - def hyperparameters(self) -> Optional[Dict]: - """Returns the currently used hyperparameters if available""" - - # --------- not implemented in any abstract class - @property - @abstractmethod - def model_architecture(self) -> Optional[str]: - """Returns the architecture name of the model if available""" - - @property - @abstractmethod - def provided_pretrained_models(self) -> List[PretrainedModel]: - """Returns the list of provided pretrained models""" - - # --- ABSTRACT METHODS ----- - # --------- implemented in TrainerLogicGeneric --- - - @abstractmethod - async def on_shutdown(self): - """Called when the trainer is shut down""" - - @abstractmethod - async def begin_training(self, organization: str, project: str, details: dict): - """Starts the training process""" - - @abstractmethod - async def try_continue_run_if_incomplete(self) -> bool: - """Start training continuation if possible, returns True if continuation started""" - - # --- implemented in TrainerLogic(with Executor) --- - - @abstractmethod - async def stop(self): - """Stops the training process""" diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index 7221e6ec..f790bbd9 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -3,37 +3,130 @@ import logging import shutil import sys -from abc import abstractmethod +import time +from abc import ABC, abstractmethod from dataclasses import asdict -from typing import Callable, Coroutine, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Callable, Coroutine, Dict, List, Optional -from dacite import from_dict from 
fastapi.encoders import jsonable_encoder -from ..data_classes import BasicModel, Category, Context, Hyperparameter, TrainerState, TrainingData, TrainingOut +from ..data_classes import (Context, Errors, Hyperparameter, PretrainedModel, TrainerState, Training, TrainingData, + TrainingOut, TrainingStateData) from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4 from .downloader import TrainingsDownloader -from .io_helpers import ActiveTrainingIO -from .trainer_logic_abstraction import TrainerLogicAbstraction +from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO +if TYPE_CHECKING: + from .trainer_node import TrainerNode -class TrainerLogicGeneric(TrainerLogicAbstraction): + +class TrainerLogicGeneric(ABC): def __init__(self, model_format: str) -> None: - super().__init__(model_format) + + # NOTE: model_format is used in the file path for the model on the server: + # It acts as a key for list of files (cf. _get_latest_model_files) + # '/{context.organization}/projects/{context.project}/models/{model_id}/{model_format}/file' + self.model_format: str = model_format + self.errors = Errors() + self.training_task: Optional[asyncio.Task] = None - self.detection_progress = 0.0 self.shutdown_event: asyncio.Event = asyncio.Event() + self._node: Optional['TrainerNode'] = None # type: ignore + self._last_training_io: Optional[LastTrainingIO] = None # type: ignore + + self._training: Optional[Training] = None + self._active_training_io: Optional[ActiveTrainingIO] = None + self._environment_vars = EnvironmentVars() + + # ---------------------------------------- PROPERTIES TO AVOID CHECKING FOR NONE ---------------------------------------- + + @property + def node(self) -> 'TrainerNode': + assert self._node is not None, 'node should be set by TrainerNode before initialization' + return self._node + + @property + def last_training_io(self) -> LastTrainingIO: + assert self._last_training_io is not None, 
'last_training_io should be set by TrainerNode before initialization' + return self._last_training_io + + @property + def active_training_io(self) -> ActiveTrainingIO: + assert self._active_training_io is not None, 'active_training_io must be set, call `init` first' + return self._active_training_io + + @property + def training(self) -> Training: + assert self._training is not None, 'training must be initialized, call `init` first' + return self._training + + @property + def hyperparameter(self) -> Hyperparameter: + assert self.training_data is not None, 'Training should have data' + assert self.training_data.hyperparameter is not None, 'Training.data should have hyperparameter' + return self.training_data.hyperparameter + + # ---------------------------------------- PROPERTIES ---------------------------------------- + + @property + def training_data(self) -> Optional[TrainingData]: + if self.training_active and self.training.data: + return self.training.data + return None + + @property + def training_context(self) -> Optional[Context]: + if self.training_active: + return self.training.context + return None + + @property + def training_active(self) -> bool: + """_training and _active_training_io are set in 'init_new_training' or 'init_from_last_training'. + """ + return self._training is not None and self._active_training_io is not None + + @property + def state(self) -> str: + """Returns the current state of the training. Used solely by the node in send_status(). + """ + if (not self.training_active) or (self.training.training_state is None): + return TrainerState.Idle.value + return self.training.training_state + + @property + def training_uptime(self) -> Optional[float]: + """Livetime of current Training object. Start time is set during initialization of Training object. 
+ """ + if self.training_active: + return time.time() - self.training.start_time + return None + + @property + def hyperparameters_for_state_sync(self) -> Optional[Dict]: + """Used in sync_confusion_matrix and send_status to provide information about the training configuration. + """ + if self._training and self._training.data and self._training.data.hyperparameter: + information = {} + information['resolution'] = self._training.data.hyperparameter.resolution + information['flipRl'] = self._training.data.hyperparameter.flip_rl + information['flipUd'] = self._training.data.hyperparameter.flip_ud + return information + return None + @property def general_progress(self) -> Optional[float]: - """Represents the progress for different states.""" + """Represents the progress for different states, should run from 0 to 100 for each state. + Note that training_progress and detection_progress need to be implemented in the specific trainer. + """ if not self.training_active: return None - t_state = self.active_training.training_state + t_state = self.training.training_state if t_state == TrainerState.DataDownloading: - return self.data_exchanger.progress + return self.node.data_exchanger.progress if t_state == TrainerState.TrainingRunning: return self.training_progress if t_state == TrainerState.Detecting: @@ -41,45 +134,85 @@ def general_progress(self) -> Optional[float]: return None - def init_new_training(self, context: Context, details: Dict) -> None: - """Called on `begin_training` event from the Learning Loop. - Note that details needs the entries 'categories' and 'training_number'""" + # ---------------------------------------- ABSTRACT PROPERTIES ---------------------------------------- - project_folder = create_project_folder(context) - if not self.keep_old_trainings: - # NOTE: We delete all existing training folders because they are not needed anymore. 
- delete_all_training_folders(project_folder) - self._training = generate_training(project_folder, context) - self._training.data = TrainingData(categories=Category.from_list(details['categories'])) - self._training.data.hyperparameter = from_dict(data_class=Hyperparameter, data=details) - self._training.training_number = details['training_number'] - self._training.base_model_id = details['id'] - self._training.training_state = TrainerState.Initialized - self._active_training_io = ActiveTrainingIO( - self._training.training_folder, self.loop_communicator, context) - logging.info(f'training initialized: {self._training}') + @property + @abstractmethod + def training_progress(self) -> Optional[float]: + """Represents the training progress.""" + raise NotImplementedError + + @property + @abstractmethod + def detection_progress(self) -> Optional[float]: + """Represents the detection progress.""" + raise NotImplementedError + + @property + @abstractmethod + def model_architecture(self) -> Optional[str]: + """Returns the architecture name of the model if available""" + raise NotImplementedError + + @property + @abstractmethod + def provided_pretrained_models(self) -> List[PretrainedModel]: + """Returns the list of provided pretrained models. + The names of the models will come back as model_uuid_or_name in the training details. + """ + raise NotImplementedError + + # ---------------------------------------- METHODS ---------------------------------------- + + # NOTE: Trainings are started by the Learning Loop via the begin_training event + # or by the trainer itself via try_continue_run_if_incomplete. + # The trainer will then initialize a new training object and start the training loop. + # Initializing a new training object will create the folder structure for the training. + # The training loop will then run through the states of the training. 
async def try_continue_run_if_incomplete(self) -> bool: + """Tries to continue a training if the last training was not finished. + """ if not self.training_active and self.last_training_io.exists(): + self._init_from_last_training() logging.info('found incomplete training, continuing now.') - self.init_from_last_training() - asyncio.get_event_loop().create_task(self.run()) + asyncio.get_event_loop().create_task(self._run()) return True return False - def init_from_last_training(self) -> None: + def _init_from_last_training(self) -> None: + """Initializes a new training object from the last training saved on disc via last_training_io. + """ self._training = self.last_training_io.load() assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder' self._active_training_io = ActiveTrainingIO( - self._training.training_folder, self.loop_communicator, self._training.context) + self._training.training_folder, self.node.loop_communicator, self._training.context) async def begin_training(self, organization: str, project: str, details: Dict) -> None: - """Called on `begin_training` event from the Learning Loop.""" + """Called on `begin_training` event from the Learning Loop. + """ + self._init_new_training(Context(organization=organization, project=project), details) + asyncio.get_event_loop().create_task(self._run()) + + def _init_new_training(self, context: Context, details: Dict) -> None: + """Called on `begin_training` event from the Learning Loop. + Note that details needs the entries 'categories' and 'training_number', + but also the hyperparameter entries. 
+ """ + project_folder = create_project_folder(context) + if not self._environment_vars.keep_old_trainings: + delete_all_training_folders(project_folder) + self._training = generate_training(project_folder, context) + self._training.set_values_from_data(details) - self.init_new_training(Context(organization=organization, project=project), details) - asyncio.get_event_loop().create_task(self.run()) + self._active_training_io = ActiveTrainingIO( + self._training.training_folder, self.node.loop_communicator, context) + logging.info(f'new training initialized: {self._training}') - async def run(self) -> None: + async def _run(self) -> None: + """Called on `begin_training` event from the Learning Loop. + Either via `begin_training` or `try_continue_run_if_incomplete`. + """ self.errors.reset_all() try: self.training_task = asyncio.get_running_loop().create_task(self._training_loop()) @@ -87,46 +220,47 @@ async def run(self) -> None: except asyncio.CancelledError: if not self.shutdown_event.is_set(): logging.info('training task was cancelled but not by shutdown event') - self.active_training.training_state = TrainerState.ReadyForCleanup - self.last_training_io.save(self.active_training) - await self.clear_training() + self.training.training_state = TrainerState.ReadyForCleanup + self.last_training_io.save(self.training) + await self._clear_training() except Exception as e: logging.exception(f'Error in train: {e}') # ---------------------------------------- TRAINING STATES ---------------------------------------- async def _training_loop(self) -> None: - """asyncio.CancelledError is catched in run""" - + """Cycle through the training states until the training is finished or + an asyncio.CancelledError is raised. 
+ """ assert self.training_active while self._training is not None: - tstate = self.active_training.training_state - logging.info(f'STATE LOOP: {tstate}, eerrors: {self.errors.errors}') + tstate = self.training.training_state await asyncio.sleep(0.6) # Note: Required for pytests! + if tstate == TrainerState.Initialized: # -> DataDownloading -> DataDownloaded - await self.perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare) + await self._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, self._prepare) elif tstate == TrainerState.DataDownloaded: # -> TrainModelDownloading -> TrainModelDownloaded - await self.perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model) + await self._perform_state('download_model', TrainerState.TrainModelDownloading, TrainerState.TrainModelDownloaded, self._download_model) elif tstate == TrainerState.TrainModelDownloaded: # -> TrainingRunning -> TrainingFinished - await self.perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train) + await self._perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train) elif tstate == TrainerState.TrainingFinished: # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced - await self.perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix) + await self._perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix) elif tstate == TrainerState.ConfusionMatrixSynced: # -> TrainModelUploading -> TrainModelUploaded - await self.perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model) + await self._perform_state('upload_model', TrainerState.TrainModelUploading, 
TrainerState.TrainModelUploaded, self._upload_model) elif tstate == TrainerState.TrainModelUploaded: # -> Detecting -> Detected - await self.perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections) + await self._perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections) elif tstate == TrainerState.Detected: # -> DetectionUploading -> ReadyForCleanup - await self.perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions) + await self._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions) elif tstate == TrainerState.ReadyForCleanup: # -> RESTART or TrainingFinished - await self.clear_training() - self.may_restart() + await self._clear_training() + self._may_restart() - async def perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False): + async def _perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False): await asyncio.sleep(0.1) logging.info(f'Performing state: {state_during}') - previous_state = self.active_training.training_state - self.active_training.training_state = state_during + previous_state = self.training.training_state + self.training.training_state = state_during await asyncio.sleep(0.1) if reset_early: self.errors.reset(error_key) @@ -141,52 +275,60 @@ async def perform_state(self, error_key: str, state_during: TrainerState, state_ except Exception as e: self.errors.set(error_key, str(e)) logging.exception(f'Error in {state_during} - Exception:') - self.active_training.training_state = previous_state + self.training.training_state = previous_state else: if not reset_early: self.errors.reset(error_key) - self.active_training.training_state = 
state_after - self.last_training_io.save(self.active_training) + self.training.training_state = state_after + self.last_training_io.save(self.training) async def _prepare(self) -> None: - self.data_exchanger.set_context(self.active_training.context) - downloader = TrainingsDownloader(self.data_exchanger) - image_data, skipped_image_count = await downloader.download_training_data(self.active_training.images_folder) - assert self.active_training.data is not None, 'training.data must be set' - self.active_training.data.image_data = image_data - self.active_training.data.skipped_image_count = skipped_image_count + """Downloads images to the images_folder and saves annotations to training.data.image_data. + """ + self.node.data_exchanger.set_context(self.training.context) + downloader = TrainingsDownloader(self.node.data_exchanger) + image_data, skipped_image_count = await downloader.download_training_data(self.training.images_folder) + assert self.training.data is not None, 'training.data must be set' + self.training.data.image_data = image_data + self.training.data.skipped_image_count = skipped_image_count async def _download_model(self) -> None: - model_id = self.active_training.base_model_id - assert model_id is not None, 'model_id must be set' - if is_valid_uuid4( - self.active_training.base_model_id): # TODO this checks if we continue a training -> make more explicit - logging.info('loading model from Learning Loop') - logging.info(f'downloading model {model_id} as {self.model_format}') - await self.data_exchanger.download_model(self.active_training.training_folder, self.active_training.context, model_id, self.model_format) - shutil.move(f'{self.active_training.training_folder}/model.json', - f'{self.active_training.training_folder}/base_model.json') - else: - logging.info(f'base_model_id {model_id} is not a valid uuid4, skipping download') - - async def _sync_confusion_matrix(self): - '''NOTE: This stage sets the errors explicitly because it may be used inside 
the training stage.''' + """If training is continued, the model is downloaded from the Learning Loop to the training_folder. + The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training. + """ + base_model_uuid = self.training.base_model_uuid_or_name + + # TODO this checks if we continue a training -> make more explicit + if not base_model_uuid or not is_valid_uuid4(base_model_uuid): + logging.info(f'skipping model download. No base model provided (in form of uuid): {base_model_uuid}') + return + + logging.info('loading model from Learning Loop') + logging.info(f'downloading model {base_model_uuid} as {self.model_format}') + await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_uuid, self.model_format) + shutil.move(f'{self.training.training_folder}/model.json', + f'{self.training.training_folder}/base_model.json') + + async def _sync_confusion_matrix(self) -> None: + """Syncronizes the confusion matrix with the Learning Loop via the update_training endpoint. + NOTE: This stage sets the errors explicitly because it may be used inside the training stage. + """ error_key = 'sync_confusion_matrix' try: - new_best_model = self.get_new_best_model() - if new_best_model and self.active_training.data: - new_training = TrainingOut(trainer_id=self.node_uuid, + new_best_model = self._get_new_best_training_state() + if new_best_model and self.training.data: + new_training = TrainingOut(trainer_id=self.node.uuid, confusion_matrix=new_best_model.confusion_matrix, - train_image_count=self.active_training.data.train_image_count(), - test_image_count=self.active_training.data.test_image_count(), - hyperparameters=self.hyperparameters) + train_image_count=self.training.data.train_image_count(), + test_image_count=self.training.data.test_image_count(), + hyperparameters=self.hyperparameters_for_state_sync) await asyncio.sleep(0.1) # NOTE needed for tests. 
- result = await self.sio_client.call('update_training', ( - self.active_training.context.organization, self.active_training.context.project, jsonable_encoder(new_training))) + result = await self.node.sio_client.call('update_training', ( + self.training.context.organization, self.training.context.project, jsonable_encoder(new_training))) if isinstance(result, dict) and result['success']: logging.info(f'successfully updated training {asdict(new_training)}') - self.on_model_published(new_best_model) + self._on_metrics_published(new_best_model) else: raise Exception(f'Error for update_training: Response from loop was : {result}') except Exception as e: @@ -195,25 +337,23 @@ async def _sync_confusion_matrix(self): raise self.errors.reset(error_key) - async def _upload_model(self) -> None | bool: - """Returns True if the training should be cleaned up.""" - - new_model_id = await self._upload_model_return_new_model_uuid(self.active_training.context) - if new_model_id is None: - self.active_training.training_state = TrainerState.ReadyForCleanup + async def _upload_model(self) -> None: + """Uploads the latest model to the Learning Loop. + """ + new_model_uuid = await self._upload_model_return_new_model_uuid(self.training.context) + if new_model_uuid is None: + self.training.training_state = TrainerState.ReadyForCleanup logging.error('could not upload model - maybe training failed.. cleaning up') - return True - logging.info(f'Successfully uploaded model and received new model id: {new_model_id}') - self.active_training.model_id_for_detecting = new_model_id - return None + logging.info(f'Successfully uploaded model and received new model id: {new_model_uuid}') + self.training.model_uuid_for_detecting = new_model_uuid async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]: """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file. Note that with the latest trainers the conversion to (.wts) is done by the trainer. 
The conversion from .wts to .engine is done by the detector (needs to be done on target hardware). - Note that trainer may train with different classes, which is why we send an initial model.json file. - """ - files = await asyncio.get_running_loop().run_in_executor(None, self.get_latest_model_files) + Note that trainer may train with different classes, which is why we send an initial model.json file.""" + + files = await self._get_latest_model_files() if files is None: return None @@ -225,10 +365,10 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona model_uuid = None for file_format in [f for f in files if f not in already_uploaded_formats]: - _files = files[file_format] + [self.dump_categories_to_json()] + _files = files[file_format] + [self._dump_categories_to_json()] assert len([f for f in _files if 'model.json' in f]) == 1, "model.json must be included exactly once" - model_uuid = await self.data_exchanger.upload_model_get_uuid(context, _files, self.active_training.training_number, file_format) + model_uuid = await self.node.data_exchanger.upload_model_get_uuid(context, _files, self.training.training_number, file_format) if model_uuid is None: return None @@ -237,91 +377,119 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> Optiona return model_uuid - def dump_categories_to_json(self) -> str: + def _dump_categories_to_json(self) -> str: + """Dumps the categories to a json file and returns the path to the file. + """ content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None json_path = '/tmp/model.json' with open(json_path, 'w') as f: json.dump(content, f) return json_path - async def clear_training(self): + async def _clear_training(self): + """Clears the training data after a training has finished. 
+ """ self.active_training_io.delete_detections() self.active_training_io.delete_detection_upload_progress() self.active_training_io.delete_detections_upload_file_index() - await self.clear_training_data(self.active_training.training_folder) + await self._clear_training_data(self.training.training_folder) self.last_training_io.delete() - # self.training.training_state = TrainingState.TrainingFinished await self.node.send_status() self._training = None # ---------------------------------------- OTHER METHODS ---------------------------------------- - def may_restart(self) -> None: - if self.restart_after_training: - logging.info('restarting') - sys.exit(0) - else: - logging.info('not restarting') - async def on_shutdown(self) -> None: self.shutdown_event.set() await self.stop() await self.stop() - # ---------------------------------------- ABSTRACT PROPERTIES ---------------------------------------- - - @property - @abstractmethod - def training_progress(self) -> Optional[float]: - """Represents the training progress.""" - raise NotImplementedError - + async def stop(self): + """Stops the training process by canceling training task. + """ + if not self.training_active: + return + if self.training_task: + logging.info('cancelling training task') + if self.training_task.cancel(): + try: + await self.training_task + except asyncio.CancelledError: + pass + logging.info('cancelled training task') + self._may_restart() + + def _may_restart(self) -> None: + """If the environment variable RESTART_AFTER_TRAINING is set, the trainer will restart after a training. + """ + if self._environment_vars.restart_after_training: + logging.info('restarting') + sys.exit(0) + else: + logging.info('not restarting') # ---------------------------------------- ABSTRACT METHODS ---------------------------------------- @abstractmethod async def _train(self) -> None: - '''Should be used to execute a training. + """Should be used to execute a training. 
+ At this point, images are already downloaded to the images_folder and annotations are saved in training.data.image_data. + If a training is continued, the model is already downloaded. The model should be synchronized with the Learning Loop via self._sync_confusion_matrix() every now and then. - asyncio.CancelledError should be catched and re-raised.''' + asyncio.CancelledError should be catched and re-raised. + """ + raise NotImplementedError @abstractmethod async def _do_detections(self) -> None: - '''Should be used to execute detections. + """Should be used to infer detections of all images and save them to drive. active_training_io.save_detections(...) should be used to store the detections. - asyncio.CancelledError should be catched and re-raised.''' + asyncio.CancelledError should be catched and re-raised. + """ + raise NotImplementedError @abstractmethod - def get_new_best_model(self) -> Optional[BasicModel]: - '''Is called frequently in `_sync_confusion_matrix` to check if a new "best" model is availabe. - Returns None if no new model could be found. Otherwise BasicModel(confusion_matrix, meta_information). + def _get_new_best_training_state(self) -> Optional[TrainingStateData]: + """Is called frequently by `_sync_confusion_matrix` to check if a new "best" model is availabe. + Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information). `confusion_matrix` contains a dict of all classes: - - The classes must be identified by their id, not their name. + - The classes must be identified by their uuid, not their name. - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives). 
- `meta_information` can hold any data which is helpful for self.on_model_published to store weight file etc for later upload via self.get_model_files - ''' + `meta_information` can hold any data which is helpful for self._on_metrics_published to store weight file etc for later upload via self.get_model_files + """ + raise NotImplementedError @abstractmethod - def on_model_published(self, basic_model: BasicModel) -> None: - '''Called after a BasicModel has been successfully send to the Learning Loop. - The files for this model should be stored. - self.get_latest_model_files is used to gather all files needed for transfering the actual data from the trainer node to the Learning Loop. - In the simplest implementation this method just renames the weight file (encoded in BasicModel.meta_information) into a file name like latest_published_model - ''' + def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: + """Called after the metrics corresponding to TrainingStateData have been successfully send to the Learning Loop. + Receives the TrainingStateData object which was returned by self._get_new_best_training_state. + If above function returns None, this function is not called. + The respective files for this model should be stored so they can be later uploaded in get_latest_model_files. + """ + raise NotImplementedError @abstractmethod - def get_latest_model_files(self) -> Optional[Union[List[str], Dict[str, List[str]]]]: - '''Called when the Learning Loop requests to backup the latest model for the training. - Should return a list of file paths which describe the model. + async def _get_latest_model_files(self) -> Dict[str, List[str]]: + """Called when the Learning Loop requests to backup the latest model for the training. + This function is used to __generate and gather__ all files needed for transfering the actual data from the trainer node to the Learning Loop. + In the simplest implementation this method just renames the weight file (e.g. 
stored in TrainingStateData.meta_information) into a file name like latest_published_model + + The function should return a list of file paths which describe the model per format. These files must contain all data neccessary for the trainer to resume a training (eg. weight file, hyperparameters, etc.) and will be stored in the Learning Loop unter the format of this trainer. Note: by convention the weightfile should be named "model." where extension is the file format of the weightfile. For example "model.pt" for pytorch or "model.weights" for darknet/yolo. If a trainer can also generate other formats (for example for an detector), - a dictionary mapping format -> list of files can be returned.''' + a dictionary mapping format -> list of files can be returned. + + If the function returns an empty dict, something went wrong and the model upload will be skipped. + """ + raise NotImplementedError @abstractmethod - async def clear_training_data(self, training_folder: str) -> None: - '''Called after a training has finished. Deletes all data that is not needed anymore after a training run. - This can be old weightfiles or any additional files.''' + async def _clear_training_data(self, training_folder: str) -> None: + """Called after a training has finished. Deletes all data that is not needed anymore after a training run. + This can be old weightfiles or any additional files. 
+ """ + raise NotImplementedError diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py index c87124c1..f69cf103 100644 --- a/learning_loop_node/trainer/trainer_node.py +++ b/learning_loop_node/trainer/trainer_node.py @@ -9,12 +9,12 @@ from ..node import Node from .io_helpers import LastTrainingIO from .rest import backdoor_controls, controls -from .trainer_logic_abstraction import TrainerLogicAbstraction +from .trainer_logic_generic import TrainerLogicGeneric class TrainerNode(Node): - def __init__(self, name: str, trainer_logic: TrainerLogicAbstraction, uuid: Optional[str] = None, use_backdoor_controls: bool = False): + def __init__(self, name: str, trainer_logic: TrainerLogicGeneric, uuid: Optional[str] = None, use_backdoor_controls: bool = False): super().__init__(name, uuid, 'trainer') trainer_logic._node = self self.trainer_logic = trainer_logic @@ -84,7 +84,7 @@ async def send_status(self): status.train_image_count = data.train_image_count() status.test_image_count = data.test_image_count() status.skipped_image_count = data.skipped_image_count - status.hyperparameters = self.trainer_logic.hyperparameters + status.hyperparameters = self.trainer_logic.hyperparameters_for_state_sync status.errors = self.trainer_logic.errors.errors status.context = self.trainer_logic.training_context diff --git a/learning_loop_node/trainer/training_syncronizer.py b/learning_loop_node/trainer/training_syncronizer.py deleted file mode 100644 index 97041bb9..00000000 --- a/learning_loop_node/trainer/training_syncronizer.py +++ /dev/null @@ -1,53 +0,0 @@ - -import asyncio -import logging -from dataclasses import asdict -from typing import TYPE_CHECKING - -import socketio -from dacite import from_dict -from fastapi.encoders import jsonable_encoder - -from ..data_classes import TrainingOut -from ..data_classes.socket_response import SocketResponse - -if TYPE_CHECKING: - from .trainer_logic import TrainerLogic - - -class 
TrainingSyncronizer: - def __init__(self, trainer_node_uuid: str, sio_client: socketio.AsyncClient): - self.trainer_node_uuid = trainer_node_uuid - self.sio_client = sio_client - - async def sync_model(model, current_training): - new_training = TrainingOut( - trainer_id=self.trainer_node_uuid, - confusion_matrix=model.confusion_matrix, - train_image_count=current_training.data.train_image_count(), - test_image_count=current_training.data.test_image_count(), - hyperparameters=trainer.hyperparameters) - - await asyncio.sleep(0.1) # NOTE needed for tests. - - result = await self.sio_client.call('update_training', (current_training.context.organization, current_training.context.project, jsonable_encoder(new_training))) - response = from_dict(data_class=SocketResponse, data=result) - - return response - - -async def try_sync_model(mo): - try: - model = trainer.get_new_model() - except Exception as exc: - logging.exception('error while getting new model') - raise Exception(f'Could not get new model: {str(exc)}') from exc - logging.debug(f'new model {model}') - - if model: - response = await sync_model(trainer, trainer_node_uuid, sio_client, model) - - if not response.success: - error_msg = f'Error for update_training: Response from loop was : {asdict(response)}' - logging.error(error_msg) - raise Exception(error_msg) diff --git a/mock_detector/app_code/tests/test_detector.py b/mock_detector/app_code/tests/test_detector.py index 3d05d99e..75816212 100644 --- a/mock_detector/app_code/tests/test_detector.py +++ b/mock_detector/app_code/tests/test_detector.py @@ -5,6 +5,8 @@ from learning_loop_node.detector.detector_node import DetectorNode from learning_loop_node.globals import GLOBALS +# pylint: disable=unused-argument + @pytest.fixture(scope="session") def event_loop(request): diff --git a/mock_trainer/app_code/mock_trainer_logic.py b/mock_trainer/app_code/mock_trainer_logic.py index e88a2de3..d293758e 100644 --- a/mock_trainer/app_code/mock_trainer_logic.py +++ 
b/mock_trainer/app_code/mock_trainer_logic.py @@ -2,11 +2,11 @@ import asyncio import logging import time -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional -from learning_loop_node.data_classes import (BasicModel, BoxDetection, CategoryType, ClassificationDetection, - Detections, ErrorConfiguration, ModelInformation, Point, PointDetection, - PretrainedModel, SegmentationDetection, Shape) +from learning_loop_node.data_classes import (BoxDetection, CategoryType, ClassificationDetection, Detections, + ErrorConfiguration, ModelInformation, Point, PointDetection, + PretrainedModel, SegmentationDetection, Shape, TrainingStateData) from learning_loop_node.trainer.trainer_logic import TrainerLogic from . import progress_simulator @@ -23,28 +23,28 @@ def __init__(self, model_format: str) -> None: self.current_iteration = 0 self.provide_new_model = True - def can_resume(self) -> bool: + def _can_resume(self) -> bool: return False - async def resume(self) -> None: + async def _resume(self) -> None: pass - async def start_training(self) -> None: + async def _start_training_from_base_model(self) -> None: self.current_iteration = 0 if self.error_configuration.begin_training: raise Exception('Could not start training') - self.executor.start('while true; do sleep 1; done') + await self.executor.start('/bin/bash -c "while true; do sleep 1; done"') - async def start_training_from_scratch(self, base_model_id: str) -> None: + async def _start_training_from_scratch(self) -> None: self.current_iteration = 0 - self.executor.start('while true; do sleep 1; done') + await self.executor.start('/bin/bash -c "while true; do sleep 1; done"') - def get_executor_error_from_log(self) -> Optional[str]: + def _get_executor_error_from_log(self) -> Optional[str]: if self.error_configuration.crash_training: return 'mocked crash' return None - def get_latest_model_files(self) -> Union[List[str], Dict[str, List[str]]]: + async def _get_latest_model_files(self) -> 
Dict[str, List[str]]: if self.error_configuration.save_model: raise Exception() @@ -66,37 +66,34 @@ async def _detect(self, model_information: ModelInformation, images: List[str], for image in images: image_id = image.split('/')[-1].replace('.jpg', '') - box_detections = [] - point_detections = [] - segmentation_detections = [] - classification_detections = [] - det_entry = { - 'image_id': image_id, 'box_detections': box_detections, 'point_detections': point_detections, - 'segmentation_detections': segmentation_detections, - 'classification_detections': classification_detections} + box_detections: List[BoxDetection] = [] + point_detections: List[PointDetection] = [] + segmentation_detections: List[SegmentationDetection] = [] + classification_detections: List[ClassificationDetection] = [] + for c in model_information.categories: if c.type == CategoryType.Box: - d = BoxDetection(category_name=c.name, x=1, y=2, width=30, height=40, - model_name=model_information.version, confidence=.99, category_id=c.id) - box_detections.append(d) + bd = BoxDetection(category_name=c.name, x=1, y=2, width=30, height=40, + model_name=model_information.version, confidence=.99, category_id=c.id) + box_detections.append(bd) elif c.type == CategoryType.Point: - d = PointDetection(category_name=c.name, x=100, y=200, - model_name=model_information.version, confidence=.97, category_id=c.id) - point_detections.append(d) + pd = PointDetection(category_name=c.name, x=100, y=200, + model_name=model_information.version, confidence=.97, category_id=c.id) + point_detections.append(pd) elif c.type == CategoryType.Segmentation: - d = SegmentationDetection(category_name=c.name, shape=Shape(points=[Point(x=1, y=2), Point( + sd = SegmentationDetection(category_name=c.name, shape=Shape(points=[Point(x=1, y=2), Point( x=3, y=4)]), model_name=model_information.version, confidence=.96, category_id=c.id) - segmentation_detections.append(d) + segmentation_detections.append(sd) elif c.type == 
CategoryType.Classification: - d = ClassificationDetection(category_name=c.name, model_name=model_information.version, - confidence=.95, category_id=c.id) - classification_detections.append(d) + cd = ClassificationDetection(category_name=c.name, model_name=model_information.version, + confidence=.95, category_id=c.id) + classification_detections.append(cd) detections.append(Detections(box_detections=box_detections, point_detections=point_detections, segmentation_detections=segmentation_detections, classification_detections=classification_detections, image_id=image_id)) return detections - async def clear_training_data(self, training_folder: str): + async def _clear_training_data(self, training_folder: str): pass @property @@ -111,7 +108,7 @@ def training_progress(self) -> float: print(f'prog. is {self.current_iteration} / {self.max_iterations} = {self.current_iteration / self.max_iterations}') return self.current_iteration / self.max_iterations - def get_new_best_model(self) -> Optional[BasicModel]: + def _get_new_best_training_state(self) -> Optional[TrainingStateData]: logging.warning('get_new_model called') if self.error_configuration.get_new_model: raise Exception('Could not get new model') @@ -120,9 +117,9 @@ def get_new_best_model(self) -> Optional[BasicModel]: self.current_iteration += 1 return progress_simulator.increment_time(self, self.latest_known_confusion_matrix) - def on_model_published(self, basic_model: BasicModel) -> None: - assert isinstance(basic_model.confusion_matrix, Dict) - self.latest_known_confusion_matrix = basic_model.confusion_matrix + def _on_metrics_published(self, training_state_data: TrainingStateData) -> None: + assert isinstance(training_state_data.confusion_matrix, Dict) + self.latest_known_confusion_matrix = training_state_data.confusion_matrix @property def model_architecture(self) -> str: diff --git a/mock_trainer/app_code/progress_simulator.py b/mock_trainer/app_code/progress_simulator.py index 6eaf5ced..76f8be52 100644 --- 
a/mock_trainer/app_code/progress_simulator.py +++ b/mock_trainer/app_code/progress_simulator.py @@ -1,17 +1,17 @@ import random from typing import Dict, Optional -from learning_loop_node.data_classes import BasicModel +from learning_loop_node.data_classes import TrainingStateData from learning_loop_node.trainer.trainer_logic import TrainerLogic -def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -> Optional[BasicModel]: +def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -> Optional[TrainingStateData]: if not trainer._training or not trainer._training.data: # pylint: disable=protected-access return None confusion_matrix = {} - assert trainer.active_training.data is not None - for category in trainer.active_training.data.categories: + assert trainer.training.data is not None + for category in trainer.training.data.categories: try: minimum = latest_known_confusion_matrix[category.id]['tp'] except Exception: @@ -23,7 +23,7 @@ def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) - 'fn': max(random.randint(10-maximum, 10-minimum), 2), } - new_model = BasicModel( + new_model = TrainingStateData( confusion_matrix=confusion_matrix, ) diff --git a/mock_trainer/app_code/tests/conftest.py b/mock_trainer/app_code/tests/conftest.py index 86c62dc2..6c23ca7e 100644 --- a/mock_trainer/app_code/tests/conftest.py +++ b/mock_trainer/app_code/tests/conftest.py @@ -1,5 +1,4 @@ import asyncio -import logging import shutil import pytest @@ -7,6 +6,8 @@ from learning_loop_node.globals import GLOBALS from learning_loop_node.loop_communication import LoopCommunicator +# pylint: disable=redefined-outer-name + @pytest.fixture() async def glc(): diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py index 42fbfe8b..a1e3b471 100644 --- a/mock_trainer/app_code/tests/test_detections.py +++ b/mock_trainer/app_code/tests/test_detections.py @@ -12,8 +12,10 @@ from 
..mock_trainer_logic import MockTrainerLogic +# pylint: disable=protected-access,redefined-outer-name,unused-argument -async def test_all(setup_test_project1, glc: LoopCommunicator): # pylint: disable=unused-argument, redefined-outer-name + +async def test_all(setup_test_project1, glc: LoopCommunicator): assert_image_count(0) assert GLOBALS.data_folder == '/tmp/learning_loop_lib_data' @@ -28,14 +30,14 @@ async def test_all(setup_test_project1, glc: LoopCommunicator): # pylint: disab 'resolution': 800, 'flip_rl': False, 'flip_ud': False} - trainer._node = node # pylint: disable=protected-access - trainer.init_new_training(context=context, details=details) + trainer._node = node + trainer._init_new_training(context=context, details=details) project_folder = create_project_folder(context) training = generate_training(project_folder, context) - training.model_id_for_detecting = latest_model_id - trainer._training = training # pylint: disable=protected-access - await trainer._do_detections() # pylint: disable=protected-access + training.model_uuid_for_detecting = latest_model_id + trainer._training = training + await trainer._do_detections() detections = trainer.active_training_io.load_detections() assert_image_count(10) # TODO This assert fails frequently on Drone diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py index f20797b0..a5d397f5 100644 --- a/mock_trainer/app_code/tests/test_mock_trainer.py +++ b/mock_trainer/app_code/tests/test_mock_trainer.py @@ -7,16 +7,19 @@ from ..mock_trainer_logic import MockTrainerLogic +# pylint: disable=protected-access +# pylint: disable=unused-argument + async def create_mock_trainer() -> MockTrainerLogic: mock_trainer = MockTrainerLogic(model_format='mocked') - mock_trainer._executor = Executor(GLOBALS.data_folder) # pylint: disable=protected-access + mock_trainer._executor = Executor(GLOBALS.data_folder) return mock_trainer async def 
test_get_model_files(setup_test_project2): mock_trainer = await create_mock_trainer() - files = mock_trainer.get_latest_model_files() + files = mock_trainer._get_latest_model_files() assert isinstance(files, Dict) @@ -27,7 +30,7 @@ async def test_get_model_files(setup_test_project2): async def test_get_new_model(setup_test_project2): mock_trainer = await create_mock_trainer() - await mock_trainer.start_training() + await mock_trainer._start_training_from_base_model() model = Model(uuid=(str(uuid4()))) context = Context(organization="", project="") @@ -37,6 +40,6 @@ async def test_get_new_model(setup_test_project2): project_folder="", images_folder="", training_folder="",) - mock_trainer.active_training.data = TrainingData(image_data=[], categories=[]) - model = mock_trainer.get_new_best_model() + mock_trainer.training.data = TrainingData(image_data=[], categories=[]) + model = mock_trainer._get_new_best_training_state() assert model is not None From a634dfead98ad2cca8aa97e1e10c7ccc01b432b5 Mon Sep 17 00:00:00 2001 From: Niklas Neugebauer Date: Thu, 11 Apr 2024 12:54:12 +0200 Subject: [PATCH 58/62] remove redundant second if statement --- learning_loop_node/loop_communication.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index 0642c3c1..a2e65124 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -81,9 +81,6 @@ async def retry_on_401(self, func: Callable[..., Awaitable[httpx.Response]], *ar async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/api') -> httpx.Response: if requires_login: await self.ensure_login() - - # retry on 401 if required - if requires_login: return await self.retry_on_401(self._get, path, api_prefix) else: return await self._get(path, api_prefix) @@ -97,14 +94,11 @@ async def _get(self, path: str, api_prefix: str = '/api') -> httpx.Response: async 
def put(self, path: str, files: Optional[List[str]] = None, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response: if requires_login: await self.ensure_login() - - # retry on 401 if required - if requires_login: return await self.retry_on_401(self._put, path, api_prefix, **kwargs) else: return await self._put(path, files, api_prefix, **kwargs) - async def _put(self, path, files: Optional[List[str]] = None, api_prefix='/api', **kwargs) -> httpx.Response: + async def _put(self, path: str, files: Optional[List[str]] = None, api_prefix='/api', **kwargs) -> httpx.Response: if files is None: return await self.async_client.put(api_prefix+path, **kwargs) @@ -129,9 +123,6 @@ async def _put(self, path, files: Optional[List[str]] = None, api_prefix='/api', async def post(self, path: str, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response: if requires_login: await self.ensure_login() - - # retry on 401 if required - if requires_login: return await self.retry_on_401(self._post, path, api_prefix, **kwargs) else: return await self._post(path, api_prefix, **kwargs) @@ -145,9 +136,6 @@ async def _post(self, path, api_prefix='/api', **kwargs) -> httpx.Response: async def delete(self, path: str, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response: if requires_login: await self.ensure_login() - - # retry on 401 if required - if requires_login: return await self.retry_on_401(self._delete, path, api_prefix, **kwargs) else: return await self._delete(path, api_prefix, **kwargs) From 507d59e8bcfadce3007251bf6d35b1a66675d325 Mon Sep 17 00:00:00 2001 From: Niklas Neugebauer Date: Thu, 11 Apr 2024 12:56:11 +0200 Subject: [PATCH 59/62] shorten most function to one-line --- learning_loop_node/loop_communication.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index a2e65124..ed322c87 
100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -86,10 +86,7 @@ async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/ return await self._get(path, api_prefix) async def _get(self, path: str, api_prefix: str = '/api') -> httpx.Response: - - response = await self.async_client.get(api_prefix+path) - - return response + return await self.async_client.get(api_prefix+path) async def put(self, path: str, files: Optional[List[str]] = None, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response: if requires_login: @@ -128,10 +125,7 @@ async def post(self, path: str, requires_login: bool = True, api_prefix: str = ' return await self._post(path, api_prefix, **kwargs) async def _post(self, path, api_prefix='/api', **kwargs) -> httpx.Response: - - response = await self.async_client.post(api_prefix+path, **kwargs) - - return response + return await self.async_client.post(api_prefix+path, **kwargs) async def delete(self, path: str, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response: if requires_login: @@ -141,7 +135,4 @@ async def delete(self, path: str, requires_login: bool = True, api_prefix: str = return await self._delete(path, api_prefix, **kwargs) async def _delete(self, path, api_prefix='/api', **kwargs) -> httpx.Response: - - response = await self.async_client.delete(api_prefix+path, **kwargs) - - return response + return await self.async_client.delete(api_prefix+path, **kwargs) From b547780ad39fb22e96604fbe2b71f0b2c39e00b1 Mon Sep 17 00:00:00 2001 From: Niklas Neugebauer Date: Thu, 11 Apr 2024 12:58:24 +0200 Subject: [PATCH 60/62] fix missing argument when calling _put also remove default arguments in private methods to make this kind of error more obvious --- learning_loop_node/loop_communication.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/learning_loop_node/loop_communication.py 
b/learning_loop_node/loop_communication.py index ed322c87..a643fec4 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -85,17 +85,17 @@ async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/ else: return await self._get(path, api_prefix) - async def _get(self, path: str, api_prefix: str = '/api') -> httpx.Response: + async def _get(self, path: str, api_prefix: str) -> httpx.Response: return await self.async_client.get(api_prefix+path) async def put(self, path: str, files: Optional[List[str]] = None, requires_login: bool = True, api_prefix: str = '/api', **kwargs) -> httpx.Response: if requires_login: await self.ensure_login() - return await self.retry_on_401(self._put, path, api_prefix, **kwargs) + return await self.retry_on_401(self._put, path, files, api_prefix, **kwargs) else: return await self._put(path, files, api_prefix, **kwargs) - async def _put(self, path: str, files: Optional[List[str]] = None, api_prefix='/api', **kwargs) -> httpx.Response: + async def _put(self, path: str, files: Optional[List[str]], api_prefix: str, **kwargs) -> httpx.Response: if files is None: return await self.async_client.put(api_prefix+path, **kwargs) @@ -134,5 +134,5 @@ async def delete(self, path: str, requires_login: bool = True, api_prefix: str = else: return await self._delete(path, api_prefix, **kwargs) - async def _delete(self, path, api_prefix='/api', **kwargs) -> httpx.Response: + async def _delete(self, path, api_prefix, **kwargs) -> httpx.Response: return await self.async_client.delete(api_prefix+path, **kwargs) From eaab2bccb32bf30495a6fc7131d6c7e27f3fc780 Mon Sep 17 00:00:00 2001 From: "Dr. 
Dennis Wittich" Date: Mon, 15 Apr 2024 15:36:36 +0200 Subject: [PATCH 61/62] try rerduce flakyness of test --- learning_loop_node/tests/test_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_loop_node/tests/test_executor.py b/learning_loop_node/tests/test_executor.py index 1842f71e..1dbae97c 100644 --- a/learning_loop_node/tests/test_executor.py +++ b/learning_loop_node/tests/test_executor.py @@ -37,7 +37,7 @@ async def test_executor_lifecycle(): assert executor.is_running() assert_process_is_running('some_executable.sh') - sleep(1) + sleep(5) assert 'some output' in executor.get_log() await executor.stop_and_wait() From 11d0b66eaaa3baadc1d6fdcc6330f5fd3d9ddd33 Mon Sep 17 00:00:00 2001 From: "Dr. Dennis Wittich" Date: Mon, 15 Apr 2024 17:45:33 +0200 Subject: [PATCH 62/62] fix test of mock_trainer --- mock_trainer/app_code/tests/test_mock_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py index a5d397f5..e2b518b0 100644 --- a/mock_trainer/app_code/tests/test_mock_trainer.py +++ b/mock_trainer/app_code/tests/test_mock_trainer.py @@ -19,7 +19,7 @@ async def create_mock_trainer() -> MockTrainerLogic: async def test_get_model_files(setup_test_project2): mock_trainer = await create_mock_trainer() - files = mock_trainer._get_latest_model_files() + files = await mock_trainer._get_latest_model_files() assert isinstance(files, Dict)