From 7293f7a6e5e580614c4feffba89a00d4457bd90f Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Sun, 28 Apr 2024 00:18:40 +0200 Subject: [PATCH 01/32] Update Trigger interface. Add DataDriftTrigger initialization and inform logic. --- .../pipeline_executor/pipeline_executor.py | 4 +- .../internal/triggers/amounttrigger.py | 17 ++- .../internal/triggers/datadrifttrigger.py | 112 ++++++++++++++++++ .../internal/triggers/timetrigger.py | 18 ++- modyn/supervisor/internal/triggers/trigger.py | 24 +++- 5 files changed, 165 insertions(+), 10 deletions(-) create mode 100644 modyn/supervisor/internal/triggers/datadrifttrigger.py diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index 6cd46db2b..3cd5865fa 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -6,7 +6,7 @@ import sys import traceback from time import sleep -from typing import Any, Optional +from typing import Any, Optional, Generator from modyn.common.benchmark import Stopwatch from modyn.supervisor.internal.evaluation_result_writer import AbstractEvaluationResultWriter, LogResultWriter @@ -113,6 +113,8 @@ def _setup_trigger(self) -> None: trigger_module = dynamic_module_import("modyn.supervisor.internal.triggers") self.trigger: Trigger = getattr(trigger_module, trigger_id)(trigger_config) + self.trigger.init_trigger(self.pipeline_id, self.pipeline_config, self.modyn_config, self.eval_directory) + self.trigger.inform_previous_model(self.previous_model_id) assert self.trigger is not None, "Error during trigger initialization" diff --git a/modyn/supervisor/internal/triggers/amounttrigger.py b/modyn/supervisor/internal/triggers/amounttrigger.py index af2bc8d8e..32eea18a4 100644 --- a/modyn/supervisor/internal/triggers/amounttrigger.py +++ b/modyn/supervisor/internal/triggers/amounttrigger.py @@ -1,3 +1,5 @@ +import pathlib +from typing import Generator from modyn.supervisor.internal.triggers.trigger import Trigger @@ -13,8 +15,13 @@ def __init__(self, trigger_config: dict): self.remaining_data_points = 0 super().__init__(trigger_config) + + def init_trigger( + self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path + ) -> None: + pass - def inform(self, new_data: list[tuple[int, int, int]]) -> list[int]: + def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: assert self.remaining_data_points < self.data_points_for_trigger, "Inconsistent remaining datapoints" first_idx = self.data_points_for_trigger - self.remaining_data_points - 1 @@ -22,4 +29,10 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> list[int]: self.remaining_data_points = (self.remaining_data_points + len(new_data)) % self.data_points_for_trigger - return triggering_indices + yield from triggering_indices + + def inform_previous_trigger(self, previous_trigger_id: int) -> None: + pass + + def inform_previous_model(self, previous_model_id: int) -> None: + pass diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py new file mode 100644 index 000000000..b10771aee --- /dev/null +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -0,0 +1,112 @@ +import logging +import os +import pathlib +from typing import Optional, Generator + +# pylint: disable-next=no-name-in-module +from modyn.supervisor.internal.triggers.trigger import Trigger + + 
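+# A sketch of the trigger_config this trigger consumes, inferred from
+# _parse_trigger_config() below; the key name comes from this patch, while
+# the value shown is only illustrative:
+#
+#   trigger_config = {
+#       "data_points_for_detection": 1000,  # size of each drift detection window
+#   }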
+logger = logging.getLogger(__name__) + + +class DataDriftTrigger(Trigger): + """Triggers when a certain number of data points have been used.""" + + def __init__(self, trigger_config: dict): + self.trigger_config = trigger_config + self.pipeline_id: Optional[int] = None + self.pipeline_config: Optional[dict] = None + self.modyn_config: Optional[dict] = None + self.base_dir: Optional[pathlib.Path] = None + + self.previous_trigger_id: Optional[int] = None + self.previous_model_id: Optional[int] = None + + self.data_cache = [] + self.leftover_data_points = 0 + + super().__init__(trigger_config) + + + def _parse_trigger_config(self) -> None: + self.detection_interval: int = 1000 + if "data_points_for_detection" in self.trigger_config.keys(): + self.detection_interval = self.trigger_config["data_points_for_detection"] + assert self.detection_interval > 0, "data_points_for_trigger needs to be at least 1" + + def _create_dirs(self) -> None: + assert self.pipeline_id is not None + assert self.base_dir is not None + + self.exp_output_dir = self.base_dir / str(self.pipeline_id) / f"datadrift" + self.drift_dir = self.exp_output_dir / "drift_results" + os.makedirs(self.drift_dir, exist_ok=True) + self.timers_dir = self.exp_output_dir / "timers" + os.makedirs(self.timers_dir, exist_ok=True) + + def init_trigger( + self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path + ) -> None: + self.pipeline_id = pipeline_id + self.pipeline_config = pipeline_config + self.modyn_config = modyn_config + self.base_dir = base_dir + + if "trigger_config" in self.pipeline_config["trigger"].keys(): + trigger_config = self.pipeline_config["trigger"]["trigger_config"] + self._parse_trigger_config(trigger_config) + self._create_dirs() + + + def detect_drift(self, idx_start, idx_end) -> bool: + pass + + + def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: + """ + Use Generator here because this data drift trigger + needs to wait for the previous trigger to finish and get the model + """ + # add new data to data_cache + self.data_cache.extend(new_data) + + unvisited_data_points = len(self.data_cache) + untriggered_data_points = unvisited_data_points + # the sliding window of data points for detection + detection_idx_start = 0 + detection_idx_end = 0 + + while unvisited_data_points >= self.detection_interval: + unvisited_data_points -= self.detection_interval + detection_idx_end += self.detection_interval + if detection_idx_end <= self.leftover_data_points: + continue + + # trigger_id doesn't always start from 0 + if self.previous_trigger_id is None: + # if no previous trigger exists, always trigger retraining + triggered = True + else: + # if exist previous trigger, detect drift + triggered = self.detect_drift(detection_idx_start, detection_idx_end) + + + if triggered: + trigger_data_points = detection_idx_end - detection_idx_start + trigger_idx = len(new_data) - (untriggered_data_points - trigger_data_points) - 1 + + # update bookkeeping and sliding window + untriggered_data_points -= trigger_data_points + detection_idx_start = detection_idx_end + yield trigger_idx + + # remove triggered data + del self.data_cache[:detection_idx_start] + self.leftover_data_points = detection_idx_end - detection_idx_start + + def inform_previous_trigger(self, previous_trigger_id: int) -> None: + self.previous_trigger_id = previous_trigger_id + + def inform_previous_model(self, previous_model_id: int) -> None: + self.previous_model_id = previous_model_id diff --git 
a/modyn/supervisor/internal/triggers/timetrigger.py b/modyn/supervisor/internal/triggers/timetrigger.py
index ed944164c..e1f9414e7 100644
--- a/modyn/supervisor/internal/triggers/timetrigger.py
+++ b/modyn/supervisor/internal/triggers/timetrigger.py
@@ -1,4 +1,5 @@
-from typing import Optional
+import pathlib
+from typing import Optional, Generator
 
 from modyn.supervisor.internal.triggers.trigger import Trigger
 from modyn.utils import convert_timestr_to_seconds, validate_timestr
@@ -25,7 +26,12 @@ def __init__(self, trigger_config: dict):
 
         super().__init__(trigger_config)
 
-    def inform(self, new_data: list[tuple[int, int, int]]) -> list[int]:
+    def init_trigger(
+        self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path
+    ) -> None:
+        pass
+
+    def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]:
         if self.next_trigger_at is None:
             if len(new_data) > 0:
                 self.next_trigger_at = new_data[0][1] + self.trigger_every_s  # new_data is sorted
@@ -49,4 +55,10 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> list[int]:
                 triggering_indices.append(idx - 1)
 
                 self.next_trigger_at += self.trigger_every_s
 
-        return triggering_indices
+        yield from triggering_indices
+
+    def inform_previous_trigger(self, previous_trigger_id: int) -> None:
+        pass
+
+    def inform_previous_model(self, previous_model_id: int) -> None:
+        pass
diff --git a/modyn/supervisor/internal/triggers/trigger.py b/modyn/supervisor/internal/triggers/trigger.py
index 642fdf0cf..5f40bd1f2 100644
--- a/modyn/supervisor/internal/triggers/trigger.py
+++ b/modyn/supervisor/internal/triggers/trigger.py
@@ -1,16 +1,24 @@
+import pathlib
+from typing import Generator
 from abc import ABC, abstractmethod
 
 
 class Trigger(ABC):
     def __init__(self, trigger_config: dict) -> None:
         assert trigger_config is not None, "trigger_config cannot be None."
+
+    @abstractmethod
+    def init_trigger(
+        self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path
+    ) -> None:
+        """The supervisor initializes the trigger with trigger type specific configurations"""
 
     @abstractmethod
-    def inform(self, new_data: list[tuple[int, int, int]]) -> list[int]:
-        """The supervisor informs the trigger about new data.
-        In case the concrete trigger implementation decides to trigger, we return a list of _indices into new_data_.
+    def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]:
+        """The supervisor informs the Trigger about new data.
+        In case the concrete Trigger implementation decides to trigger, we yield _indices into new_data_.
 
         This list contains the indices of all data points that cause a trigger.
-        The list might be empty or only contain a single element, which concrete triggers need to respect.
+        The list might be empty or only contain a single element, which concrete Triggers need to respect.
 
         Parameters:
             new_data (list[tuple[str, int, int]]): List of new data (keys, timestamps, labels). Can be empty.
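For reference, a minimal concrete Trigger under the updated interface could look
as follows (an illustrative sketch that fires on every data point; it is not
part of the patch):

    import pathlib
    from typing import Generator

    from modyn.supervisor.internal.triggers.trigger import Trigger


    class EveryPointTrigger(Trigger):
        def init_trigger(
            self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path
        ) -> None:
            pass  # this toy trigger needs no pipeline-specific state

        def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]:
            # yield lazily so the supervisor can run a training between two indices
            yield from range(len(new_data))

        def inform_previous_trigger(self, previous_trigger_id: int) -> None:
            pass

        def inform_previous_model(self, previous_model_id: int) -> None:
            pass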
@@ -18,3 +26,11 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> list[int]: Returns: triggering_indices (list[int]): List of all indices that trigger training """ + + @abstractmethod + def inform_previous_trigger(self, previous_trigger_id: int) -> None: + """The supervisor informs the Trigger about the previous trigger_id""" + + @abstractmethod + def inform_previous_model(self, previous_model_id: int) -> None: + """The supervisor informs the Trigger about the model_id of the previous trigger""" From b02214c4724be266d2ec7c0d916917b991ee6a41 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Sun, 28 Apr 2024 00:22:29 +0200 Subject: [PATCH 02/32] Update Trigger init_trigger() explanation. --- modyn/supervisor/internal/triggers/trigger.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modyn/supervisor/internal/triggers/trigger.py b/modyn/supervisor/internal/triggers/trigger.py index 5f40bd1f2..df0fe60d7 100644 --- a/modyn/supervisor/internal/triggers/trigger.py +++ b/modyn/supervisor/internal/triggers/trigger.py @@ -11,7 +11,9 @@ def __init__(self, trigger_config: dict) -> None: def init_trigger( self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path ) -> None: - """The supervisor initializes the trigger with trigger type specific configurations""" + """The supervisor initializes the concrete Trigger with Trigger-type-specific configurations + base_dir: the base directory to store Trigger outputs. A location at the supervisor. + """ @abstractmethod def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: From a17b1cb9716e2ecca23e977fc58a549f9d501b8d Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Sun, 28 Apr 2024 00:50:23 +0200 Subject: [PATCH 03/32] Add Trigger dataset and dataloader utils --- .../internal/triggers/amounttrigger.py | 3 + .../internal/triggers/datadrifttrigger.py | 95 +++++++++++- .../internal/triggers/timetrigger.py | 3 + modyn/supervisor/internal/triggers/trigger.py | 4 + .../triggers/trigger_datasets/__init__.py | 13 ++ .../trigger_datasets/dataloader_info.py | 31 ++++ .../trigger_datasets/fixed_keys_dataset.py | 135 ++++++++++++++++++ .../online_trigger_dataset.py | 55 +++++++ modyn/supervisor/internal/triggers/utils.py | 68 +++++++++ 9 files changed, 405 insertions(+), 2 deletions(-) create mode 100644 modyn/supervisor/internal/triggers/trigger_datasets/__init__.py create mode 100644 modyn/supervisor/internal/triggers/trigger_datasets/dataloader_info.py create mode 100644 modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py create mode 100644 modyn/supervisor/internal/triggers/trigger_datasets/online_trigger_dataset.py create mode 100644 modyn/supervisor/internal/triggers/utils.py diff --git a/modyn/supervisor/internal/triggers/amounttrigger.py b/modyn/supervisor/internal/triggers/amounttrigger.py index 32eea18a4..43b211cb0 100644 --- a/modyn/supervisor/internal/triggers/amounttrigger.py +++ b/modyn/supervisor/internal/triggers/amounttrigger.py @@ -36,3 +36,6 @@ def inform_previous_trigger(self, previous_trigger_id: int) -> None: def inform_previous_model(self, previous_model_id: int) -> None: pass + + def inform_previous_trigger_data_points(self, previous_trigger_id: int, data_points: int) -> None: + pass diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index b10771aee..14a10f89c 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ 
b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -5,6 +5,11 @@ # pylint: disable-next=no-name-in-module from modyn.supervisor.internal.triggers.trigger import Trigger +from modyn.supervisor.internal.triggers.trigger_datasets import DataLoaderInfo +from modyn.supervisor.internal.triggers.utils import ( + prepare_trigger_dataloader_by_trigger, + prepare_trigger_dataloader_fixed_keys, +) logger = logging.getLogger(__name__) @@ -22,6 +27,12 @@ def __init__(self, trigger_config: dict): self.previous_trigger_id: Optional[int] = None self.previous_model_id: Optional[int] = None + self.previous_data_points: Optional[int] = None + + self.dataloader_info: Optional[DataLoaderInfo] = None + + self.detection_interval: int = 1000 + self.sample_size: Optional[int] = None self.data_cache = [] self.leftover_data_points = 0 @@ -30,10 +41,65 @@ def __init__(self, trigger_config: dict): def _parse_trigger_config(self) -> None: - self.detection_interval: int = 1000 if "data_points_for_detection" in self.trigger_config.keys(): self.detection_interval = self.trigger_config["data_points_for_detection"] assert self.detection_interval > 0, "data_points_for_trigger needs to be at least 1" + + if "sample_size" in self.trigger_config.keys(): + self.sample_size = self.trigger_config["sample_size"] + assert self.sample_size is None or self.sample_size > 0, "sample_size needs to be at least 1" + + def _init_dataloader_info(self) -> None: + assert self.pipeline_id is not None + assert self.pipeline_config is not None + assert self.modyn_config is not None + assert self.base_dir is not None + + training_config = self.pipeline_config["training"] + data_config = self.pipeline_config["data"] + + if "num_prefetched_partitions" in training_config: + num_prefetched_partitions = training_config["num_prefetched_partitions"] + else: + if "prefetched_partitions" in training_config: + raise ValueError( + "Found `prefetched_partitions` instead of `num_prefetched_partitions`in training configuration." + + " Please rename/remove that configuration" + ) + logger.warning("Number of prefetched partitions not explicitly given in training config - defaulting to 1.") + num_prefetched_partitions = 1 + + if "parallel_prefetch_requests" in training_config: + parallel_prefetch_requests = training_config["parallel_prefetch_requests"] + else: + logger.warning( + "Number of parallel prefetch requests not explicitly given in training config - defaulting to 1." 
+ ) + parallel_prefetch_requests = 1 + + if "tokenizer" in data_config: + tokenizer = data_config["tokenizer"] + else: + tokenizer = None + + if "transformations" in data_config: + transform_list = data_config["transformations"] + else: + transform_list = [] + + self.dataloader_info = DataLoaderInfo( + self.pipeline_id, + dataset_id=data_config["dataset_id"], + num_dataloaders=training_config["dataloader_workers"], + batch_size=training_config["batch_size"], + bytes_parser=data_config["bytes_parser_function"], + transform_list=transform_list, + storage_address=f"{self.modyn_config['storage']['hostname']}:{self.modyn_config['storage']['port']}", + selector_address=f"{self.modyn_config['selector']['hostname']}:{self.modyn_config['selector']['port']}", + num_prefetched_partitions=num_prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, + tokenizer=tokenizer, + ) def _create_dirs(self) -> None: assert self.pipeline_id is not None @@ -56,11 +122,32 @@ def init_trigger( if "trigger_config" in self.pipeline_config["trigger"].keys(): trigger_config = self.pipeline_config["trigger"]["trigger_config"] self._parse_trigger_config(trigger_config) + + self._init_dataloader_info() self._create_dirs() def detect_drift(self, idx_start, idx_end) -> bool: - pass + assert self.previous_trigger_id is not None + assert self.previous_data_points is not None and self.previous_data_points > 0 + assert self.previous_model_id is not None + assert self.dataloader_info is not None + + reference_dataloader = prepare_trigger_dataloader_by_trigger( + self.previous_trigger_id, + self.dataloader_info, + data_points_in_trigger=self.previous_data_points, + sample_size=self.sample_size, + ) + + current_keys, _, _ = zip(*self.data_cache[idx_start:idx_end]) # type: ignore + current_dataloader = prepare_trigger_dataloader_fixed_keys( + self.previous_trigger_id + 1, + self.dataloader_info, + current_keys, + sample_size=self.sample_size, + ) + def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: @@ -110,3 +197,7 @@ def inform_previous_trigger(self, previous_trigger_id: int) -> None: def inform_previous_model(self, previous_model_id: int) -> None: self.previous_model_id = previous_model_id + + def inform_previous_trigger_data_points(self, previous_trigger_id: int, data_points: int) -> None: + assert self.previous_trigger_id == previous_trigger_id + self.previous_data_points = data_points diff --git a/modyn/supervisor/internal/triggers/timetrigger.py b/modyn/supervisor/internal/triggers/timetrigger.py index e1f9414e7..9a81ae216 100644 --- a/modyn/supervisor/internal/triggers/timetrigger.py +++ b/modyn/supervisor/internal/triggers/timetrigger.py @@ -62,3 +62,6 @@ def inform_previous_trigger(self, previous_trigger_id: int) -> None: def inform_previous_model(self, previous_model_id: int) -> None: pass + + def inform_previous_trigger_data_points(self, previous_trigger_id: int, data_points: int) -> None: + pass diff --git a/modyn/supervisor/internal/triggers/trigger.py b/modyn/supervisor/internal/triggers/trigger.py index df0fe60d7..cd10895c4 100644 --- a/modyn/supervisor/internal/triggers/trigger.py +++ b/modyn/supervisor/internal/triggers/trigger.py @@ -36,3 +36,7 @@ def inform_previous_trigger(self, previous_trigger_id: int) -> None: @abstractmethod def inform_previous_model(self, previous_model_id: int) -> None: """The supervisor informs the Trigger about the model_id of the previous trigger""" + + @abstractmethod + def inform_previous_trigger_data_points(self, 
previous_trigger_id: int, data_points: int) -> None: + """The supervisor informs the Trigger about data points trained in previous trigger""" diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/__init__.py b/modyn/supervisor/internal/triggers/trigger_datasets/__init__.py new file mode 100644 index 000000000..9805cc02a --- /dev/null +++ b/modyn/supervisor/internal/triggers/trigger_datasets/__init__.py @@ -0,0 +1,13 @@ +"""Supervisor module. The supervisor initiates a pipeline and coordinates all components. + +""" + +import os + +from .dataloader_info import DataLoaderInfo # noqa: F401 +from .online_trigger_dataset import OnlineTriggerDataset # noqa: F401 +from .fixed_keys_dataset import FixedKeysDataset # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/dataloader_info.py b/modyn/supervisor/internal/triggers/trigger_datasets/dataloader_info.py new file mode 100644 index 000000000..8d9cf7189 --- /dev/null +++ b/modyn/supervisor/internal/triggers/trigger_datasets/dataloader_info.py @@ -0,0 +1,31 @@ +from typing import Optional + + +# TODO(366): Unify with similar classes in trainer_server and evaluator +class DataLoaderInfo: + def __init__( + self, + pipeline_id: int, + dataset_id: str, + num_dataloaders: int, + batch_size: int, + bytes_parser: str, + transform_list: list[str], + storage_address: str, + selector_address: str, + num_prefetched_partitions: int, + parallel_prefetch_requests: int, + tokenizer: Optional[None], + ): + self.pipeline_id = pipeline_id + self.dataset_id = dataset_id + self.num_dataloaders = num_dataloaders + self.batch_size = batch_size + self.bytes_parser = bytes_parser + self.transform_list = transform_list + self.storage_address = storage_address + self.selector_address = selector_address + self.num_prefetched_partitions = num_prefetched_partitions + self.parallel_prefetch_requests = parallel_prefetch_requests + self.tokenizer = tokenizer + self.training_id = -1 diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py b/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py new file mode 100644 index 000000000..4981ceb1e --- /dev/null +++ b/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py @@ -0,0 +1,135 @@ +import logging +import math +from typing import Callable, Generator, Iterable, Optional + +import grpc +from modyn.storage.internal.grpc.generated.storage_pb2 import ( # pylint: disable=no-name-in-module + GetRequest, + GetResponse, +) +from modyn.storage.internal.grpc.generated.storage_pb2_grpc import StorageStub +from modyn.utils import ( + BYTES_PARSER_FUNC_NAME, + MAX_MESSAGE_SIZE, + deserialize_function, + grpc_connection_established, + instantiate_class, +) +from torch.utils.data import IterableDataset, get_worker_info +from torchvision import transforms + +logger = logging.getLogger(__name__) + + +# TODO: update __init__ +# TODO(#275): inherit common abstraction of dataset +class FixedKeysDataset(IterableDataset): + # pylint: disable=too-many-instance-attributes, abstract-method + def __init__( + self, + dataset_id: str, + bytes_parser: str, + serialized_transforms: list[str], + storage_address: str, + trigger_id: int, + keys: list[int], + tokenizer: Optional[str] = None, + ): + self._trigger_id = trigger_id + self._dataset_id = dataset_id + self._first_call = True + + self._bytes_parser = bytes_parser + 
self._serialized_transforms = serialized_transforms + self._storage_address = storage_address + self._transform_list: list[Callable] = [] + self._transform: Optional[Callable] = None + self._storagestub: StorageStub = None + self._bytes_parser_function: Optional[Callable] = None + + # tokenizer for NLP tasks + self._tokenizer = None + self._tokenizer_name = tokenizer + if tokenizer is not None: + self._tokenizer = instantiate_class("modyn.models.tokenizers", tokenizer) + + self._keys = keys + + logger.debug("Initialized FixedKeysDataset.") + + def _init_transforms(self) -> None: + self._bytes_parser_function = deserialize_function(self._bytes_parser, BYTES_PARSER_FUNC_NAME) + self._transform = self._bytes_parser_function + self._setup_composed_transform() + + def _setup_composed_transform(self) -> None: + assert self._bytes_parser_function is not None + + self._transform_list = [self._bytes_parser_function] + for transform in self._serialized_transforms: + function = eval(transform) # pylint: disable=eval-used + self._transform_list.append(function) + + if self._tokenizer is not None: + self._transform_list.append(self._tokenizer) + + if len(self._transform_list) > 0: + self._transform = transforms.Compose(self._transform_list) + + def _init_grpc(self) -> None: + storage_channel = grpc.insecure_channel( + self._storage_address, + options=[ + ("grpc.max_receive_message_length", MAX_MESSAGE_SIZE), + ("grpc.max_send_message_length", MAX_MESSAGE_SIZE), + ], + ) + if not grpc_connection_established(storage_channel): + raise ConnectionError(f"Could not establish gRPC connection to storage at address {self._storage_address}.") + self._storagestub = StorageStub(storage_channel) + + def _info(self, msg: str, worker_id: Optional[int]) -> None: # pragma: no cover + logger.info(f"[Trigger {self._trigger_id}][Worker {worker_id}] {msg}") + + def _debug(self, msg: str, worker_id: Optional[int]) -> None: # pragma: no cover + logger.debug(f"[Trigger {self._trigger_id}][Worker {worker_id}] {msg}") + + @staticmethod + def _silence_pil() -> None: # pragma: no cover + pil_logger = logging.getLogger("PIL") + pil_logger.setLevel(logging.INFO) # by default, PIL on DEBUG spams the console + + def _get_data_from_storage(self, keys: list[int]) -> Iterable[list[tuple[int, bytes, int]]]: + request = GetRequest(dataset_id=self._dataset_id, keys=keys) + response: GetResponse + for response in self._storagestub.Get(request): + yield list(zip(response.keys, response.samples, response.labels)) + + def __iter__(self) -> Generator: + worker_info = get_worker_info() + if worker_info is None: + # Non-multi-threaded data loading. We use worker_id 0. + worker_id = 0 + else: + worker_id = worker_info.id + + if self._first_call: + self._first_call = False + self._debug("This is the first run of iter, making gRPC connections.", worker_id) + # We have to initialize transformations and gRPC connections here to do it per dataloader worker, + # otherwise the transformations/gRPC connections cannot be pickled for the new processes. + self._init_transforms() + self._init_grpc() + FixedKeysDataset._silence_pil() + self._debug("gRPC initialized.", worker_id) + + assert self._transform is not None + + total_samples = len(self._keys) + keys_per_worker = int(math.ceil(total_samples / worker_info.num_workers)) + worker_keys = self._keys[worker_id * keys_per_worker : min(total_samples, (worker_id+1) * keys_per_worker)] + + # TODO(#175): we might want to do/accelerate prefetching here. 
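+        # Worked example of the sharding above: with 10 keys and 4 workers,
+        # keys_per_worker = ceil(10 / 4) = 3, so the workers iterate over the
+        # slices [0:3], [3:6], [6:9] and [9:10], covering every key exactly once.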
+ for data in self._get_data_from_storage(worker_keys): + for key, sample, label in data: + yield key, self._transform(sample), label diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/online_trigger_dataset.py b/modyn/supervisor/internal/triggers/trigger_datasets/online_trigger_dataset.py new file mode 100644 index 000000000..6f8993ada --- /dev/null +++ b/modyn/supervisor/internal/triggers/trigger_datasets/online_trigger_dataset.py @@ -0,0 +1,55 @@ +import logging +import random +from typing import Generator, Optional + +from modyn.trainer_server.internal.dataset.online_dataset import OnlineDataset +from torch.utils.data import IterableDataset + +logger = logging.getLogger(__name__) + + +# TODO(#275): inherit common abstraction of dataset +class OnlineTriggerDataset(OnlineDataset, IterableDataset): + # pylint: disable=too-many-instance-attributes, abstract-method + def __init__( + self, + pipeline_id: int, + trigger_id: int, + dataset_id: str, + bytes_parser: str, + serialized_transforms: list[str], + storage_address: str, + selector_address: str, + training_id: int, + num_prefetched_partitions: int, + parallel_prefetch_requests: int, + tokenizer: Optional[str] = None, + sample_prob: Optional[float] = None, + ): + OnlineDataset.__init__( + self, + pipeline_id, + trigger_id, + dataset_id, + bytes_parser, + serialized_transforms, + storage_address, + selector_address, + training_id, + num_prefetched_partitions, + parallel_prefetch_requests, + tokenizer, + None, + ) + self._sample_prob = sample_prob + + # pylint: disable=too-many-locals, too-many-branches, too-many-statements + + def __iter__(self) -> Generator: + for transformed_tuple in OnlineDataset.__iter__(self): + if self._sample_prob is not None: + prob = random.random() + if prob < self._sample_prob: + yield transformed_tuple + else: + yield transformed_tuple diff --git a/modyn/supervisor/internal/triggers/utils.py b/modyn/supervisor/internal/triggers/utils.py new file mode 100644 index 000000000..80cb0c5de --- /dev/null +++ b/modyn/supervisor/internal/triggers/utils.py @@ -0,0 +1,68 @@ +import logging +import random +from typing import Optional, Union + +from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader +from modyn.supervisor.internal.triggers.trigger_datasets import ( + DataLoaderInfo, + OnlineTriggerDataset, + FixedKeysDataset, +) + +import pandas as pd +import torch +from torch.utils.data import DataLoader + +logger = logging.getLogger(__name__) + + +def prepare_trigger_dataloader_by_trigger( + trigger_id: int, + dataloader_info: DataLoaderInfo, + data_points_in_trigger: Optional[int] = None, + sample_size: Optional[int] = None, +) -> DataLoader: + sample_prob: Optional[float] = None + if data_points_in_trigger is not None and sample_size is not None: + sample_prob = sample_size / data_points_in_trigger + + train_set = OnlineTriggerDataset( + dataloader_info.pipeline_id, + trigger_id, + dataloader_info.dataset_id, + dataloader_info.bytes_parser, + dataloader_info.transform_list, + dataloader_info.storage_address, + dataloader_info.selector_address, + dataloader_info.training_id, + dataloader_info.num_prefetched_partitions, + dataloader_info.parallel_prefetch_requests, + dataloader_info.tokenizer, + sample_prob, + ) + + logger.debug("Creating online trigger DataLoader.") + return DataLoader(train_set, batch_size=dataloader_info.batch_size, num_workers=dataloader_info.num_dataloaders) + + +def prepare_trigger_dataloader_fixed_keys( + trigger_id: int, + dataloader_info: DataLoaderInfo, + 
keys: list[int], + sample_size: Optional[int] = None, +) -> DataLoader: + if sample_size is not None: + keys = random.sample(keys, min(len(keys), sample_size)) + + train_set = FixedKeysDataset( + dataloader_info.dataset_id, + dataloader_info.bytes_parser, + dataloader_info.transform_list, + dataloader_info.storage_address, + trigger_id, + keys, + dataloader_info.tokenizer, + ) + + logger.debug("Creating fixed keys DataLoader.") + return DataLoader(train_set, batch_size=dataloader_info.batch_size, num_workers=dataloader_info.num_dataloaders) \ No newline at end of file From 0627a267456c19b956cbe4bc4c295f447314bbc1 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Sun, 28 Apr 2024 01:10:14 +0200 Subject: [PATCH 04/32] Modify pipeline_executor handle new data logic for DataDriftTrigger --- .../pipeline_executor/pipeline_executor.py | 84 ++++++++++--------- 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index 3cd5865fa..79e2cc398 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -191,25 +191,23 @@ def _handle_new_data(self, new_data: list[tuple[int, int, int]]) -> bool: def _handle_new_data_batch(self, batch: list[tuple[int, int, int]]) -> bool: self._sw.start("trigger_inform", overwrite=True) - triggering_indices = self.trigger.inform(batch) - num_triggers = len(triggering_indices) - self.pipeline_log["supervisor"]["num_triggers"] += len(triggering_indices) + triggering_indices: Generator[int] = self.trigger.inform(batch) + num_triggers = self._handle_triggers_within_batch(batch, triggering_indices) + + logger.info(f"There are {num_triggers} triggers in this batch.") + self.pipeline_log["supervisor"]["num_triggers"] += num_triggers self.pipeline_log["supervisor"]["trigger_batch_times"].append( {"batch_size": len(batch), "time": self._sw.stop("trigger_inform"), "num_triggers": num_triggers} ) - if num_triggers > 0: - logger.info(f"There are {num_triggers} triggers in this batch.") - self._handle_triggers_within_batch(batch, triggering_indices) - return True - - self._sw.start("selector_inform", overwrite=True) - selector_log = self.grpc.inform_selector(self.pipeline_id, batch) - self.pipeline_log["supervisor"]["selector_informs"].append( - {"total_selector_time": self._sw.stop(), "selector_log": selector_log} - ) + if num_triggers == 0: + self._sw.start("selector_inform", overwrite=True) + selector_log = self.grpc.inform_selector(self.pipeline_id, batch) + self.pipeline_log["supervisor"]["selector_informs"].append( + {"total_selector_time": self._sw.stop(), "selector_log": selector_log} + ) - return False + return num_triggers > 0 def _run_training(self, trigger_id: int) -> None: """Run training for trigger on GPU and block until done.""" @@ -239,6 +237,8 @@ def _run_training(self, trigger_id: int) -> None: # We store the trained model for evaluation in any case. self._sw.start("store_trained_model", overwrite=True) model_id = self.grpc.store_trained_model(self.current_training_id) + self.trigger.inform_previous_trigger(trigger_id) + self.trigger.inform_previous_model(model_id) self.pipeline_log["supervisor"]["triggers"][trigger_id]["store_trained_model_time"] = self._sw.stop() # Only if the pipeline actually wants to continue the training on it, we set previous model. 
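The change above matters because inform() is now a generator: the supervisor
pulls triggering indices one at a time, so anything pushed into the trigger
between two yields (such as the id of the model that has just finished
training) is visible when the generator resumes. A self-contained sketch of
that control flow, with a stub standing in for a real trigger:

    from typing import Generator

    class StubTrigger:
        def __init__(self) -> None:
            self.previous_model_id = None

        def inform(self, new_data: list) -> Generator[int, None, None]:
            for idx in range(len(new_data)):
                # on every resumption, previous_model_id reflects the latest training
                print(f"resuming with previous model {self.previous_model_id}")
                yield idx

    trigger = StubTrigger()
    for trigger_idx in trigger.inform(["a", "b", "c"]):
        trigger.previous_model_id = 100 + trigger_idx  # stand-in for train + store model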
@@ -268,8 +268,12 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg previous_trigger_idx = 0 logger.info("Handling triggers within batch.") self._update_pipeline_stage_and_enqueue_msg(PipelineStage.HANDLE_TRIGGERS_WITHIN_BATCH, MsgType.GENERAL) + num_triggers_in_batch = 0 + last_trigger_id: Optional[int] = None + + for triggering_idx in triggering_indices: + num_triggers_in_batch += 1 - for i, triggering_idx in enumerate(triggering_indices): self._update_pipeline_stage_and_enqueue_msg(PipelineStage.INFORM_SELECTOR_AND_TRIGGER, MsgType.GENERAL) triggering_data = batch[previous_trigger_idx : triggering_idx + 1] previous_trigger_idx = triggering_idx + 1 @@ -280,6 +284,7 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg # that all data until that point has been processed by the selector. self._sw.start("selector_inform", overwrite=True) trigger_id, selector_log = self.grpc.inform_selector_and_trigger(self.pipeline_id, triggering_data) + last_trigger_id = trigger_id self.pipeline_log["supervisor"]["triggers"][trigger_id] = { "total_selector_time": self._sw.stop(), "selector_log": selector_log, @@ -289,36 +294,39 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg num_samples_in_trigger = self.grpc.get_number_of_samples(self.pipeline_id, trigger_id) if num_samples_in_trigger > 0: self._run_training(trigger_id) # Blocks until training is done. + self.trigger.inform_previous_trigger_data_points(trigger_id, num_samples_in_trigger) self._update_pipeline_stage_and_enqueue_msg( PipelineStage.HANDLE_TRIGGERS_WITHIN_BATCH, MsgType.ID, id_submsg(IdType.TRIGGER, trigger_id) ) else: logger.info(f"Skipping training on empty trigger {trigger_id}]") - - # If no other trigger is coming in this batch, - # we have to inform the Selector about the remaining data in this batch. - if i == len(triggering_indices) - 1: - remaining_data = batch[triggering_idx + 1 :] - logger.info(f"There are {len(remaining_data)} data points remaining after the trigger.") - - if len(remaining_data) > 0: - # These data points will be included in the next trigger - # because we inform the Selector about them, - # just like other batches with no trigger at all are included. - self._update_pipeline_stage_and_enqueue_msg( - PipelineStage.INFORM_SELECTOR_REMAINING_DATA, MsgType.ID, id_submsg(IdType.TRIGGER, trigger_id) - ) - self._sw.start("selector_inform", overwrite=True) - selector_log = self.grpc.inform_selector(self.pipeline_id, remaining_data) - self.pipeline_log["supervisor"]["selector_informs"].append( - {"total_selector_time": self._sw.stop(), "selector_log": selector_log} - ) - - self._persist_pipeline_log() - self.num_triggers = self.num_triggers + 1 if self.maximum_triggers is not None and self.num_triggers >= self.maximum_triggers: - break + return num_triggers_in_batch + + # If no other trigger is coming in this batch, + # we have to inform the Selector about the remaining data in this batch. + + remaining_data = batch[previous_trigger_idx:] + logger.info(f"There are {len(remaining_data)} data points remaining after the trigger.") + + if len(remaining_data) > 0 and last_trigger_id is not None: + # These data points will be included in the next trigger + # because we inform the Selector about them, + # just like other batches with no trigger at all are included. 
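+            # Worked example: in a batch of 10 points with triggers at indices
+            # 3 and 7, previous_trigger_idx ends at 8, so the two points in
+            # batch[8:] are reported to the Selector here.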
+ self._update_pipeline_stage_and_enqueue_msg( + PipelineStage.INFORM_SELECTOR_REMAINING_DATA, + MsgType.ID, + id_submsg(IdType.TRIGGER, last_trigger_id), + ) + self._sw.start("selector_inform", overwrite=True) + selector_log = self.grpc.inform_selector(self.pipeline_id, remaining_data) + self.pipeline_log["supervisor"]["selector_informs"].append( + {"total_selector_time": self._sw.stop(), "selector_log": selector_log} + ) + + self._persist_pipeline_log() + return num_triggers_in_batch def _init_evaluation_writer(self, name: str, trigger_id: int) -> AbstractEvaluationResultWriter: return self.supervisor_supported_eval_result_writers[name](self.pipeline_id, trigger_id, self.eval_directory) From 90d89db2ab05cf9672d3c46901c69f03b7a74662 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Sun, 28 Apr 2024 01:15:40 +0200 Subject: [PATCH 05/32] Update interface to inform trigger id, data points and model id --- .../internal/pipeline_executor/pipeline_executor.py | 3 +-- modyn/supervisor/internal/triggers/amounttrigger.py | 5 +---- .../supervisor/internal/triggers/datadrifttrigger.py | 7 ++----- modyn/supervisor/internal/triggers/timetrigger.py | 7 ++----- modyn/supervisor/internal/triggers/trigger.py | 11 ++++------- 5 files changed, 10 insertions(+), 23 deletions(-) diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index 79e2cc398..e69a1d15d 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -237,7 +237,6 @@ def _run_training(self, trigger_id: int) -> None: # We store the trained model for evaluation in any case. self._sw.start("store_trained_model", overwrite=True) model_id = self.grpc.store_trained_model(self.current_training_id) - self.trigger.inform_previous_trigger(trigger_id) self.trigger.inform_previous_model(model_id) self.pipeline_log["supervisor"]["triggers"][trigger_id]["store_trained_model_time"] = self._sw.stop() @@ -293,8 +292,8 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg num_samples_in_trigger = self.grpc.get_number_of_samples(self.pipeline_id, trigger_id) if num_samples_in_trigger > 0: + self.trigger.inform_previous_trigger_and_data_points(trigger_id, num_samples_in_trigger) self._run_training(trigger_id) # Blocks until training is done. 
- self.trigger.inform_previous_trigger_data_points(trigger_id, num_samples_in_trigger) self._update_pipeline_stage_and_enqueue_msg( PipelineStage.HANDLE_TRIGGERS_WITHIN_BATCH, MsgType.ID, id_submsg(IdType.TRIGGER, trigger_id) ) diff --git a/modyn/supervisor/internal/triggers/amounttrigger.py b/modyn/supervisor/internal/triggers/amounttrigger.py index 43b211cb0..3b2de7092 100644 --- a/modyn/supervisor/internal/triggers/amounttrigger.py +++ b/modyn/supervisor/internal/triggers/amounttrigger.py @@ -31,11 +31,8 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N yield from triggering_indices - def inform_previous_trigger(self, previous_trigger_id: int) -> None: + def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data_points: int) -> None: pass def inform_previous_model(self, previous_model_id: int) -> None: pass - - def inform_previous_trigger_data_points(self, previous_trigger_id: int, data_points: int) -> None: - pass diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index 14a10f89c..a709c7980 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -192,12 +192,9 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N del self.data_cache[:detection_idx_start] self.leftover_data_points = detection_idx_end - detection_idx_start - def inform_previous_trigger(self, previous_trigger_id: int) -> None: + def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data_points: int) -> None: self.previous_trigger_id = previous_trigger_id + self.previous_data_points = data_points def inform_previous_model(self, previous_model_id: int) -> None: self.previous_model_id = previous_model_id - - def inform_previous_trigger_data_points(self, previous_trigger_id: int, data_points: int) -> None: - assert self.previous_trigger_id == previous_trigger_id - self.previous_data_points = data_points diff --git a/modyn/supervisor/internal/triggers/timetrigger.py b/modyn/supervisor/internal/triggers/timetrigger.py index 9a81ae216..9e15f795e 100644 --- a/modyn/supervisor/internal/triggers/timetrigger.py +++ b/modyn/supervisor/internal/triggers/timetrigger.py @@ -57,11 +57,8 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N yield from triggering_indices - def inform_previous_trigger(self, previous_trigger_id: int) -> None: + def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data_points: int) -> None: pass def inform_previous_model(self, previous_model_id: int) -> None: - pass - - def inform_previous_trigger_data_points(self, previous_trigger_id: int, data_points: int) -> None: - pass + pass \ No newline at end of file diff --git a/modyn/supervisor/internal/triggers/trigger.py b/modyn/supervisor/internal/triggers/trigger.py index cd10895c4..aae01b4a7 100644 --- a/modyn/supervisor/internal/triggers/trigger.py +++ b/modyn/supervisor/internal/triggers/trigger.py @@ -30,13 +30,10 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N """ @abstractmethod - def inform_previous_trigger(self, previous_trigger_id: int) -> None: - """The supervisor informs the Trigger about the previous trigger_id""" + def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data_points: int) -> None: + """The supervisor informs the Trigger about the previous trigger_id + and data points in the 
previous trigger""" @abstractmethod def inform_previous_model(self, previous_model_id: int) -> None: - """The supervisor informs the Trigger about the model_id of the previous trigger""" - - @abstractmethod - def inform_previous_trigger_data_points(self, previous_trigger_id: int, data_points: int) -> None: - """The supervisor informs the Trigger about data points trained in previous trigger""" + """The supervisor informs the Trigger about the model_id of the previous trigger""" \ No newline at end of file From 272a645287ae81dc69eed0128c404a21cfa4a301 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Sun, 28 Apr 2024 01:34:51 +0200 Subject: [PATCH 06/32] Add model downloader and embedding computation in DataDriftTrigger --- .../internal/triggers/datadrifttrigger.py | 28 ++++- .../triggers/model_downloader/__init__.py | 11 ++ .../model_downloader/model_downloader.py | 103 ++++++++++++++++++ modyn/supervisor/internal/triggers/utils.py | 47 +++++++- 4 files changed, 184 insertions(+), 5 deletions(-) create mode 100644 modyn/supervisor/internal/triggers/model_downloader/__init__.py create mode 100644 modyn/supervisor/internal/triggers/model_downloader/model_downloader.py diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index a709c7980..2e1e4bc19 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -2,13 +2,16 @@ import os import pathlib from typing import Optional, Generator +import pandas as pd # pylint: disable-next=no-name-in-module from modyn.supervisor.internal.triggers.trigger import Trigger from modyn.supervisor.internal.triggers.trigger_datasets import DataLoaderInfo +from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader from modyn.supervisor.internal.triggers.utils import ( prepare_trigger_dataloader_by_trigger, prepare_trigger_dataloader_fixed_keys, + get_embeddings_evidently_format ) @@ -28,10 +31,12 @@ def __init__(self, trigger_config: dict): self.previous_trigger_id: Optional[int] = None self.previous_model_id: Optional[int] = None self.previous_data_points: Optional[int] = None + self.model_updated: bool = False self.dataloader_info: Optional[DataLoaderInfo] = None + self.model_downloader: Optional[ModelDownloader] = None - self.detection_interval: int = 1000 + self.detection_interval: int = 2 self.sample_size: Optional[int] = None self.data_cache = [] @@ -108,8 +113,6 @@ def _create_dirs(self) -> None: self.exp_output_dir = self.base_dir / str(self.pipeline_id) / f"datadrift" self.drift_dir = self.exp_output_dir / "drift_results" os.makedirs(self.drift_dir, exist_ok=True) - self.timers_dir = self.exp_output_dir / "timers" - os.makedirs(self.timers_dir, exist_ok=True) def init_trigger( self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path @@ -127,6 +130,10 @@ def init_trigger( self._create_dirs() + def run_detection(self, reference_embeddings_df: pd.DataFrame, current_embeddings_df: pd.DataFrame) -> bool: + return True + + def detect_drift(self, idx_start, idx_end) -> bool: assert self.previous_trigger_id is not None assert self.previous_data_points is not None and self.previous_data_points > 0 @@ -148,6 +155,19 @@ def detect_drift(self, idx_start, idx_end) -> bool: sample_size=self.sample_size, ) + # Download previous model as embedding encoder + # TODO(366) Support custom model as embedding encoder + if self.model_updated: + 
self.model_downloader.download(self.previous_model_id) + self.model_updated = False + + # Compute embeddings + reference_embeddings_df = get_embeddings_evidently_format(self.model_downloader, reference_dataloader) + current_embeddings_df = get_embeddings_evidently_format(self.model_downloader, current_dataloader) + + drift_detected = self.run_detection(reference_embeddings_df, current_embeddings_df) + + return drift_detected def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: @@ -188,7 +208,7 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N detection_idx_start = detection_idx_end yield trigger_idx - # remove triggered data + # remove triggered data from cache del self.data_cache[:detection_idx_start] self.leftover_data_points = detection_idx_end - detection_idx_start diff --git a/modyn/supervisor/internal/triggers/model_downloader/__init__.py b/modyn/supervisor/internal/triggers/model_downloader/__init__.py new file mode 100644 index 000000000..6e319ec2a --- /dev/null +++ b/modyn/supervisor/internal/triggers/model_downloader/__init__.py @@ -0,0 +1,11 @@ +"""Supervisor module. The supervisor initiates a pipeline and coordinates all components. + +""" + +import os + +from .model_downloader import ModelDownloader # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py new file mode 100644 index 000000000..c79a9514f --- /dev/null +++ b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py @@ -0,0 +1,103 @@ +import io +import json +import logging +import pathlib +from typing import Optional + +import grpc +import pandas as pd +import torch +from modyn.common.ftp import download_trained_model +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.models import TrainedModel + +# pylint: disable-next=no-name-in-module +from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import FetchModelRequest, FetchModelResponse +from modyn.model_storage.internal.grpc.generated.model_storage_pb2_grpc import ModelStorageStub +from modyn.models.coreset_methods_support import CoresetSupportingModule +from modyn.utils import dynamic_module_import +from modyn.utils.utils import grpc_connection_established + +logger = logging.getLogger(__name__) + + +# TODO(366) Unify similar code in trainer_server and evaluator. Create common.utils.ModelDownloader +class ModelDownloader: + def __init__( + self, + modyn_config: dict, + pipeline_id: int, + device: str, + base_dir: pathlib.Path, + model_storage_address: str, + ): + self.modyn_config = modyn_config + self.pipeline_id = pipeline_id + self._device = device + self._device_type = "cuda" if "cuda" in self._device else "cpu" + self.base_dir = base_dir + assert self.base_dir.exists(), f"Temporary Directory {self.base_dir} should have been created." 
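+        # Note: the concrete model class and its configuration are not resolved
+        # here; configure() loads them per model_id from the metadata database.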
+ + self._model_storage_stub = self.connect_to_model_storage(model_storage_address) + + self.model_configuration_dict = None + self.model_handler = None + self._amp: Optional[bool] = None + self._model = None + + @staticmethod + def connect_to_model_storage(model_storage_address: str) -> ModelStorageStub: + model_storage_channel = grpc.insecure_channel(model_storage_address) + assert model_storage_channel is not None + if not grpc_connection_established(model_storage_channel): + raise ConnectionError( + f"Could not establish gRPC connection to model storage at address {model_storage_address}." + ) + return ModelStorageStub(model_storage_channel) + + def _load_state(self, path: pathlib.Path) -> None: + assert path.exists(), "Cannot load state from non-existing file" + + self._info(f"Loading model state from {path}") + with open(path, "rb") as state_file: + checkpoint = torch.load(io.BytesIO(state_file.read()), map_location=torch.device("cpu")) + + assert "model" in checkpoint + self._model.model.load_state_dict(checkpoint["model"]) + + # delete trained model from disk + path.unlink() + + def configure(self, model_id: int) -> None: + with MetadataDatabaseConnection(self.modyn_config) as database: + trained_model: Optional[TrainedModel] = database.session.get(TrainedModel, model_id) + if not trained_model: + logger.error(f"Trained model {model_id} does not exist!") + return + model_class_name, model_config, amp = database.get_model_configuration(trained_model.pipeline_id) + self._amp = amp + model_module = dynamic_module_import("modyn.models") + self.model_handler = getattr(model_module, model_class_name) + self.model_configuration_dict = json.loads(model_config) + self._model = self.model_handler(self.model_configuration_dict, self._device, amp) + assert isinstance(self._model.model, CoresetSupportingModule) + + def download(self, model_id: int) -> None: + self.configure(model_id) + + fetch_request = FetchModelRequest(model_id=model_id, load_metadata=False) + fetch_resp: FetchModelResponse = self._model_storage_stub.FetchModel(fetch_request) + + if not fetch_resp.success: + logger.error(f"Trained model {model_id} cannot be fetched from model storage. 
") + raise Exception("Failed to fetch trained model") # pylint: disable=broad-exception-raised + trained_model_path = download_trained_model( + logger=logger, + model_storage_config=self.modyn_config["model_storage"], + remote_path=pathlib.Path(fetch_resp.model_path), + checksum=fetch_resp.checksum, + identifier=self.pipeline_id, + base_directory=self.base_dir, + ) + self._load_state(trained_model_path) + diff --git a/modyn/supervisor/internal/triggers/utils.py b/modyn/supervisor/internal/triggers/utils.py index 80cb0c5de..5201713c6 100644 --- a/modyn/supervisor/internal/triggers/utils.py +++ b/modyn/supervisor/internal/triggers/utils.py @@ -65,4 +65,49 @@ def prepare_trigger_dataloader_fixed_keys( ) logger.debug("Creating fixed keys DataLoader.") - return DataLoader(train_set, batch_size=dataloader_info.batch_size, num_workers=dataloader_info.num_dataloaders) \ No newline at end of file + return DataLoader(train_set, batch_size=dataloader_info.batch_size, num_workers=dataloader_info.num_dataloaders) + + +def get_embeddings(model_downloader: ModelDownloader, dataloader: DataLoader) -> torch.Tensor: + """ + input: model_downloader with downloaded model + output: embeddings Tensor + """ + assert model_downloader._model is not None + all_embeddings: Optional[torch.Tensor] = None + + model_downloader._model.model.eval() + model_downloader._model.model.embedding_recorder.start_recording() + + with torch.no_grad(): + for batch in dataloader: + data: Union[torch.Tensor, dict] + if isinstance(batch[1], torch.Tensor): + data = batch[1].to(model_downloader._device) + elif isinstance(batch[1], dict): + data: dict[str, torch.Tensor] = {} # type: ignore[no-redef] + for name, tensor in batch[1].items(): + data[name] = tensor.to(model_downloader._device) + else: + raise ValueError(f"data type {type(batch[1])} not supported") + + with torch.autocast(model_downloader._device_type, enabled=model_downloader._amp): + model_downloader._model.model(data) + embeddings = model_downloader._model.model.embedding_recorder.embedding + if all_embeddings is None: + all_embeddings = embeddings + else: + all_embeddings = torch.cat((all_embeddings, embeddings), 0) + + model_downloader._model.model.embedding_recorder.end_recording() + + return all_embeddings + + +def get_embeddings_evidently_format( + model_downloader: ModelDownloader, dataloader: torch.utils.data.DataLoader +) -> pd.DataFrame: + embeddings_numpy = get_embeddings(model_downloader, dataloader).cpu().detach().numpy() + embeddings_df = pd.DataFrame(embeddings_numpy).astype("float64") + embeddings_df.columns = ["col_" + str(x) for x in embeddings_df.columns] + return embeddings_df From 07507e21632f2d97b07379d2ff32fda4ba32690f Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Sun, 28 Apr 2024 01:35:33 +0200 Subject: [PATCH 07/32] Add DataDriftTrigger to triggers __init__ --- modyn/supervisor/internal/triggers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modyn/supervisor/internal/triggers/__init__.py b/modyn/supervisor/internal/triggers/__init__.py index ac411500e..25eff0a86 100644 --- a/modyn/supervisor/internal/triggers/__init__.py +++ b/modyn/supervisor/internal/triggers/__init__.py @@ -4,6 +4,7 @@ import os +from .datadrifttrigger import DataDriftTrigger # noqa: F401 from .amounttrigger import DataAmountTrigger # noqa: F401 from .timetrigger import TimeTrigger # noqa: F401 from .trigger import Trigger # noqa: F401 From 8e99990f64d93dd09cf2bb1c8f9749c08f1513a1 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Sun, 28 Apr 2024 02:29:58 +0200 
Subject: [PATCH 08/32] Fix trigger config parser and model downloader in DataDriftTrigger --- .../internal/triggers/datadrifttrigger.py | 21 ++++++++++++------- .../model_downloader/model_downloader.py | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index 2e1e4bc19..b9625c8a4 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -55,11 +55,6 @@ def _parse_trigger_config(self) -> None: assert self.sample_size is None or self.sample_size > 0, "sample_size needs to be at least 1" def _init_dataloader_info(self) -> None: - assert self.pipeline_id is not None - assert self.pipeline_config is not None - assert self.modyn_config is not None - assert self.base_dir is not None - training_config = self.pipeline_config["training"] data_config = self.pipeline_config["data"] @@ -106,6 +101,15 @@ def _init_dataloader_info(self) -> None: tokenizer=tokenizer, ) + def _init_model_downloader(self) -> None: + self.model_downloader = ModelDownloader( + self.modyn_config, + self.pipeline_id, + self.pipeline_config["training"]["device"], + self.base_dir, + f"{self.modyn_config['model_storage']['hostname']}:{self.modyn_config['model_storage']['port']}", + ) + def _create_dirs(self) -> None: assert self.pipeline_id is not None assert self.base_dir is not None @@ -122,11 +126,11 @@ def init_trigger( self.modyn_config = modyn_config self.base_dir = base_dir - if "trigger_config" in self.pipeline_config["trigger"].keys(): - trigger_config = self.pipeline_config["trigger"]["trigger_config"] - self._parse_trigger_config(trigger_config) + if len(self.trigger_config) > 0: + self._parse_trigger_config() self._init_dataloader_info() + self._init_model_downloader() self._create_dirs() @@ -218,3 +222,4 @@ def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data def inform_previous_model(self, previous_model_id: int) -> None: self.previous_model_id = previous_model_id + self.model_updated = True diff --git a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py index c79a9514f..b1672b552 100644 --- a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py +++ b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py @@ -58,7 +58,7 @@ def connect_to_model_storage(model_storage_address: str) -> ModelStorageStub: def _load_state(self, path: pathlib.Path) -> None: assert path.exists(), "Cannot load state from non-existing file" - self._info(f"Loading model state from {path}") + logger.info(f"Loading model state from {path}") with open(path, "rb") as state_file: checkpoint = torch.load(io.BytesIO(state_file.read()), map_location=torch.device("cpu")) From 98bed24fdbeda84bb85268248f46fd51f56eceee Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Sun, 28 Apr 2024 02:58:23 +0200 Subject: [PATCH 09/32] Fix linting --- .../pipeline_executor/pipeline_executor.py | 4 +-- .../supervisor/internal/triggers/__init__.py | 2 +- .../internal/triggers/amounttrigger.py | 9 +++-- .../internal/triggers/datadrifttrigger.py | 34 ++++++++----------- .../model_downloader/model_downloader.py | 2 -- .../internal/triggers/timetrigger.py | 17 +++++----- modyn/supervisor/internal/triggers/trigger.py | 12 +++---- .../triggers/trigger_datasets/__init__.py | 2 +- .../trigger_datasets/fixed_keys_dataset.py | 3 
+- modyn/supervisor/internal/triggers/utils.py | 9 ++--- 10 files changed, 40 insertions(+), 54 deletions(-) diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index e69a1d15d..b71304035 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -6,7 +6,7 @@ import sys import traceback from time import sleep -from typing import Any, Optional, Generator +from typing import Any, Generator, Optional from modyn.common.benchmark import Stopwatch from modyn.supervisor.internal.evaluation_result_writer import AbstractEvaluationResultWriter, LogResultWriter @@ -305,7 +305,7 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg # If no other trigger is coming in this batch, # we have to inform the Selector about the remaining data in this batch. - + remaining_data = batch[previous_trigger_idx:] logger.info(f"There are {len(remaining_data)} data points remaining after the trigger.") diff --git a/modyn/supervisor/internal/triggers/__init__.py b/modyn/supervisor/internal/triggers/__init__.py index 25eff0a86..7c296b5ed 100644 --- a/modyn/supervisor/internal/triggers/__init__.py +++ b/modyn/supervisor/internal/triggers/__init__.py @@ -4,8 +4,8 @@ import os -from .datadrifttrigger import DataDriftTrigger # noqa: F401 from .amounttrigger import DataAmountTrigger # noqa: F401 +from .datadrifttrigger import DataDriftTrigger # noqa: F401 from .timetrigger import TimeTrigger # noqa: F401 from .trigger import Trigger # noqa: F401 diff --git a/modyn/supervisor/internal/triggers/amounttrigger.py b/modyn/supervisor/internal/triggers/amounttrigger.py index 3b2de7092..16d0c67b3 100644 --- a/modyn/supervisor/internal/triggers/amounttrigger.py +++ b/modyn/supervisor/internal/triggers/amounttrigger.py @@ -1,5 +1,6 @@ import pathlib from typing import Generator + from modyn.supervisor.internal.triggers.trigger import Trigger @@ -15,10 +16,8 @@ def __init__(self, trigger_config: dict): self.remaining_data_points = 0 super().__init__(trigger_config) - - def init_trigger( - self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path - ) -> None: + + def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path) -> None: pass def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: @@ -30,7 +29,7 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N self.remaining_data_points = (self.remaining_data_points + len(new_data)) % self.data_points_for_trigger yield from triggering_indices - + def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data_points: int) -> None: pass diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index b9625c8a4..f9ab98842 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -1,20 +1,20 @@ import logging import os import pathlib -from typing import Optional, Generator +from typing import Generator, Optional + import pandas as pd +from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader # pylint: disable-next=no-name-in-module from modyn.supervisor.internal.triggers.trigger import Trigger from modyn.supervisor.internal.triggers.trigger_datasets import 
DataLoaderInfo -from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader from modyn.supervisor.internal.triggers.utils import ( + get_embeddings_evidently_format, prepare_trigger_dataloader_by_trigger, prepare_trigger_dataloader_fixed_keys, - get_embeddings_evidently_format ) - logger = logging.getLogger(__name__) @@ -38,22 +38,24 @@ def __init__(self, trigger_config: dict): self.detection_interval: int = 2 self.sample_size: Optional[int] = None - + self.data_cache = [] self.leftover_data_points = 0 + self.exp_output_dir: Optional[pathlib.Path] = None + self.drift_dir: Optional[pathlib.Path] = None + super().__init__(trigger_config) - def _parse_trigger_config(self) -> None: if "data_points_for_detection" in self.trigger_config.keys(): self.detection_interval = self.trigger_config["data_points_for_detection"] assert self.detection_interval > 0, "data_points_for_trigger needs to be at least 1" - + if "sample_size" in self.trigger_config.keys(): self.sample_size = self.trigger_config["sample_size"] assert self.sample_size is None or self.sample_size > 0, "sample_size needs to be at least 1" - + def _init_dataloader_info(self) -> None: training_config = self.pipeline_config["training"] data_config = self.pipeline_config["data"] @@ -100,7 +102,7 @@ def _init_dataloader_info(self) -> None: parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=tokenizer, ) - + def _init_model_downloader(self) -> None: self.model_downloader = ModelDownloader( self.modyn_config, @@ -109,18 +111,16 @@ def _init_model_downloader(self) -> None: self.base_dir, f"{self.modyn_config['model_storage']['hostname']}:{self.modyn_config['model_storage']['port']}", ) - + def _create_dirs(self) -> None: assert self.pipeline_id is not None assert self.base_dir is not None - self.exp_output_dir = self.base_dir / str(self.pipeline_id) / f"datadrift" + self.exp_output_dir = self.base_dir / str(self.pipeline_id) / "datadrift" self.drift_dir = self.exp_output_dir / "drift_results" os.makedirs(self.drift_dir, exist_ok=True) - def init_trigger( - self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path - ) -> None: + def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path) -> None: self.pipeline_id = pipeline_id self.pipeline_config = pipeline_config self.modyn_config = modyn_config @@ -133,11 +133,9 @@ def init_trigger( self._init_model_downloader() self._create_dirs() - def run_detection(self, reference_embeddings_df: pd.DataFrame, current_embeddings_df: pd.DataFrame) -> bool: return True - def detect_drift(self, idx_start, idx_end) -> bool: assert self.previous_trigger_id is not None assert self.previous_data_points is not None and self.previous_data_points > 0 @@ -164,7 +162,7 @@ def detect_drift(self, idx_start, idx_end) -> bool: if self.model_updated: self.model_downloader.download(self.previous_model_id) self.model_updated = False - + # Compute embeddings reference_embeddings_df = get_embeddings_evidently_format(self.model_downloader, reference_dataloader) current_embeddings_df = get_embeddings_evidently_format(self.model_downloader, current_dataloader) @@ -173,7 +171,6 @@ def detect_drift(self, idx_start, idx_end) -> bool: return drift_detected - def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: """ Use Generator here because this data drift trigger @@ -202,7 +199,6 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N # if exist previous trigger, 
detect drift triggered = self.detect_drift(detection_idx_start, detection_idx_end) - if triggered: trigger_data_points = detection_idx_end - detection_idx_start trigger_idx = len(new_data) - (untriggered_data_points - trigger_data_points) - 1 diff --git a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py index b1672b552..c5b1c01cd 100644 --- a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py +++ b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py @@ -5,7 +5,6 @@ from typing import Optional import grpc -import pandas as pd import torch from modyn.common.ftp import download_trained_model from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection @@ -100,4 +99,3 @@ def download(self, model_id: int) -> None: base_directory=self.base_dir, ) self._load_state(trained_model_path) - diff --git a/modyn/supervisor/internal/triggers/timetrigger.py b/modyn/supervisor/internal/triggers/timetrigger.py index 9e15f795e..26bcdf775 100644 --- a/modyn/supervisor/internal/triggers/timetrigger.py +++ b/modyn/supervisor/internal/triggers/timetrigger.py @@ -1,5 +1,5 @@ import pathlib -from typing import Optional, Generator +from typing import Generator, Optional from modyn.supervisor.internal.triggers.trigger import Trigger from modyn.utils import convert_timestr_to_seconds, validate_timestr @@ -26,24 +26,25 @@ def __init__(self, trigger_config: dict): super().__init__(trigger_config) - def init_trigger( - self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path - ) -> None: - pass + def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path) -> None: + pass def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: if self.next_trigger_at is None: if len(new_data) > 0: self.next_trigger_at = new_data[0][1] + self.trigger_every_s # new_data is sorted else: - return [] + return max_timestamp = new_data[-1][1] # new_data is sorted triggering_indices = [] while self.next_trigger_at <= max_timestamp: # The next line gets the first item which has a timestamp larger or equal to the triggering timestamp - idx = next(idx for (idx, (_, timestamp, _)) in enumerate(new_data) if timestamp >= self.next_trigger_at) + try: + idx = next(idx for (idx, (_, timestamp, _)) in enumerate(new_data) if timestamp >= self.next_trigger_at) + except StopIteration: + break # This index `idx` describes the first item not belonging to the trigger. # Hence, the previous item causes a trigger. # If this is the first item, then we need to emit a trigger for index -1. @@ -61,4 +62,4 @@ def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data pass def inform_previous_model(self, previous_model_id: int) -> None: - pass \ No newline at end of file + pass diff --git a/modyn/supervisor/internal/triggers/trigger.py b/modyn/supervisor/internal/triggers/trigger.py index aae01b4a7..50e847429 100644 --- a/modyn/supervisor/internal/triggers/trigger.py +++ b/modyn/supervisor/internal/triggers/trigger.py @@ -1,16 +1,14 @@ import pathlib -from typing import Generator from abc import ABC, abstractmethod +from typing import Generator class Trigger(ABC): def __init__(self, trigger_config: dict) -> None: assert trigger_config is not None, "trigger_config cannot be None." 
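         # Call order, as the pipeline executor and the unit tests later in
         # this series exercise it: init_trigger() is invoked once at pipeline
         # start, inform() for every batch of new data, and the
         # inform_previous_* hooks once a trigger has been handled.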
- + @abstractmethod - def init_trigger( - self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path - ) -> None: + def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path) -> None: """The supervisor initializes the concrete Trigger with Trigger-type-specific configurations base_dir: the base directory to store Trigger outputs. A location at the supervisor. """ @@ -28,7 +26,7 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N Returns: triggering_indices (list[int]): List of all indices that trigger training """ - + @abstractmethod def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data_points: int) -> None: """The supervisor informs the Trigger about the previous trigger_id @@ -36,4 +34,4 @@ def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data @abstractmethod def inform_previous_model(self, previous_model_id: int) -> None: - """The supervisor informs the Trigger about the model_id of the previous trigger""" \ No newline at end of file + """The supervisor informs the Trigger about the model_id of the previous trigger""" diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/__init__.py b/modyn/supervisor/internal/triggers/trigger_datasets/__init__.py index 9805cc02a..87f98de46 100644 --- a/modyn/supervisor/internal/triggers/trigger_datasets/__init__.py +++ b/modyn/supervisor/internal/triggers/trigger_datasets/__init__.py @@ -5,8 +5,8 @@ import os from .dataloader_info import DataLoaderInfo # noqa: F401 -from .online_trigger_dataset import OnlineTriggerDataset # noqa: F401 from .fixed_keys_dataset import FixedKeysDataset # noqa: F401 +from .online_trigger_dataset import OnlineTriggerDataset # noqa: F401 files = os.listdir(os.path.dirname(__file__)) files.remove("__init__.py") diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py b/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py index 4981ceb1e..538481b2d 100644 --- a/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py +++ b/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py @@ -21,7 +21,6 @@ logger = logging.getLogger(__name__) -# TODO: update __init__ # TODO(#275): inherit common abstraction of dataset class FixedKeysDataset(IterableDataset): # pylint: disable=too-many-instance-attributes, abstract-method @@ -127,7 +126,7 @@ def __iter__(self) -> Generator: total_samples = len(self._keys) keys_per_worker = int(math.ceil(total_samples / worker_info.num_workers)) - worker_keys = self._keys[worker_id * keys_per_worker : min(total_samples, (worker_id+1) * keys_per_worker)] + worker_keys = self._keys[worker_id * keys_per_worker : min(total_samples, (worker_id + 1) * keys_per_worker)] # TODO(#175): we might want to do/accelerate prefetching here. 
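         # Worked example of the sharding above: with 10 keys and 4 workers,
         # keys_per_worker = ceil(10 / 4) = 3, so the workers receive
         # keys[0:3], keys[3:6], keys[6:9] and keys[9:10]; the min() keeps the
         # last worker from slicing past the end of the key list.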
for data in self._get_data_from_storage(worker_keys): diff --git a/modyn/supervisor/internal/triggers/utils.py b/modyn/supervisor/internal/triggers/utils.py index 5201713c6..d9c3e0067 100644 --- a/modyn/supervisor/internal/triggers/utils.py +++ b/modyn/supervisor/internal/triggers/utils.py @@ -2,15 +2,10 @@ import random from typing import Optional, Union -from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader -from modyn.supervisor.internal.triggers.trigger_datasets import ( - DataLoaderInfo, - OnlineTriggerDataset, - FixedKeysDataset, -) - import pandas as pd import torch +from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader +from modyn.supervisor.internal.triggers.trigger_datasets import DataLoaderInfo, FixedKeysDataset, OnlineTriggerDataset from torch.utils.data import DataLoader logger = logging.getLogger(__name__) From 7c1b0bde8b6e2b68caf7b83fad7358c61a0c54a7 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 30 Apr 2024 19:51:40 +0200 Subject: [PATCH 10/32] Add evidently config parser and data drift detection --- .../internal/triggers/datadrifttrigger.py | 31 ++++++++++++++++++- modyn/supervisor/internal/triggers/utils.py | 24 ++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index f9ab98842..abe755a2b 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -1,5 +1,6 @@ import logging import os +import json import pathlib from typing import Generator, Optional @@ -13,8 +14,14 @@ get_embeddings_evidently_format, prepare_trigger_dataloader_by_trigger, prepare_trigger_dataloader_fixed_keys, + get_evidently_metrics, ) +from evidently import ColumnMapping +from evidently.metrics import EmbeddingsDriftMetric +from evidently.metrics.data_drift.embedding_drift_methods import distance, mmd, model, ratio +from evidently.report import Report + logger = logging.getLogger(__name__) @@ -38,6 +45,8 @@ def __init__(self, trigger_config: dict): self.detection_interval: int = 2 self.sample_size: Optional[int] = None + self.evidently_column_mapping_name = "data" + self.metrics: Optional[list] = None self.data_cache = [] self.leftover_data_points = 0 @@ -55,6 +64,8 @@ def _parse_trigger_config(self) -> None: if "sample_size" in self.trigger_config.keys(): self.sample_size = self.trigger_config["sample_size"] assert self.sample_size is None or self.sample_size > 0, "sample_size needs to be at least 1" + + self.metrics = get_evidently_metrics(self.evidently_column_mapping_name, self.trigger_config) def _init_dataloader_info(self) -> None: training_config = self.pipeline_config["training"] @@ -134,7 +145,25 @@ def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: di self._create_dirs() def run_detection(self, reference_embeddings_df: pd.DataFrame, current_embeddings_df: pd.DataFrame) -> bool: - return True + # Run Evidently detection + column_mapping = ColumnMapping(embeddings={self.evidently_column_mapping_name: reference_embeddings_df.columns}) + + # https://docs.evidentlyai.com/user-guide/customization/embeddings-drift-parameters + report = Report(metrics=self.metrics) + report.run( + reference_data=reference_embeddings_df, current_data=current_embeddings_df, column_mapping=column_mapping + ) + result = report.as_dict() + result_print = [(x["result"]["drift_score"], x["result"]["method_name"], 
x["result"]["drift_detected"]) for x in result["metrics"]] + logger.info( + f"[DataDriftDetector][Prev Trigger {self.previous_trigger_id}][Dataset {self.dataloader_info.dataset_id}]" + + f"[Result] {result_print}" + ) + + with open(self.drift_dir / f"drift_{self.previous_trigger_id}.json", "w") as f: + json.dump(result, f) + + return result["metrics"][0]["result"]["drift_detected"] def detect_drift(self, idx_start, idx_end) -> bool: assert self.previous_trigger_id is not None diff --git a/modyn/supervisor/internal/triggers/utils.py b/modyn/supervisor/internal/triggers/utils.py index d9c3e0067..54724ad84 100644 --- a/modyn/supervisor/internal/triggers/utils.py +++ b/modyn/supervisor/internal/triggers/utils.py @@ -6,11 +6,35 @@ import torch from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader from modyn.supervisor.internal.triggers.trigger_datasets import DataLoaderInfo, FixedKeysDataset, OnlineTriggerDataset + from torch.utils.data import DataLoader +from evidently import ColumnMapping +from evidently.metrics import EmbeddingsDriftMetric +import evidently.metrics.data_drift.embedding_drift_methods as embedding_drift_methods +from evidently.report import Report logger = logging.getLogger(__name__) +def get_evidently_metrics(column_mapping_name: str, trigger_config: dict = {}) -> list[EmbeddingsDriftMetric]: + metric_name: str = "model" + metric_config: dict = {} + if "metric_name" in trigger_config.keys(): + metric_name = trigger_config["metric_name"] + if "metric_config" in trigger_config.keys(): + metric_config = trigger_config["metric_config"] + + metric = getattr(embedding_drift_methods, metric_name)(**metric_config) + + metrics = [ + EmbeddingsDriftMetric( + column_mapping_name, + drift_method=metric, + ) + ] + return metrics + + def prepare_trigger_dataloader_by_trigger( trigger_id: int, dataloader_info: DataLoaderInfo, From 7ffee8abc30f226a2e8cc6cc1935b525101f87f6 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 30 Apr 2024 20:32:23 +0200 Subject: [PATCH 11/32] Add example pipeline configs using DataDriftTrigger --- .../data_drift_trigger/arxiv_datadrift.yaml | 71 +++++++++++++++++ .../huffpost_datadrift.yaml | 73 ++++++++++++++++++ .../yearbook_datadrift.yaml | 76 +++++++++++++++++++ 3 files changed, 220 insertions(+) create mode 100644 benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/arxiv_datadrift.yaml create mode 100644 benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/huffpost_datadrift.yaml create mode 100644 benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/yearbook_datadrift.yaml diff --git a/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/arxiv_datadrift.yaml b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/arxiv_datadrift.yaml new file mode 100644 index 000000000..c34b3ad6d --- /dev/null +++ b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/arxiv_datadrift.yaml @@ -0,0 +1,71 @@ +pipeline: + name: ArXiv dataset Test Pipeline + description: Example pipeline + version: 1.0.0 +model: + id: ArticleNet + config: + num_classes: 172 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" +training: + gpus: 1 + device: "cuda:0" + dataloader_workers: 2 + use_previous_model: True + initial_model: random + batch_size: 96 + optimizers: + - name: "default" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model" + config: + lr: 0.00002 + momentum: 0.9 + weight_decay: 0.01 + optimization_criterion: + name: 
"CrossEntropyLoss" + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 10000 + config: + storage_backend: "database" + limit: -1 + reset_after_trigger: True + seed: 42 + epochs_per_trigger: 1 +data: + dataset_id: arxiv_train + bytes_parser_function: | + def bytes_parser_function(data: bytes) -> str: + return str(data, "utf8") + tokenizer: DistilBertTokenizerTransform + +trigger: + id: DataDriftTrigger + trigger_config: + data_points_for_detection: 100000 + sample_size: 5000 + +evaluation: + device: "cuda:0" + result_writers: ["json"] + datasets: + - dataset_id: arxiv_test + bytes_parser_function: | + def bytes_parser_function(data: bytes) -> str: + return str(data, "utf8") + tokenizer: DistilBertTokenizerTransform + batch_size: 96 + dataloader_workers: 2 + metrics: + - name: "Accuracy" + evaluation_transformer_function: | + import torch + def evaluation_transformer_function(model_output: torch.Tensor) -> torch.Tensor: + return torch.argmax(model_output, dim=-1) \ No newline at end of file diff --git a/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/huffpost_datadrift.yaml b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/huffpost_datadrift.yaml new file mode 100644 index 000000000..96d269723 --- /dev/null +++ b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/huffpost_datadrift.yaml @@ -0,0 +1,73 @@ +pipeline: + name: Huffpost dataset Test Pipeline + description: Example pipeline + version: 1.0.0 +model: + id: ArticleNet + config: + num_classes: 55 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" +training: + gpus: 1 + device: "cuda:0" + dataloader_workers: 2 + use_previous_model: True + initial_model: random + batch_size: 64 + optimizers: + - name: "default" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model" + config: + lr: 0.00002 + momentum: 0.9 + weight_decay: 0.01 + optimization_criterion: + name: "CrossEntropyLoss" + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 1000 + config: + storage_backend: "database" + limit: -1 + reset_after_trigger: True + seed: 42 + epochs_per_trigger: 1 +data: + dataset_id: huffpost_train + bytes_parser_function: | + def bytes_parser_function(data: bytes) -> str: + return str(data, "utf8") + tokenizer: DistilBertTokenizerTransform + +trigger: + id: DataDriftTrigger + trigger_config: + data_points_for_detection: 5000 + metric_name: mmd + metric_config: + bootstrap: False + +evaluation: + device: "cuda:0" + result_writers: ["json"] + datasets: + - dataset_id: huffpost_test + bytes_parser_function: | + def bytes_parser_function(data: bytes) -> str: + return str(data, "utf8") + tokenizer: DistilBertTokenizerTransform + batch_size: 64 + dataloader_workers: 2 + metrics: + - name: "Accuracy" + evaluation_transformer_function: | + import torch + def evaluation_transformer_function(model_output: torch.Tensor) -> torch.Tensor: + return torch.argmax(model_output, dim=-1) \ No newline at end of file diff --git a/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/yearbook_datadrift.yaml b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/yearbook_datadrift.yaml new file mode 100644 index 000000000..b3aea4a33 --- /dev/null +++ b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/yearbook_datadrift.yaml @@ -0,0 +1,76 @@ +pipeline: + name: Yearbook Test Pipeline + description: Example pipeline + version: 1.0.0 
+model: + id: YearbookNet + config: + num_input_channels: 1 + num_classes: 2 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" +training: + gpus: 1 + device: "cuda:0" + dataloader_workers: 2 + use_previous_model: True + initial_model: random + batch_size: 64 + optimizers: + - name: "default" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model" + config: + lr: 0.001 + momentum: 0.9 + optimization_criterion: + name: "CrossEntropyLoss" + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 1000 + config: + storage_backend: "database" + limit: -1 + reset_after_trigger: True + seed: 42 + epochs_per_trigger: 1 +data: + dataset_id: yearbook_train + transformations: [] + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(data: bytes) -> torch.Tensor: + return torch.from_numpy(np.frombuffer(data, dtype=np.float32)).reshape(1, 32, 32) + +trigger: + id: DataDriftTrigger + trigger_config: + data_points_for_detection: 1000 + metric_name: model + metric_config: + threshold: 0.7 + +evaluation: + device: "cuda:0" + result_writers: ["json"] + datasets: + - dataset_id: yearbook_test + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(data: bytes) -> torch.Tensor: + return torch.from_numpy(np.frombuffer(data, dtype=np.float32)).reshape(1, 32, 32) + batch_size: 64 + dataloader_workers: 2 + metrics: + - name: "Accuracy" + evaluation_transformer_function: | + import torch + def evaluation_transformer_function(model_output: torch.Tensor) -> torch.Tensor: + return torch.argmax(model_output, dim=-1) \ No newline at end of file From d4d46d17db1ddba92f256755edba1d9c7dc455f8 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 30 Apr 2024 20:48:50 +0200 Subject: [PATCH 12/32] Fix linting --- .../internal/triggers/datadrifttrigger.py | 20 ++++++++--------- modyn/supervisor/internal/triggers/utils.py | 22 +++++++++++++------ 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index abe755a2b..bd1eebec5 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -1,10 +1,12 @@ +import json import logging import os -import json import pathlib from typing import Generator, Optional import pandas as pd +from evidently import ColumnMapping +from evidently.report import Report from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader # pylint: disable-next=no-name-in-module @@ -12,16 +14,11 @@ from modyn.supervisor.internal.triggers.trigger_datasets import DataLoaderInfo from modyn.supervisor.internal.triggers.utils import ( get_embeddings_evidently_format, + get_evidently_metrics, prepare_trigger_dataloader_by_trigger, prepare_trigger_dataloader_fixed_keys, - get_evidently_metrics, ) -from evidently import ColumnMapping -from evidently.metrics import EmbeddingsDriftMetric -from evidently.metrics.data_drift.embedding_drift_methods import distance, mmd, model, ratio -from evidently.report import Report - logger = logging.getLogger(__name__) @@ -64,7 +61,7 @@ def _parse_trigger_config(self) -> None: if "sample_size" in self.trigger_config.keys(): self.sample_size = self.trigger_config["sample_size"] assert self.sample_size is None or self.sample_size > 0, "sample_size needs to be at least 1" - + self.metrics = 
get_evidently_metrics(self.evidently_column_mapping_name, self.trigger_config) def _init_dataloader_info(self) -> None: @@ -154,13 +151,16 @@ def run_detection(self, reference_embeddings_df: pd.DataFrame, current_embedding reference_data=reference_embeddings_df, current_data=current_embeddings_df, column_mapping=column_mapping ) result = report.as_dict() - result_print = [(x["result"]["drift_score"], x["result"]["method_name"], x["result"]["drift_detected"]) for x in result["metrics"]] + result_print = [ + (x["result"]["drift_score"], x["result"]["method_name"], x["result"]["drift_detected"]) + for x in result["metrics"] + ] logger.info( f"[DataDriftDetector][Prev Trigger {self.previous_trigger_id}][Dataset {self.dataloader_info.dataset_id}]" + f"[Result] {result_print}" ) - with open(self.drift_dir / f"drift_{self.previous_trigger_id}.json", "w") as f: + with open(self.drift_dir / f"drift_{self.previous_trigger_id}.json", "w", encoding="utf-8") as f: json.dump(result, f) return result["metrics"][0]["result"]["drift_detected"] diff --git a/modyn/supervisor/internal/triggers/utils.py b/modyn/supervisor/internal/triggers/utils.py index 54724ad84..e848658e3 100644 --- a/modyn/supervisor/internal/triggers/utils.py +++ b/modyn/supervisor/internal/triggers/utils.py @@ -4,21 +4,29 @@ import pandas as pd import torch +from evidently.metrics import EmbeddingsDriftMetric +from evidently.metrics.data_drift import embedding_drift_methods from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader from modyn.supervisor.internal.triggers.trigger_datasets import DataLoaderInfo, FixedKeysDataset, OnlineTriggerDataset - from torch.utils.data import DataLoader -from evidently import ColumnMapping -from evidently.metrics import EmbeddingsDriftMetric -import evidently.metrics.data_drift.embedding_drift_methods as embedding_drift_methods -from evidently.report import Report logger = logging.getLogger(__name__) -def get_evidently_metrics(column_mapping_name: str, trigger_config: dict = {}) -> list[EmbeddingsDriftMetric]: - metric_name: str = "model" +def get_evidently_metrics( + column_mapping_name: str, trigger_config: Optional[dict] = None +) -> list[EmbeddingsDriftMetric]: + """Evidently metric configurations follow exactly the four DriftMethods defined in embedding_drift_methods: + model, distance, mmd, ratio + If metric_name not given, we use the default 'model' metric. + Otherwise, we use the metric given by metric_name, with optional metric configuration specific to the metric. 
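+    Roughly, following Evidently's documentation: 'model' trains a classifier
+    to separate reference from current embeddings, 'distance' compares the
+    distance between average embeddings, 'mmd' applies maximum mean
+    discrepancy, and 'ratio' reports the share of drifted embedding components.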
+ """ + if trigger_config is None: + trigger_config = {} + + metric_name: str = "model" metric_config: dict = {} + if "metric_name" in trigger_config.keys(): metric_name = trigger_config["metric_name"] if "metric_config" in trigger_config.keys(): From d0bb0c9ba7fd35758f0648d97a1880f4af138d02 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 30 Apr 2024 21:44:48 +0200 Subject: [PATCH 13/32] Fix mypy typing --- .../pipeline_executor/pipeline_executor.py | 6 ++-- .../internal/triggers/datadrifttrigger.py | 16 ++++++++-- .../model_downloader/model_downloader.py | 7 ++++- .../internal/triggers/test_amounttrigger.py | 30 +++++++++---------- .../internal/triggers/test_timetrigger.py | 24 +++++++-------- 5 files changed, 50 insertions(+), 33 deletions(-) diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index b71304035..b9f29710a 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -191,7 +191,7 @@ def _handle_new_data(self, new_data: list[tuple[int, int, int]]) -> bool: def _handle_new_data_batch(self, batch: list[tuple[int, int, int]]) -> bool: self._sw.start("trigger_inform", overwrite=True) - triggering_indices: Generator[int] = self.trigger.inform(batch) + triggering_indices: Generator[int, None, None] = self.trigger.inform(batch) num_triggers = self._handle_triggers_within_batch(batch, triggering_indices) logger.info(f"There are {num_triggers} triggers in this batch.") @@ -263,7 +263,9 @@ def _run_training(self, trigger_id: int) -> None: writers = [self._init_evaluation_writer(name, trigger_id) for name in writer_names] self.grpc.store_evaluation_results(writers, evaluations) - def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], triggering_indices: list[int]) -> None: + def _handle_triggers_within_batch( + self, batch: list[tuple[int, int, int]], triggering_indices: Generator[int, None, None] + ) -> int: previous_trigger_idx = 0 logger.info("Handling triggers within batch.") self._update_pipeline_stage_and_enqueue_msg(PipelineStage.HANDLE_TRIGGERS_WITHIN_BATCH, MsgType.GENERAL) diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index bd1eebec5..6ee18fc0e 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -40,12 +40,12 @@ def __init__(self, trigger_config: dict): self.dataloader_info: Optional[DataLoaderInfo] = None self.model_downloader: Optional[ModelDownloader] = None - self.detection_interval: int = 2 + self.detection_interval: int = 1000 self.sample_size: Optional[int] = None self.evidently_column_mapping_name = "data" self.metrics: Optional[list] = None - self.data_cache = [] + self.data_cache: list[tuple[int, int, int]] = [] self.leftover_data_points = 0 self.exp_output_dir: Optional[pathlib.Path] = None @@ -65,6 +65,9 @@ def _parse_trigger_config(self) -> None: self.metrics = get_evidently_metrics(self.evidently_column_mapping_name, self.trigger_config) def _init_dataloader_info(self) -> None: + assert self.pipeline_config is not None + assert self.modyn_config is not None + training_config = self.pipeline_config["training"] data_config = self.pipeline_config["data"] @@ -112,6 +115,9 @@ def _init_dataloader_info(self) -> None: ) def _init_model_downloader(self) -> None: + assert self.pipeline_config is not None + 
assert self.modyn_config is not None + self.model_downloader = ModelDownloader( self.modyn_config, self.pipeline_id, @@ -142,6 +148,9 @@ def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: di self._create_dirs() def run_detection(self, reference_embeddings_df: pd.DataFrame, current_embeddings_df: pd.DataFrame) -> bool: + assert self.dataloader_info is not None + assert self.drift_dir is not None + # Run Evidently detection column_mapping = ColumnMapping(embeddings={self.evidently_column_mapping_name: reference_embeddings_df.columns}) @@ -165,11 +174,12 @@ def run_detection(self, reference_embeddings_df: pd.DataFrame, current_embedding return result["metrics"][0]["result"]["drift_detected"] - def detect_drift(self, idx_start, idx_end) -> bool: + def detect_drift(self, idx_start: int, idx_end: int) -> bool: assert self.previous_trigger_id is not None assert self.previous_data_points is not None and self.previous_data_points > 0 assert self.previous_model_id is not None assert self.dataloader_info is not None + assert self.model_downloader is not None reference_dataloader = prepare_trigger_dataloader_by_trigger( self.previous_trigger_id, diff --git a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py index c5b1c01cd..86f0464fa 100644 --- a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py +++ b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py @@ -39,7 +39,7 @@ def __init__( self._model_storage_stub = self.connect_to_model_storage(model_storage_address) - self.model_configuration_dict = None + self.model_configuration_dict: Optional[dict] = None self.model_handler = None self._amp: Optional[bool] = None self._model = None @@ -56,6 +56,7 @@ def connect_to_model_storage(model_storage_address: str) -> ModelStorageStub: def _load_state(self, path: pathlib.Path) -> None: assert path.exists(), "Cannot load state from non-existing file" + assert self._model is not None logger.info(f"Loading model state from {path}") with open(path, "rb") as state_file: @@ -77,8 +78,12 @@ def configure(self, model_id: int) -> None: self._amp = amp model_module = dynamic_module_import("modyn.models") self.model_handler = getattr(model_module, model_class_name) + assert self.model_handler is not None + self.model_configuration_dict = json.loads(model_config) self._model = self.model_handler(self.model_configuration_dict, self._device, amp) + + assert self._model is not None assert isinstance(self._model.model, CoresetSupportingModule) def download(self, model_id: int) -> None: diff --git a/modyn/tests/supervisor/internal/triggers/test_amounttrigger.py b/modyn/tests/supervisor/internal/triggers/test_amounttrigger.py index 7b681f1f5..40ef8e312 100644 --- a/modyn/tests/supervisor/internal/triggers/test_amounttrigger.py +++ b/modyn/tests/supervisor/internal/triggers/test_amounttrigger.py @@ -19,24 +19,24 @@ def test_init_fails_if_invalid() -> None: def test_inform() -> None: trigger = DataAmountTrigger({"data_points_for_trigger": 1}) # pylint: disable-next=use-implicit-booleaness-not-comparison - assert trigger.inform([]) == [] - assert trigger.inform([(10, 1)]) == [0] - assert trigger.inform([(10, 1), (10, 1)]) == [0, 1] - assert trigger.inform([(10, 1), (10, 1), (10, 1)]) == [0, 1, 2] + assert list(trigger.inform([])) == [] + assert list(trigger.inform([(10, 1)])) == [0] + assert list(trigger.inform([(10, 1), (10, 1)])) == [0, 1] + assert 
list(trigger.inform([(10, 1), (10, 1), (10, 1)])) == [0, 1, 2] trigger = DataAmountTrigger({"data_points_for_trigger": 2}) # pylint: disable-next=use-implicit-booleaness-not-comparison - assert trigger.inform([(10, 1)]) == [] - assert trigger.inform([(10, 1)]) == [0] - assert trigger.inform([(10, 1), (10, 1)]) == [1] - assert trigger.inform([(10, 1), (10, 1), (10, 1), (10, 1)]) == [1, 3] - assert trigger.inform([(10, 1), (10, 1), (10, 1)]) == [1] - assert trigger.inform([(10, 1)]) == [0] - assert trigger.inform([(10, 1), (10, 1), (10, 1), (10, 1), (10, 1)]) == [1, 3] - assert trigger.inform([(10, 1)]) == [0] + assert list(trigger.inform([(10, 1)])) == [] + assert list(trigger.inform([(10, 1)])) == [0] + assert list(trigger.inform([(10, 1), (10, 1)])) == [1] + assert list(trigger.inform([(10, 1), (10, 1), (10, 1), (10, 1)])) == [1, 3] + assert list(trigger.inform([(10, 1), (10, 1), (10, 1)])) == [1] + assert list(trigger.inform([(10, 1)])) == [0] + assert list(trigger.inform([(10, 1), (10, 1), (10, 1), (10, 1), (10, 1)])) == [1, 3] + assert list(trigger.inform([(10, 1)])) == [0] trigger = DataAmountTrigger({"data_points_for_trigger": 5}) # pylint: disable-next=use-implicit-booleaness-not-comparison - assert trigger.inform([(10, 1), (10, 1), (10, 1), (10, 1)]) == [] - assert trigger.inform([(10, 1), (10, 1), (10, 1)]) == [0] - assert trigger.inform([(10, 1), (10, 1), (10, 1)]) == [2] + assert list(trigger.inform([(10, 1), (10, 1), (10, 1), (10, 1)])) == [] + assert list(trigger.inform([(10, 1), (10, 1), (10, 1)])) == [0] + assert list(trigger.inform([(10, 1), (10, 1), (10, 1)])) == [2] diff --git a/modyn/tests/supervisor/internal/triggers/test_timetrigger.py b/modyn/tests/supervisor/internal/triggers/test_timetrigger.py index 306759df2..56ae266e6 100644 --- a/modyn/tests/supervisor/internal/triggers/test_timetrigger.py +++ b/modyn/tests/supervisor/internal/triggers/test_timetrigger.py @@ -20,26 +20,26 @@ def test_inform() -> None: trigger = TimeTrigger({"trigger_every": "1000s"}) LABEL = 2 # pylint: disable=invalid-name # pylint: disable-next=use-implicit-booleaness-not-comparison - assert trigger.inform([]) == [] + assert list(trigger.inform([])) == [] # pylint: disable-next=use-implicit-booleaness-not-comparison - assert trigger.inform([(10, 0, LABEL)]) == [] + assert list(trigger.inform([(10, 0, LABEL)])) == [] # pylint: disable-next=use-implicit-booleaness-not-comparison - assert trigger.inform([(10, 500, LABEL)]) == [] + assert list(trigger.inform([(10, 500, LABEL)])) == [] # pylint: disable-next=use-implicit-booleaness-not-comparison - assert trigger.inform([(10, 999, LABEL)]) == [] - assert trigger.inform([(10, 1000, LABEL)]) == [-1] # Trigger includes 0, 500, 900, but not 1000 - assert trigger.inform([(10, 1500, LABEL), (10, 1600, LABEL), (10, 2000, LABEL)]) == [ + assert list(trigger.inform([(10, 999, LABEL)])) == [] + assert list(trigger.inform([(10, 1000, LABEL)])) == [-1] # Trigger includes 0, 500, 900, but not 1000 + assert list(trigger.inform([(10, 1500, LABEL), (10, 1600, LABEL), (10, 2000, LABEL)])) == [ 1 ] # 2000 enables us to know that 1600 should trigger! 
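+    # inform() is now a generator, so the assertions materialize it with
+    # list() before comparing against the expected triggering indices.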
- assert trigger.inform([(10, 3000, LABEL), (10, 4000, LABEL)]) == [-1, 0] + assert list(trigger.inform([(10, 3000, LABEL), (10, 4000, LABEL)])) == [-1, 0] # pylint: disable-next=use-implicit-booleaness-not-comparison - assert trigger.inform([(10, 4100, LABEL), (10, 4200, LABEL)]) == [] - assert trigger.inform([(10, 5000, LABEL)]) == [-1] - assert trigger.inform([(10, 6000, LABEL), (10, 7000, LABEL), (10, 8000, LABEL), (10, 9000, LABEL)]) == [ + assert list(trigger.inform([(10, 4100, LABEL), (10, 4200, LABEL)])) == [] + assert list(trigger.inform([(10, 5000, LABEL)])) == [-1] + assert list(trigger.inform([(10, 6000, LABEL), (10, 7000, LABEL), (10, 8000, LABEL), (10, 9000, LABEL)])) == [ -1, 0, 1, 2, ] - assert trigger.inform([(10, 15000, LABEL)]) == [-1, -1, -1, -1, -1, -1] - assert trigger.inform([(10, 17000, LABEL), (10, 18000, LABEL)]) == [-1, -1, 0] + assert list(trigger.inform([(10, 15000, LABEL)])) == [-1, -1, -1, -1, -1, -1] + assert list(trigger.inform([(10, 17000, LABEL), (10, 18000, LABEL)])) == [-1, -1, 0] From ca0e1c08bebceecf329efbe7dee5509a7e000e90 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 30 Apr 2024 22:08:27 +0200 Subject: [PATCH 14/32] Add evidently to known-third-party for github checks --- .pylintrc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 11f3267d3..8e1027db6 100644 --- a/.pylintrc +++ b/.pylintrc @@ -414,7 +414,8 @@ int-import-graph= known-standard-library= # Force import order to recognize a module as part of a third party library. -known-third-party=enchant +known-third-party=enchant, + evidently # Couples of modules and preferred modules, separated by a comma. preferred-modules= From 038bfc3877f187f0e34dccf66b1a3c0365c85d1c Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 30 Apr 2024 22:41:08 +0200 Subject: [PATCH 15/32] Revert "Add evidently to known-third-party for github checks" This reverts commit ca0e1c08bebceecf329efbe7dee5509a7e000e90. --- .pylintrc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.pylintrc b/.pylintrc index 8e1027db6..11f3267d3 100644 --- a/.pylintrc +++ b/.pylintrc @@ -414,8 +414,7 @@ int-import-graph= known-standard-library= # Force import order to recognize a module as part of a third party library. -known-third-party=enchant, - evidently +known-third-party=enchant # Couples of modules and preferred modules, separated by a comma. preferred-modules= From 0c87fc43dd22c10e7f19caff09a0559562123e33 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 30 Apr 2024 22:42:04 +0200 Subject: [PATCH 16/32] Add evidently to environment --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 70bda36f3..838bb8050 100644 --- a/environment.yml +++ b/environment.yml @@ -21,6 +21,7 @@ dependencies: - protobuf - pip: - grpcio==1.59.0 + - evidently - jsonschema - psycopg2 - sqlalchemy>=2.0 From f094d2322737d86b69ec1290f3bf41939257fb5e Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Mon, 13 May 2024 19:58:13 +0200 Subject: [PATCH 17/32] Minor fixes to datadrifttrigger. 
Add datadrifttrigger unit tests --- .../internal/triggers/datadrifttrigger.py | 25 +- .../trigger_datasets/fixed_keys_dataset.py | 4 +- .../triggers/test_datadrifttrigger.py | 209 +++++++++++++ .../test_fixed_keys_dataset.py | 275 ++++++++++++++++++ .../test_online_trigger_dataset.py | 84 ++++++ 5 files changed, 583 insertions(+), 14 deletions(-) create mode 100644 modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py create mode 100644 modyn/tests/supervisor/internal/triggers/trigger_datasets/test_fixed_keys_dataset.py create mode 100644 modyn/tests/supervisor/internal/triggers/trigger_datasets/test_online_trigger_dataset.py diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index 6ee18fc0e..6f206494b 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -26,7 +26,6 @@ class DataDriftTrigger(Trigger): """Triggers when a certain number of data points have been used.""" def __init__(self, trigger_config: dict): - self.trigger_config = trigger_config self.pipeline_id: Optional[int] = None self.pipeline_config: Optional[dict] = None self.modyn_config: Optional[dict] = None @@ -45,24 +44,27 @@ def __init__(self, trigger_config: dict): self.evidently_column_mapping_name = "data" self.metrics: Optional[list] = None + self.exp_output_dir: Optional[pathlib.Path] = None + self.drift_dir: Optional[pathlib.Path] = None + self.data_cache: list[tuple[int, int, int]] = [] self.leftover_data_points = 0 - self.exp_output_dir: Optional[pathlib.Path] = None - self.drift_dir: Optional[pathlib.Path] = None + if len(trigger_config) > 0: + self._parse_trigger_config(trigger_config) super().__init__(trigger_config) - def _parse_trigger_config(self) -> None: - if "data_points_for_detection" in self.trigger_config.keys(): - self.detection_interval = self.trigger_config["data_points_for_detection"] - assert self.detection_interval > 0, "data_points_for_trigger needs to be at least 1" + def _parse_trigger_config(self, trigger_config) -> None: + if "data_points_for_detection" in trigger_config.keys(): + self.detection_interval = trigger_config["data_points_for_detection"] + assert self.detection_interval > 0, "data_points_for_detection needs to be at least 1" - if "sample_size" in self.trigger_config.keys(): - self.sample_size = self.trigger_config["sample_size"] + if "sample_size" in trigger_config.keys(): + self.sample_size = trigger_config["sample_size"] assert self.sample_size is None or self.sample_size > 0, "sample_size needs to be at least 1" - self.metrics = get_evidently_metrics(self.evidently_column_mapping_name, self.trigger_config) + self.metrics = get_evidently_metrics(self.evidently_column_mapping_name, trigger_config) def _init_dataloader_info(self) -> None: assert self.pipeline_config is not None @@ -140,9 +142,6 @@ def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: di self.modyn_config = modyn_config self.base_dir = base_dir - if len(self.trigger_config) > 0: - self._parse_trigger_config() - self._init_dataloader_info() self._init_model_downloader() self._create_dirs() diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py b/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py index 538481b2d..2f98af0b2 100644 --- a/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py +++ 
b/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py @@ -109,8 +109,10 @@ def __iter__(self) -> Generator: if worker_info is None: # Non-multi-threaded data loading. We use worker_id 0. worker_id = 0 + num_workers = 1 else: worker_id = worker_info.id + num_workers = worker_info.num_workers if self._first_call: self._first_call = False @@ -125,7 +127,7 @@ def __iter__(self) -> Generator: assert self._transform is not None total_samples = len(self._keys) - keys_per_worker = int(math.ceil(total_samples / worker_info.num_workers)) + keys_per_worker = int(math.ceil(total_samples / num_workers)) worker_keys = self._keys[worker_id * keys_per_worker : min(total_samples, (worker_id + 1) * keys_per_worker)] # TODO(#175): we might want to do/accelerate prefetching here. diff --git a/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py b/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py new file mode 100644 index 000000000..caa2671fe --- /dev/null +++ b/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py @@ -0,0 +1,209 @@ +# pylint: disable=unused-argument, no-name-in-module, no-value-for-parameter +from unittest.mock import patch, MagicMock +import pathlib +import os +from typing import Optional + +import pytest +from modyn.supervisor.internal.triggers import DataDriftTrigger +from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader +from modyn.supervisor.internal.triggers.trigger_datasets import DataLoaderInfo + +BASEDIR: pathlib.Path = pathlib.Path(os.path.realpath(__file__)).parent / "test_eval_dir" +PIPELINE_ID = 42 +SAMPLE = (10, 1, 1) + + + +def get_minimal_training_config() -> dict: + return { + "gpus": 1, + "device": "cpu", + "dataloader_workers": 1, + "use_previous_model": True, + "initial_model": "random", + "learning_rate": 0.1, + "batch_size": 42, + "optimizers": [ + {"name": "default1", "algorithm": "SGD", "source": "PyTorch", "param_groups": [{"module": "model"}]}, + ], + "optimization_criterion": {"name": "CrossEntropyLoss"}, + "checkpointing": {"activated": False}, + "selection_strategy": {"name": "NewDataStrategy", "maximum_keys_in_memory": 10}, + } + + +def get_minimal_evaluation_config() -> dict: + return { + "device": "cpu", + "datasets": [ + { + "dataset_id": "MNIST_eval", + "bytes_parser_function": "def bytes_parser_function(data: bytes) -> bytes:\n\treturn data", + "dataloader_workers": 2, + "batch_size": 64, + "metrics": [{"name": "Accuracy"}], + } + ], + } + +def get_minimal_trigger_config() -> dict: + return {} + +def get_minimal_pipeline_config() -> dict: + return { + "pipeline": {"name": "Test"}, + "model": {"id": "ResNet18"}, + "model_storage": {"full_model_strategy": {"name": "PyTorchFullModel"}}, + "training": get_minimal_training_config(), + "data": {"dataset_id": "test", "bytes_parser_function": "def bytes_parser_function(x):\n\treturn x"}, + "trigger": {"id": "DataDriftTrigger", "trigger_config": get_minimal_trigger_config()}, + } + + +def get_simple_system_config() -> dict: + return { + "storage": {"hostname": "test", "port": 42}, + "selector": {"hostname": "test", "port": 42}, + "model_storage": {"hostname": "test", "port": 42}, + } + +def noop(self) -> None: + pass + +def noop_model_downloader_constructor_mock( + self, + modyn_config: dict, + pipeline_id: int, + device: str, + base_dir: pathlib.Path, + model_storage_address: str, +) -> None: + pass + +def noop_dataloader_info_constructor_mock( + self, + pipeline_id: int, + dataset_id: str, + num_dataloaders: int, + batch_size: 
int, + bytes_parser: str, + transform_list: list[str], + storage_address: str, + selector_address: str, + num_prefetched_partitions: int, + parallel_prefetch_requests: int, + tokenizer: Optional[None], +) -> None: + pass + + +def test_initialization() -> None: + trigger = DataDriftTrigger({"data_points_for_detection": 42}) + assert trigger.detection_interval == 42 + assert trigger.previous_trigger_id is None + assert trigger.previous_model_id is None + assert not trigger.model_updated + assert not trigger.data_cache + assert trigger.leftover_data_points == 0 + + +def test_init_fails_if_invalid() -> None: + with pytest.raises(AssertionError, match="data_points_for_detection needs to be at least 1"): + DataDriftTrigger({"data_points_for_detection": 0}) + + with pytest.raises(AssertionError, match="sample_size needs to be at least 1"): + DataDriftTrigger({"sample_size": 0}) + + +@patch.object(ModelDownloader, "__init__", noop_model_downloader_constructor_mock) +@patch.object(DataLoaderInfo, "__init__", noop_dataloader_info_constructor_mock) +def test_init_trigger() -> None: + trigger = DataDriftTrigger(get_minimal_trigger_config()) + # pylint: disable-next=use-implicit-booleaness-not-comparison + with patch("os.makedirs", return_value=None): + pipeline_config = get_minimal_pipeline_config() + modyn_config = get_simple_system_config() + trigger.init_trigger(PIPELINE_ID, pipeline_config, modyn_config, BASEDIR) + assert trigger.pipeline_id == PIPELINE_ID + assert trigger.pipeline_config == pipeline_config + assert trigger.modyn_config == modyn_config + assert trigger.base_dir == BASEDIR + assert trigger.drift_dir is not None + assert isinstance(trigger.dataloader_info, DataLoaderInfo) + assert isinstance(trigger.model_downloader, ModelDownloader) + + +def test_inform_previous_trigger_and_data_points() -> None: + trigger = DataDriftTrigger(get_minimal_trigger_config()) + # pylint: disable-next=use-implicit-booleaness-not-comparison + trigger.inform_previous_trigger_and_data_points(42, 42) + assert trigger.previous_trigger_id == 42 + assert trigger.previous_data_points == 42 + + +def test_inform_previous_model_id() -> None: + trigger = DataDriftTrigger(get_minimal_trigger_config()) + # pylint: disable-next=use-implicit-booleaness-not-comparison + trigger.inform_previous_model(42) + assert trigger.previous_model_id == 42 + + +@patch.object(DataDriftTrigger, "detect_drift", return_value=True) +def test_inform_always_drift(test_detect_drift) -> None: + trigger = DataDriftTrigger({"data_points_for_detection": 1}) + num_triggers = 0 + for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + num_triggers += 1 + trigger.inform_previous_trigger_and_data_points(num_triggers, 42) + trigger.inform_previous_model(num_triggers) + # pylint: disable-next=use-implicit-booleaness-not-comparison + assert num_triggers == 5 + + trigger = DataDriftTrigger({"data_points_for_detection": 2}) + num_triggers = 0 + for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + num_triggers += 1 + trigger.inform_previous_trigger_and_data_points(num_triggers, 42) + trigger.inform_previous_model(num_triggers) + # pylint: disable-next=use-implicit-booleaness-not-comparison + assert num_triggers == 2 + + trigger = DataDriftTrigger({"data_points_for_detection": 5}) + num_triggers = 0 + for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + num_triggers += 1 + trigger.inform_previous_trigger_and_data_points(num_triggers, 42) + trigger.inform_previous_model(num_triggers) + # pylint: 
disable-next=use-implicit-booleaness-not-comparison + assert num_triggers == 1 + + +@patch.object(DataDriftTrigger, "detect_drift", return_value=False) +def test_inform_no_drift(test_detect_no_drift) -> None: + trigger = DataDriftTrigger({"data_points_for_detection": 1}) + num_triggers = 0 + for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + num_triggers += 1 + trigger.inform_previous_trigger_and_data_points(num_triggers, 42) + trigger.inform_previous_model(num_triggers) + # pylint: disable-next=use-implicit-booleaness-not-comparison + assert num_triggers == 1 + + trigger = DataDriftTrigger({"data_points_for_detection": 2}) + num_triggers = 0 + for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + num_triggers += 1 + trigger.inform_previous_trigger_and_data_points(num_triggers, 42) + trigger.inform_previous_model(num_triggers) + # pylint: disable-next=use-implicit-booleaness-not-comparison + assert num_triggers == 1 + + trigger = DataDriftTrigger({"data_points_for_detection": 5}) + num_triggers = 0 + for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + num_triggers += 1 + trigger.inform_previous_trigger_and_data_points(num_triggers, 42) + trigger.inform_previous_model(num_triggers) + # pylint: disable-next=use-implicit-booleaness-not-comparison + assert num_triggers == 1 diff --git a/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_fixed_keys_dataset.py b/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_fixed_keys_dataset.py new file mode 100644 index 000000000..8904613d2 --- /dev/null +++ b/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_fixed_keys_dataset.py @@ -0,0 +1,275 @@ +# pylint: disable=unused-argument, no-name-in-module, no-value-for-parameter +import platform +import math +from unittest.mock import patch + +import grpc +import pytest +import torch +from modyn.supervisor.internal.triggers.trigger_datasets import FixedKeysDataset +from modyn.models.tokenizers import DistilBertTokenizerTransform +from modyn.storage.internal.grpc.generated.storage_pb2 import ( + GetDataPerWorkerRequest, + GetDataPerWorkerResponse, + GetRequest, + GetResponse, +) +from torchvision import transforms + +DATASET_ID = "MNIST" +TRIGGER_ID = 42 +STORAGE_ADDR = "localhost:1234" +KEYS = list(range(10)) + + +def get_identity_bytes_parser(): + return "def bytes_parser_function(x):\n\treturn x" + + +def bytes_parser_function(data): + return data + + +class MockStorageStub: + def __init__(self, channel) -> None: + pass + + def Get(self, request: GetRequest): # pylint: disable=invalid-name + yield GetResponse( + samples=[key.to_bytes(2, "big") for key in request.keys], keys=request.keys, labels=[1] * len(request.keys) + ) + + +def test_invalid_bytes_parser(): + with pytest.raises(AssertionError): + FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser="", + serialized_transforms=[], + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=KEYS, + )._init_transforms() + + with pytest.raises(ValueError): + FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser="bytes_parser_function=1", + serialized_transforms=[], + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=KEYS, + )._init_transforms() + + +def test_init(): + fixed_keys_dataset = FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser=get_identity_bytes_parser(), + serialized_transforms=[], + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=KEYS, + tokenizer="DistilBertTokenizerTransform", + ) + + assert 
fixed_keys_dataset._trigger_id == TRIGGER_ID + assert fixed_keys_dataset._dataset_id == DATASET_ID + assert fixed_keys_dataset._first_call + assert fixed_keys_dataset._bytes_parser_function is None + assert fixed_keys_dataset._storagestub is None + assert isinstance(fixed_keys_dataset._tokenizer, DistilBertTokenizerTransform) + assert fixed_keys_dataset._keys == KEYS + + +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch.object(grpc, "insecure_channel", return_value=None) +def test_init_grpc(test_insecure_channel, test_grpc_connection_established): + fixed_keys_dataset = FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser=get_identity_bytes_parser(), + serialized_transforms=[], + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=KEYS, + ) + + assert fixed_keys_dataset._storagestub is None + fixed_keys_dataset._init_grpc() + assert isinstance(fixed_keys_dataset._storagestub, MockStorageStub) + + +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch.object(grpc, "insecure_channel", return_value=None) +def test_get_data_from_storage(test_insecure_channel, test_grpc_connection_established): + fixed_keys_dataset = FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser=get_identity_bytes_parser(), + serialized_transforms=[], + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=KEYS, + ) + + fixed_keys_dataset._init_grpc() + all_data = fixed_keys_dataset._get_data_from_storage(KEYS) + for data in all_data: + for i, d in enumerate(data): + assert d == (i, i.to_bytes(2, "big"), 1) + + +def test_init_transforms(): + fixed_keys_dataset = FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser="def bytes_parser_function(x):\n\treturn int.from_bytes(x, 'big')", + serialized_transforms=[], + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=KEYS, + ) + + assert fixed_keys_dataset._bytes_parser_function is None + assert fixed_keys_dataset._transform is None + + with patch.object(fixed_keys_dataset, "_setup_composed_transform") as tv_ds: + fixed_keys_dataset._init_transforms() + assert fixed_keys_dataset._bytes_parser_function is not None + assert fixed_keys_dataset._bytes_parser_function(b"\x03") == 3 + + assert fixed_keys_dataset._transform is not None + + tv_ds.assert_called_once() + + +@pytest.mark.parametrize( + "serialized_transforms,transforms_list", + [ + pytest.param( + [ + "transforms.RandomResizedCrop(224)", + "transforms.RandomAffine(degrees=(0, 90))", + "transforms.ToTensor()", + "transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])", + ], + [ + transforms.RandomResizedCrop(224), + transforms.RandomAffine(degrees=(0, 90)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ], + ) + ], +) +def test__setup_composed_transform(serialized_transforms, transforms_list): + fixed_keys_dataset = FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser=get_identity_bytes_parser(), + serialized_transforms=list(serialized_transforms), + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=KEYS, + tokenizer="DistilBertTokenizerTransform", + ) + fixed_keys_dataset._bytes_parser_function = 
bytes_parser_function + fixed_keys_dataset._setup_composed_transform() + assert isinstance(fixed_keys_dataset._transform.transforms, list) + assert fixed_keys_dataset._transform.transforms[0].__name__ == "bytes_parser_function" + for transform1, transform2 in zip(fixed_keys_dataset._transform.transforms[1:-1], transforms_list): + assert transform1.__dict__ == transform2.__dict__ + assert isinstance(fixed_keys_dataset._transform.transforms[-1], DistilBertTokenizerTransform) + + +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch.object(grpc, "insecure_channel", return_value=None) +def test_dataset_iter(test_insecure_channel, test_grpc_connection_established): + fixed_keys_dataset = FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser=get_identity_bytes_parser(), + serialized_transforms=[], + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=KEYS, + ) + + all_data = list(fixed_keys_dataset) + assert [x[0] for x in all_data] == KEYS + assert [x[1] for x in all_data] == [x.to_bytes(2, "big") for x in KEYS] + assert [x[2] for x in all_data] == [1] * len(KEYS) + + +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch.object(grpc, "insecure_channel", return_value=None) +def test_dataset_iter_with_parsing(test_insecure_channel, test_grpc_connection_established): + fixed_keys_dataset = FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser="def bytes_parser_function(x):\n\treturn int.from_bytes(x, 'big')", + serialized_transforms=[], + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=KEYS, + ) + dataset_iter = iter(fixed_keys_dataset) + all_data = list(dataset_iter) + assert [x[0] for x in all_data] == KEYS + assert [x[1] for x in all_data] == KEYS + assert [x[2] for x in all_data] == [1] * len(KEYS) + + +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch.object(grpc, "insecure_channel", return_value=None) +def test_dataloader_dataset(test_insecure_channel, test_grpc_connection_established): + fixed_keys_dataset = FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser="def bytes_parser_function(x):\n\treturn int.from_bytes(x, 'big')", + serialized_transforms=[], + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=KEYS, + ) + + dataloader = torch.utils.data.DataLoader(fixed_keys_dataset, batch_size=2) + for i, batch in enumerate(dataloader): + assert len(batch) == 3 + assert i < math.floor(len(KEYS) / 2) + assert batch[0].tolist() == [2 * i, 2 * i + 1] + assert torch.equal(batch[1], torch.Tensor([2 * i, 2 * i + 1])) + assert torch.equal(batch[2], torch.ones(2, dtype=torch.float64)) + + +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) +@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch.object(grpc, "insecure_channel", return_value=None) +def test_dataloader_dataset_multi_worker(test_insecure_channel, test_grpc_connection_established): 
+ if platform.system() == "Darwin": + # On macOS, spawn is the default, which loses the mocks + # Hence the test does not work on macOS, only on Linux. + return + + fixed_keys_dataset = FixedKeysDataset( + dataset_id=DATASET_ID, + bytes_parser="def bytes_parser_function(x):\n\treturn int.from_bytes(x, 'big')", + serialized_transforms=[], + storage_address=STORAGE_ADDR, + trigger_id=TRIGGER_ID, + keys=list(range(16)), + ) + dataloader = torch.utils.data.DataLoader(fixed_keys_dataset, batch_size=4, num_workers=4) + + data = list(dataloader) + data.sort(key=lambda batch_data: batch_data[0].min()) + + batch_num = -1 + for batch_num, batch in enumerate(data): + assert len(batch) == 3 + assert batch[0].tolist() == [4 * batch_num, 4 * batch_num + 1, 4 * batch_num + 2, 4 * batch_num + 3] + assert torch.equal( + batch[1], torch.Tensor([4 * batch_num, 4 * batch_num + 1, 4 * batch_num + 2, 4 * batch_num + 3]) + ) + assert torch.equal(batch[2], torch.ones(4, dtype=torch.float64)) + + assert batch_num == 3 diff --git a/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_online_trigger_dataset.py b/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_online_trigger_dataset.py new file mode 100644 index 000000000..7fb8737ee --- /dev/null +++ b/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_online_trigger_dataset.py @@ -0,0 +1,84 @@ +# pylint: disable=unused-argument, no-name-in-module, no-value-for-parameter +import pathlib +from typing import Generator, Optional +from unittest.mock import patch + +from modyn.supervisor.internal.triggers.trigger_datasets import OnlineTriggerDataset +from modyn.trainer_server.internal.dataset.online_dataset import OnlineDataset + + +NUM_SAMPLES=10 + +def get_mock_bytes_parser(): + return "def bytes_parser_function(x):\n\treturn x" + + +def bytes_parser_function(data): + return data + + +def noop_constructor_mock( + self, + pipeline_id: int, + trigger_id: int, + dataset_id: str, + bytes_parser: str, + serialized_transforms: list[str], + storage_address: str, + selector_address: str, + training_id: int, + num_prefetched_partitions: int, + parallel_prefetch_requests: int, + tokenizer: Optional[str], + log_path: Optional[pathlib.Path], +) -> None: + pass + + +def mock_data_generator(self) -> Generator: + yield from list(range(NUM_SAMPLES)) + + +def test_init(): + online_trigger_dataset = OnlineTriggerDataset( + pipeline_id=1, + trigger_id=1, + dataset_id="MNIST", + bytes_parser=get_mock_bytes_parser(), + serialized_transforms=[], + storage_address="localhost:1234", + selector_address="localhost:1234", + training_id=42, + tokenizer=None, + num_prefetched_partitions=1, + parallel_prefetch_requests=1, + sample_prob=0.5, + ) + assert online_trigger_dataset._pipeline_id == 1 + assert online_trigger_dataset._trigger_id == 1 + assert online_trigger_dataset._dataset_id == "MNIST" + assert online_trigger_dataset._first_call + assert online_trigger_dataset._bytes_parser_function is None + assert online_trigger_dataset._storagestub is None + assert online_trigger_dataset._sample_prob == 0.5 + + +@patch.object(OnlineDataset, "__iter__", mock_data_generator) +def test_dataset_iter(): + online_trigger_dataset = OnlineTriggerDataset( + pipeline_id=1, + trigger_id=1, + dataset_id="MNIST", + bytes_parser=get_mock_bytes_parser(), + serialized_transforms=[], + storage_address="localhost:1234", + selector_address="localhost:1234", + training_id=42, + tokenizer=None, + num_prefetched_partitions=1, + parallel_prefetch_requests=1, + sample_prob=0.5, + ) + + 
all_trigger_data = list(online_trigger_dataset) + assert len(all_trigger_data) < NUM_SAMPLES From 9d59f846e082a1b533d09d5f87d0e07d4505d075 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Mon, 13 May 2024 20:17:24 +0200 Subject: [PATCH 18/32] Fix linting --- .../internal/triggers/datadrifttrigger.py | 2 +- .../triggers/test_datadrifttrigger.py | 22 ++++++---- .../test_fixed_keys_dataset.py | 41 ++++++++++++------- .../test_online_trigger_dataset.py | 2 +- 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index 6f206494b..d4aca4ed0 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -55,7 +55,7 @@ def __init__(self, trigger_config: dict): super().__init__(trigger_config) - def _parse_trigger_config(self, trigger_config) -> None: + def _parse_trigger_config(self, trigger_config: dict) -> None: if "data_points_for_detection" in trigger_config.keys(): self.detection_interval = trigger_config["data_points_for_detection"] assert self.detection_interval > 0, "data_points_for_detection needs to be at least 1" diff --git a/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py b/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py index caa2671fe..cfd0ac173 100644 --- a/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py +++ b/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py @@ -1,8 +1,8 @@ # pylint: disable=unused-argument, no-name-in-module, no-value-for-parameter -from unittest.mock import patch, MagicMock -import pathlib import os +import pathlib from typing import Optional +from unittest.mock import patch import pytest from modyn.supervisor.internal.triggers import DataDriftTrigger @@ -14,7 +14,6 @@ SAMPLE = (10, 1, 1) - def get_minimal_training_config() -> dict: return { "gpus": 1, @@ -47,9 +46,11 @@ def get_minimal_evaluation_config() -> dict: ], } + def get_minimal_trigger_config() -> dict: return {} + def get_minimal_pipeline_config() -> dict: return { "pipeline": {"name": "Test"}, @@ -68,9 +69,11 @@ def get_simple_system_config() -> dict: "model_storage": {"hostname": "test", "port": 42}, } + def noop(self) -> None: pass + def noop_model_downloader_constructor_mock( self, modyn_config: dict, @@ -81,6 +84,7 @@ def noop_model_downloader_constructor_mock( ) -> None: pass + def noop_dataloader_info_constructor_mock( self, pipeline_id: int, @@ -153,7 +157,7 @@ def test_inform_previous_model_id() -> None: def test_inform_always_drift(test_detect_drift) -> None: trigger = DataDriftTrigger({"data_points_for_detection": 1}) num_triggers = 0 - for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + for _ in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): num_triggers += 1 trigger.inform_previous_trigger_and_data_points(num_triggers, 42) trigger.inform_previous_model(num_triggers) @@ -162,7 +166,7 @@ def test_inform_always_drift(test_detect_drift) -> None: trigger = DataDriftTrigger({"data_points_for_detection": 2}) num_triggers = 0 - for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + for _ in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): num_triggers += 1 trigger.inform_previous_trigger_and_data_points(num_triggers, 42) trigger.inform_previous_model(num_triggers) @@ -171,7 +175,7 @@ def test_inform_always_drift(test_detect_drift) -> None: trigger = 
DataDriftTrigger({"data_points_for_detection": 5}) num_triggers = 0 - for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + for _ in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): num_triggers += 1 trigger.inform_previous_trigger_and_data_points(num_triggers, 42) trigger.inform_previous_model(num_triggers) @@ -183,7 +187,7 @@ def test_inform_always_drift(test_detect_drift) -> None: def test_inform_no_drift(test_detect_no_drift) -> None: trigger = DataDriftTrigger({"data_points_for_detection": 1}) num_triggers = 0 - for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + for _ in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): num_triggers += 1 trigger.inform_previous_trigger_and_data_points(num_triggers, 42) trigger.inform_previous_model(num_triggers) @@ -192,7 +196,7 @@ def test_inform_no_drift(test_detect_no_drift) -> None: trigger = DataDriftTrigger({"data_points_for_detection": 2}) num_triggers = 0 - for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + for _ in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): num_triggers += 1 trigger.inform_previous_trigger_and_data_points(num_triggers, 42) trigger.inform_previous_model(num_triggers) @@ -201,7 +205,7 @@ def test_inform_no_drift(test_detect_no_drift) -> None: trigger = DataDriftTrigger({"data_points_for_detection": 5}) num_triggers = 0 - for t in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): + for _ in trigger.inform([SAMPLE, SAMPLE, SAMPLE, SAMPLE, SAMPLE]): num_triggers += 1 trigger.inform_previous_trigger_and_data_points(num_triggers, 42) trigger.inform_previous_model(num_triggers) diff --git a/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_fixed_keys_dataset.py b/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_fixed_keys_dataset.py index 8904613d2..a1ff8fca9 100644 --- a/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_fixed_keys_dataset.py +++ b/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_fixed_keys_dataset.py @@ -1,19 +1,14 @@ # pylint: disable=unused-argument, no-name-in-module, no-value-for-parameter -import platform import math +import platform from unittest.mock import patch import grpc import pytest import torch -from modyn.supervisor.internal.triggers.trigger_datasets import FixedKeysDataset from modyn.models.tokenizers import DistilBertTokenizerTransform -from modyn.storage.internal.grpc.generated.storage_pb2 import ( - GetDataPerWorkerRequest, - GetDataPerWorkerResponse, - GetRequest, - GetResponse, -) +from modyn.storage.internal.grpc.generated.storage_pb2 import GetRequest, GetResponse +from modyn.supervisor.internal.triggers.trigger_datasets import FixedKeysDataset from torchvision import transforms DATASET_ID = "MNIST" @@ -83,7 +78,10 @@ def test_init(): @patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) -@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch( + "modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", + return_value=True, +) @patch.object(grpc, "insecure_channel", return_value=None) def test_init_grpc(test_insecure_channel, test_grpc_connection_established): fixed_keys_dataset = FixedKeysDataset( @@ -101,7 +99,10 @@ def test_init_grpc(test_insecure_channel, test_grpc_connection_established): 
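A note on the gRPC mocking pattern these dataset tests rely on: patching StorageStub inside the fixed_keys_dataset module means that when _init_grpc() instantiates the stub by name, it unknowingly constructs the mock, so the full iteration path runs without any storage server. The following standalone sketch illustrates the mechanism; RealStub, FakeStub, and init_grpc are illustrative names, not Modyn code:

from unittest.mock import patch


class RealStub:
    """Plays the role of the generated gRPC stub; unusable without a server."""

    def __init__(self, channel) -> None:
        raise RuntimeError("needs a live gRPC server")


class FakeStub:
    """Test double with the same interface as the stub it replaces."""

    def __init__(self, channel) -> None:
        pass

    def Get(self, keys):  # a server-streaming RPC is just an iterator of responses
        yield {"keys": list(keys), "labels": [1] * len(keys)}


def init_grpc():
    # Mirrors FixedKeysDataset._init_grpc(): the stub class is looked up by
    # name at call time, which is exactly what patching exploits.
    return RealStub(channel=None)


with patch(f"{__name__}.RealStub", FakeStub):
    stub = init_grpc()
    assert next(stub.Get([1, 2]))["labels"] == [1, 1]

The module path given to @patch in the tests targets the module that performs the lookup (fixed_keys_dataset), not the module that defines the stub; patching the definition site would leave the dataset's reference untouched.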
@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) -@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch( + "modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", + return_value=True, +) @patch.object(grpc, "insecure_channel", return_value=None) def test_get_data_from_storage(test_insecure_channel, test_grpc_connection_established): fixed_keys_dataset = FixedKeysDataset( @@ -182,7 +183,10 @@ def test__setup_composed_transform(serialized_transforms, transforms_list): @patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) -@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch( + "modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", + return_value=True, +) @patch.object(grpc, "insecure_channel", return_value=None) def test_dataset_iter(test_insecure_channel, test_grpc_connection_established): fixed_keys_dataset = FixedKeysDataset( @@ -201,7 +205,10 @@ def test_dataset_iter(test_insecure_channel, test_grpc_connection_established): @patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) -@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch( + "modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", + return_value=True, +) @patch.object(grpc, "insecure_channel", return_value=None) def test_dataset_iter_with_parsing(test_insecure_channel, test_grpc_connection_established): fixed_keys_dataset = FixedKeysDataset( @@ -220,7 +227,10 @@ def test_dataset_iter_with_parsing(test_insecure_channel, test_grpc_connection_e @patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) -@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch( + "modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", + return_value=True, +) @patch.object(grpc, "insecure_channel", return_value=None) def test_dataloader_dataset(test_insecure_channel, test_grpc_connection_established): fixed_keys_dataset = FixedKeysDataset( @@ -242,7 +252,10 @@ def test_dataloader_dataset(test_insecure_channel, test_grpc_connection_establis @patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.StorageStub", MockStorageStub) -@patch("modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", return_value=True) +@patch( + "modyn.supervisor.internal.triggers.trigger_datasets.fixed_keys_dataset.grpc_connection_established", + return_value=True, +) @patch.object(grpc, "insecure_channel", return_value=None) def test_dataloader_dataset_multi_worker(test_insecure_channel, test_grpc_connection_established): if platform.system() == "Darwin": diff --git a/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_online_trigger_dataset.py b/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_online_trigger_dataset.py index 7fb8737ee..cd352544d 100644 --- a/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_online_trigger_dataset.py +++ 
b/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_online_trigger_dataset.py @@ -6,8 +6,8 @@ from modyn.supervisor.internal.triggers.trigger_datasets import OnlineTriggerDataset from modyn.trainer_server.internal.dataset.online_dataset import OnlineDataset +NUM_SAMPLES = 10 -NUM_SAMPLES=10 def get_mock_bytes_parser(): return "def bytes_parser_function(x):\n\treturn x" From a6dcf684990c328c849a4b308ffcd017b2c835ad Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Mon, 13 May 2024 23:33:46 +0200 Subject: [PATCH 19/32] Address comments. Add docstrings --- .../internal/triggers/amounttrigger.py | 6 --- .../internal/triggers/datadrifttrigger.py | 54 ++++++++++--------- .../model_downloader/model_downloader.py | 2 +- .../internal/triggers/timetrigger.py | 6 --- modyn/supervisor/internal/triggers/trigger.py | 8 +-- .../trigger_datasets/dataloader_info.py | 2 +- .../trigger_datasets/fixed_keys_dataset.py | 8 +++ .../online_trigger_dataset.py | 8 +++ modyn/supervisor/internal/triggers/utils.py | 18 ++++--- .../triggers/test_datadrifttrigger.py | 1 - 10 files changed, 63 insertions(+), 50 deletions(-) diff --git a/modyn/supervisor/internal/triggers/amounttrigger.py b/modyn/supervisor/internal/triggers/amounttrigger.py index 16d0c67b3..fcd7994a3 100644 --- a/modyn/supervisor/internal/triggers/amounttrigger.py +++ b/modyn/supervisor/internal/triggers/amounttrigger.py @@ -29,9 +29,3 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N self.remaining_data_points = (self.remaining_data_points + len(new_data)) % self.data_points_for_trigger yield from triggering_indices - - def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data_points: int) -> None: - pass - - def inform_previous_model(self, previous_model_id: int) -> None: - pass diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index d4aca4ed0..6ce80a795 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -1,6 +1,4 @@ -import json import logging -import os import pathlib from typing import Generator, Optional @@ -13,7 +11,8 @@ from modyn.supervisor.internal.triggers.trigger import Trigger from modyn.supervisor.internal.triggers.trigger_datasets import DataLoaderInfo from modyn.supervisor.internal.triggers.utils import ( - get_embeddings_evidently_format, + convert_tensor_to_df, + get_embeddings, get_evidently_metrics, prepare_trigger_dataloader_by_trigger, prepare_trigger_dataloader_fixed_keys, @@ -44,9 +43,6 @@ def __init__(self, trigger_config: dict): self.evidently_column_mapping_name = "data" self.metrics: Optional[list] = None - self.exp_output_dir: Optional[pathlib.Path] = None - self.drift_dir: Optional[pathlib.Path] = None - self.data_cache: list[tuple[int, int, int]] = [] self.leftover_data_points = 0 @@ -128,14 +124,6 @@ def _init_model_downloader(self) -> None: f"{self.modyn_config['model_storage']['hostname']}:{self.modyn_config['model_storage']['port']}", ) - def _create_dirs(self) -> None: - assert self.pipeline_id is not None - assert self.base_dir is not None - - self.exp_output_dir = self.base_dir / str(self.pipeline_id) / "datadrift" - self.drift_dir = self.exp_output_dir / "drift_results" - os.makedirs(self.drift_dir, exist_ok=True) - def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path) -> None: self.pipeline_id = pipeline_id self.pipeline_config = 
pipeline_config @@ -144,13 +132,14 @@ def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: di self._init_dataloader_info() self._init_model_downloader() - self._create_dirs() def run_detection(self, reference_embeddings_df: pd.DataFrame, current_embeddings_df: pd.DataFrame) -> bool: assert self.dataloader_info is not None - assert self.drift_dir is not None # Run Evidently detection + # ColumnMapping is {mapping name: column indices}, + # an Evidently way of identifying (sub)columns to use in the detection. + # e.g. {"even columns": [0,2,4]}. column_mapping = ColumnMapping(embeddings={self.evidently_column_mapping_name: reference_embeddings_df.columns}) # https://docs.evidentlyai.com/user-guide/customization/embeddings-drift-parameters @@ -168,12 +157,15 @@ def run_detection(self, reference_embeddings_df: pd.DataFrame, current_embedding + f"[Result] {result_print}" ) - with open(self.drift_dir / f"drift_{self.previous_trigger_id}.json", "w", encoding="utf-8") as f: - json.dump(result, f) - return result["metrics"][0]["result"]["drift_detected"] def detect_drift(self, idx_start: int, idx_end: int) -> bool: + """Compare current data against reference data. + current data: all untriggered samples in the sliding window in inform(). + reference data: the training samples of the previous trigger. + Get the dataloaders, download the embedding encoder model if necessary, + compute embeddings of current and reference data, then run detection on the embeddings. + """ assert self.previous_trigger_id is not None assert self.previous_data_points is not None and self.previous_data_points > 0 assert self.previous_model_id is not None @@ -196,23 +188,36 @@ def detect_drift(self, idx_start: int, idx_end: int) -> bool: ) # Download previous model as embedding encoder - # TODO(366) Support custom model as embedding encoder + # TODO(417) Support custom model as embedding encoder if self.model_updated: self.model_downloader.download(self.previous_model_id) self.model_updated = False # Compute embeddings - reference_embeddings_df = get_embeddings_evidently_format(self.model_downloader, reference_dataloader) - current_embeddings_df = get_embeddings_evidently_format(self.model_downloader, current_dataloader) + reference_embeddings = get_embeddings(self.model_downloader, reference_dataloader) + current_embeddings = get_embeddings(self.model_downloader, current_dataloader) + reference_embeddings_df = convert_tensor_to_df(reference_embeddings, "col_") + current_embeddings_df = convert_tensor_to_df(current_embeddings, "col_") drift_detected = self.run_detection(reference_embeddings_df, current_embeddings_df) return drift_detected def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: - """ + """The DataDriftTrigger takes a batch of new data as input. It adds the new data to its data_cache. + Then, it iterates through the data_cache with a sliding window of current data for drift detection. + The sliding window is determined by two pointers: detection_idx_start and detection_idx_end. + We fix the start pointer and advance the end pointer by detection_interval in every iteration. + In every iteration we run data drift detection on the sliding window of current data. + Note, if we have remaining untriggered data from the previous batch of new data, + we include all of them in the first drift detection. + The remaining untriggered data has been processed in the previous new data batch, + so there's no need to run detection only on remaining data in this batch. 
+ If a retraining is triggered, all data in the sliding window becomes triggering data. Advance the start ptr. + After traversing the data_cache, we remove all the triggered data from the cache + and record the number of remaining untriggered samples. Use Generator here because this data drift trigger - needs to wait for the previous trigger to finish and get the model + needs to wait for the previous trigger to finish and get the model. """ # add new data to data_cache self.data_cache.extend(new_data) @@ -239,6 +244,7 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N if triggered: trigger_data_points = detection_idx_end - detection_idx_start + # Index of the last sample of the trigger. Index is relative to the new_data list. trigger_idx = len(new_data) - (untriggered_data_points - trigger_data_points) - 1 # update bookkeeping and sliding window diff --git a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py index 86f0464fa..5bd3b453b 100644 --- a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py +++ b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -# TODO(366) Unify similar code in trainer_server and evaluator. Create common.utils.ModelDownloader +# TODO(433) Unify similar code in trainer_server and evaluator. Create common.utils.ModelDownloader class ModelDownloader: def __init__( self, diff --git a/modyn/supervisor/internal/triggers/timetrigger.py b/modyn/supervisor/internal/triggers/timetrigger.py index 26bcdf775..f92e3e545 100644 --- a/modyn/supervisor/internal/triggers/timetrigger.py +++ b/modyn/supervisor/internal/triggers/timetrigger.py @@ -57,9 +57,3 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N self.next_trigger_at += self.trigger_every_s yield from triggering_indices - - def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data_points: int) -> None: - pass - - def inform_previous_model(self, previous_model_id: int) -> None: - pass diff --git a/modyn/supervisor/internal/triggers/trigger.py b/modyn/supervisor/internal/triggers/trigger.py index 50e847429..bccf300da 100644 --- a/modyn/supervisor/internal/triggers/trigger.py +++ b/modyn/supervisor/internal/triggers/trigger.py @@ -27,11 +27,13 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N triggering_indices (list[int]): List of all indices that trigger training """ - @abstractmethod + # pylint: disable=unnecessary-pass def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data_points: int) -> None: """The supervisor informs the Trigger about the previous trigger_id - and data points in the previous trigger""" + and data points in the previous trigger.""" + pass - @abstractmethod + # pylint: disable=unnecessary-pass def inform_previous_model(self, previous_model_id: int) -> None: """The supervisor informs the Trigger about the model_id of the previous trigger""" + pass diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/dataloader_info.py b/modyn/supervisor/internal/triggers/trigger_datasets/dataloader_info.py index 8d9cf7189..68a2a5b04 100644 --- a/modyn/supervisor/internal/triggers/trigger_datasets/dataloader_info.py +++ b/modyn/supervisor/internal/triggers/trigger_datasets/dataloader_info.py @@ -1,7 +1,7 @@ from typing import Optional -# TODO(366): Unify with similar classes 
in trainer_server and evaluator
+# TODO(415): Unify with similar classes in trainer_server and evaluator
 class DataLoaderInfo:
     def __init__(
         self,
diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py b/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py
index 2f98af0b2..fc094a3e0 100644
--- a/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py
+++ b/modyn/supervisor/internal/triggers/trigger_datasets/fixed_keys_dataset.py
@@ -23,7 +23,15 @@
 
 # TODO(#275): inherit common abstraction of dataset
 class FixedKeysDataset(IterableDataset):
+    """The FixedKeysDataset is created given a list of fixed sample keys. It fetches samples by the given sample keys.
+    It can be used when sample keys are known, but the corresponding trigger_id is unknown.
+    It can also be used when the user wants a dataset containing samples from multiple triggers, if the keys are known.
+    In DataDriftTrigger, for example, FixedKeysDataset is used for current untriggered samples
+    because they belong to a future trigger whose trigger_id is unknown.
+    """
+
     # pylint: disable=too-many-instance-attributes, abstract-method
+
     def __init__(
         self,
         dataset_id: str,
diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/online_trigger_dataset.py b/modyn/supervisor/internal/triggers/trigger_datasets/online_trigger_dataset.py
index 6f8993ada..0816bc2da 100644
--- a/modyn/supervisor/internal/triggers/trigger_datasets/online_trigger_dataset.py
+++ b/modyn/supervisor/internal/triggers/trigger_datasets/online_trigger_dataset.py
@@ -10,7 +10,15 @@
 
 # TODO(#275): inherit common abstraction of dataset
 class OnlineTriggerDataset(OnlineDataset, IterableDataset):
+    """The OnlineTriggerDataset is a wrapper around OnlineDataset in trainer_server.
+    It uses the logic in OnlineDataset to obtain samples by trigger_id.
+    It supports random sampling to reduce the number of samples if sample_prob is provided.
+    Random sampling is needed, for example, in DataDriftTrigger to reduce the number of samples
+    processed in data drift detection in case there are too many untriggered samples.
+    """
+
     # pylint: disable=too-many-instance-attributes, abstract-method
+
     def __init__(
         self,
         pipeline_id: int,
diff --git a/modyn/supervisor/internal/triggers/utils.py b/modyn/supervisor/internal/triggers/utils.py
index e848658e3..c3fb9fcfa 100644
--- a/modyn/supervisor/internal/triggers/utils.py
+++ b/modyn/supervisor/internal/triggers/utils.py
@@ -16,7 +16,10 @@
 def get_evidently_metrics(
     column_mapping_name: str, trigger_config: Optional[dict] = None
 ) -> list[EmbeddingsDriftMetric]:
-    """Evidently metric configurations follow exactly the four DriftMethods defined in embedding_drift_methods:
+    """This function instantiates an Evidently metric given a metric configuration.
+    If we want to support multiple metrics in the future, we can change the code to loop through the configurations.
+
+    Evidently metric configurations follow exactly the four DriftMethods defined in embedding_drift_methods:
     model, distance, mmd, ratio
     If metric_name not given, we use the default 'model' metric.
     Otherwise, we use the metric given by metric_name, with optional metric configuration specific to the metric.
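For orientation, the pieces configured here come together downstream as in the following self-contained sketch. It mirrors the ColumnMapping/Report flow used by run_detection in datadrifttrigger.py, but the toy data and the drift outcome are illustrative, and keyword details may vary with the Evidently version pinned in the repo:

import numpy as np
import pandas as pd
from evidently import ColumnMapping
from evidently.metrics import EmbeddingsDriftMetric
from evidently.report import Report

rng = np.random.default_rng(0)
cols = [f"col_{i}" for i in range(8)]
reference = pd.DataFrame(rng.normal(0.0, 1.0, (200, 8)), columns=cols)
current = pd.DataFrame(rng.normal(3.0, 1.0, (200, 8)), columns=cols)  # shifted distribution

# Map a name to the embedding columns, build a report with the default
# 'model' drift method, and read the verdict out of the result dict.
mapping = ColumnMapping(embeddings={"data": cols})
report = Report(metrics=[EmbeddingsDriftMetric("data")])
report.run(reference_data=reference, current_data=current, column_mapping=mapping)
print(report.as_dict()["metrics"][0]["result"]["drift_detected"])  # expect True here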
@@ -131,10 +134,9 @@ def get_embeddings(model_downloader: ModelDownloader, dataloader: DataLoader) -> return all_embeddings -def get_embeddings_evidently_format( - model_downloader: ModelDownloader, dataloader: torch.utils.data.DataLoader -) -> pd.DataFrame: - embeddings_numpy = get_embeddings(model_downloader, dataloader).cpu().detach().numpy() - embeddings_df = pd.DataFrame(embeddings_numpy).astype("float64") - embeddings_df.columns = ["col_" + str(x) for x in embeddings_df.columns] - return embeddings_df +def convert_tensor_to_df(t: torch.Tensor, column_name_prefix: Optional[str] = None) -> pd.DataFrame: + matrix_numpy = t.cpu().detach().numpy() + df = pd.DataFrame(matrix_numpy).astype("float64") + if column_name_prefix is not None: + df.columns = [column_name_prefix + str(x) for x in df.columns] + return df diff --git a/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py b/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py index cfd0ac173..2a986bd14 100644 --- a/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py +++ b/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py @@ -133,7 +133,6 @@ def test_init_trigger() -> None: assert trigger.pipeline_config == pipeline_config assert trigger.modyn_config == modyn_config assert trigger.base_dir == BASEDIR - assert trigger.drift_dir is not None assert isinstance(trigger.dataloader_info, DataLoaderInfo) assert isinstance(trigger.model_downloader, ModelDownloader) From 1fb40421196482edbbb1333adcf8affa2c4e9671 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Mon, 13 May 2024 23:50:35 +0200 Subject: [PATCH 20/32] Fix merge bugs --- .../pipeline_executor/pipeline_executor.py | 49 +++---------------- 1 file changed, 8 insertions(+), 41 deletions(-) diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index 4881581a9..a5392e35d 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -179,22 +179,15 @@ def _handle_new_data(self, new_data: list[tuple[int, int, int]]) -> bool: def _handle_new_data_batch(self, batch: list[tuple[int, int, int]]) -> bool: self._sw.start("trigger_inform", overwrite=True) - triggering_indices: Generator[int, None, None] = self.trigger.inform(batch) - num_triggers = self._handle_triggers_within_batch(batch, triggering_indices) - - logger.info(f"There are {num_triggers} triggers in this batch.") - self.pipeline_log["supervisor"]["num_triggers"] += num_triggers + triggering_indices = self.trigger.inform(batch) + num_triggers = len(triggering_indices) + self.pipeline_log["supervisor"]["num_triggers"] += len(triggering_indices) self.pipeline_log["supervisor"]["trigger_batch_times"].append( {"batch_size": len(batch), "time": self._sw.stop("trigger_inform"), "num_triggers": num_triggers} ) - if num_triggers == 0: - self._sw.start("selector_inform", overwrite=True) - selector_log = self.grpc.inform_selector(self.pipeline_id, batch) - self.pipeline_log["supervisor"]["selector_informs"].append( - {"total_selector_time": self._sw.stop(), "selector_log": selector_log} - ) - + logger.info(f"There are {num_triggers} triggers in this batch.") + self._handle_triggers_within_batch(batch, triggering_indices) return num_triggers > 0 def _run_training(self, trigger_id: int) -> None: @@ -284,12 +277,8 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg previous_trigger_idx = 
0 logger.info("Handling triggers within batch.") self._update_pipeline_stage_and_enqueue_msg(PipelineStage.HANDLE_TRIGGERS_WITHIN_BATCH, MsgType.GENERAL) - num_triggers_in_batch = 0 - last_trigger_id: Optional[int] = None - - for triggering_idx in triggering_indices: - num_triggers_in_batch += 1 + for i, triggering_idx in enumerate(triggering_indices): self._update_pipeline_stage_and_enqueue_msg(PipelineStage.INFORM_SELECTOR_AND_TRIGGER, MsgType.GENERAL) triggering_data = batch[previous_trigger_idx : triggering_idx + 1] previous_trigger_idx = triggering_idx + 1 @@ -320,33 +309,11 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg ) else: logger.info(f"Skipping training on empty trigger {trigger_id}]") + self.num_triggers = self.num_triggers + 1 if self.maximum_triggers is not None and self.num_triggers >= self.maximum_triggers: - return num_triggers_in_batch - - # If no other trigger is coming in this batch, - # we have to inform the Selector about the remaining data in this batch. - - remaining_data = batch[previous_trigger_idx:] - logger.info(f"There are {len(remaining_data)} data points remaining after the trigger.") - - if len(remaining_data) > 0 and last_trigger_id is not None: - # These data points will be included in the next trigger - # because we inform the Selector about them, - # just like other batches with no trigger at all are included. - self._update_pipeline_stage_and_enqueue_msg( - PipelineStage.INFORM_SELECTOR_REMAINING_DATA, - MsgType.ID, - id_submsg(IdType.TRIGGER, last_trigger_id), - ) - self._sw.start("selector_inform", overwrite=True) - selector_log = self.grpc.inform_selector(self.pipeline_id, remaining_data) - self.pipeline_log["supervisor"]["selector_informs"].append( - {"total_selector_time": self._sw.stop(), "selector_log": selector_log} - ) + break - self._persist_pipeline_log() - return num_triggers_in_batch # we have to inform the Selector about the remaining data in this batch. if len(triggering_indices) == 0: From d4ce734497f72ca8a28de1f22520923479e09537 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 01:04:32 +0200 Subject: [PATCH 21/32] Fix pipeline_executor --- modyn/config/schema/pipeline.py | 2 -- .../pipeline_executor/pipeline_executor.py | 31 +++++++++++++------ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/modyn/config/schema/pipeline.py b/modyn/config/schema/pipeline.py index 8f2dfff48..eaa161f4c 100644 --- a/modyn/config/schema/pipeline.py +++ b/modyn/config/schema/pipeline.py @@ -164,7 +164,6 @@ class _BaseSelectionStrategyConfig(BaseModel): class FreshnessSamplingStrategyConfig(_BaseSelectionStrategyConfig): - unused_data_ratio: float = Field( 0.0, description=( @@ -175,7 +174,6 @@ class FreshnessSamplingStrategyConfig(_BaseSelectionStrategyConfig): class NewDataSelectionStrategyConfig(_BaseSelectionStrategyConfig): - limit_reset: LimitResetStrategy = Field( description=( "Strategy to follow for respecting the limit in case of reset. Only used when reset_after_trigger == true." 
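The pipeline_executor change below restores trigger counting inside the consuming loop. The underlying constraint is that Trigger.inform is now a generator: it has no len(), can be consumed only once, and the executor still needs both the trigger count and the last triggering index to compute the remaining data. A toy illustration of the pattern the patch settles on (all names are illustrative):

from typing import Generator


def inform(batch: list[int]) -> Generator[int, None, None]:
    # Toy trigger: fire on every second sample.
    yield from range(1, len(batch), 2)


batch = [10, 11, 12, 13, 14]
indices = inform(batch)
# len(indices) would raise TypeError: object of type 'generator' has no len()

triggering_idx_list: list[int] = []
num_triggers = 0
for idx in indices:  # count while consuming, keeping indices for later use
    triggering_idx_list.append(idx)
    num_triggers += 1

remaining = batch if not triggering_idx_list else batch[triggering_idx_list[-1] + 1 :]
print(num_triggers, remaining)  # 2 [14]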
diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index a5392e35d..a0de8236b 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -179,15 +179,22 @@ def _handle_new_data(self, new_data: list[tuple[int, int, int]]) -> bool: def _handle_new_data_batch(self, batch: list[tuple[int, int, int]]) -> bool: self._sw.start("trigger_inform", overwrite=True) - triggering_indices = self.trigger.inform(batch) - num_triggers = len(triggering_indices) - self.pipeline_log["supervisor"]["num_triggers"] += len(triggering_indices) + triggering_indices: Generator[int, None, None] = self.trigger.inform(batch) + num_triggers = self._handle_triggers_within_batch(batch, triggering_indices) + + logger.info(f"There are {num_triggers} triggers in this batch.") + self.pipeline_log["supervisor"]["num_triggers"] += num_triggers self.pipeline_log["supervisor"]["trigger_batch_times"].append( {"batch_size": len(batch), "time": self._sw.stop("trigger_inform"), "num_triggers": num_triggers} ) - logger.info(f"There are {num_triggers} triggers in this batch.") - self._handle_triggers_within_batch(batch, triggering_indices) + if num_triggers == 0: + self._sw.start("selector_inform", overwrite=True) + selector_log = self.grpc.inform_selector(self.pipeline_id, batch) + self.pipeline_log["supervisor"]["selector_informs"].append( + {"total_selector_time": self._sw.stop(), "selector_log": selector_log} + ) + return num_triggers > 0 def _run_training(self, trigger_id: int) -> None: @@ -273,12 +280,15 @@ def _get_trigger_timespan( return first_timestamp, last_timestamp - def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], triggering_indices: list[int]) -> None: + def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], triggering_indices: list[int]) -> int: previous_trigger_idx = 0 logger.info("Handling triggers within batch.") self._update_pipeline_stage_and_enqueue_msg(PipelineStage.HANDLE_TRIGGERS_WITHIN_BATCH, MsgType.GENERAL) + num_triggers = 0 + last_trigger_id: Optional[int] = None for i, triggering_idx in enumerate(triggering_indices): + num_triggers += 1 self._update_pipeline_stage_and_enqueue_msg(PipelineStage.INFORM_SELECTOR_AND_TRIGGER, MsgType.GENERAL) triggering_data = batch[previous_trigger_idx : triggering_idx + 1] previous_trigger_idx = triggering_idx + 1 @@ -298,6 +308,7 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg num_samples_in_trigger = self.grpc.get_number_of_samples(self.pipeline_id, trigger_id) if num_samples_in_trigger > 0: + self.trigger.inform_previous_trigger_and_data_points(trigger_id, num_samples_in_trigger) first_timestamp, last_timestamp = self._get_trigger_timespan(i == 0, triggering_data) self.pipeline_log["supervisor"]["triggers"][trigger_id]["first_timestamp"] = first_timestamp self.pipeline_log["supervisor"]["triggers"][trigger_id]["last_timestamp"] = last_timestamp @@ -312,8 +323,7 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg self.num_triggers = self.num_triggers + 1 if self.maximum_triggers is not None and self.num_triggers >= self.maximum_triggers: - break - + return num_triggers # we have to inform the Selector about the remaining data in this batch. 
if len(triggering_indices) == 0: @@ -322,7 +332,7 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg remaining_data = batch[triggering_indices[-1] + 1 :] logger.info(f"There are {len(remaining_data)} data points remaining after the trigger.") - if len(remaining_data) > 0: + if len(remaining_data) > 0 and last_trigger_id is not None: # These data points will be included in the next trigger # because we inform the Selector about them, # just like other batches with no trigger at all are included. @@ -339,6 +349,9 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg else: self.remaining_data_range = None + self._persist_pipeline_log() + return num_triggers + def _init_evaluation_writer(self, name: str, trigger_id: int) -> LogResultWriter: return self.supervisor_supported_eval_result_writers[name](self.pipeline_id, trigger_id, self.eval_directory) From f3532f49761fa4652ead92d3a2080c86d35dbe3e Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 01:39:47 +0200 Subject: [PATCH 22/32] Fix mypy and unittest --- .../pipeline_executor/pipeline_executor.py | 18 +++++++++++------- .../internal/triggers/datadrifttrigger.py | 4 ++++ .../model_downloader/model_downloader.py | 1 + 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index a0de8236b..fcd23b883 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -114,7 +114,8 @@ def _setup_trigger(self) -> None: trigger_module = dynamic_module_import("modyn.supervisor.internal.triggers") self.trigger: Trigger = getattr(trigger_module, trigger_id)(trigger_config) self.trigger.init_trigger(self.pipeline_id, self.pipeline_config, self.modyn_config, self.eval_directory) - self.trigger.inform_previous_model(self.previous_model_id) + if self.previous_model_id is not None: + self.trigger.inform_previous_model(self.previous_model_id) assert self.trigger is not None, "Error during trigger initialization" @@ -280,14 +281,18 @@ def _get_trigger_timespan( return first_timestamp, last_timestamp - def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], triggering_indices: list[int]) -> int: + def _handle_triggers_within_batch( + self, batch: list[tuple[int, int, int]], triggering_indices: Generator[int, None, None] + ) -> int: previous_trigger_idx = 0 logger.info("Handling triggers within batch.") self._update_pipeline_stage_and_enqueue_msg(PipelineStage.HANDLE_TRIGGERS_WITHIN_BATCH, MsgType.GENERAL) num_triggers = 0 - last_trigger_id: Optional[int] = None + + triggering_idx_list = [] for i, triggering_idx in enumerate(triggering_indices): + triggering_idx_list.append(triggering_idx) num_triggers += 1 self._update_pipeline_stage_and_enqueue_msg(PipelineStage.INFORM_SELECTOR_AND_TRIGGER, MsgType.GENERAL) triggering_data = batch[previous_trigger_idx : triggering_idx + 1] @@ -299,7 +304,6 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg # that all data until that point has been processed by the selector. 
self._sw.start("selector_inform", overwrite=True) trigger_id, selector_log = self.grpc.inform_selector_and_trigger(self.pipeline_id, triggering_data) - last_trigger_id = trigger_id self.pipeline_log["supervisor"]["triggers"][trigger_id] = { "total_selector_time": self._sw.stop(), "selector_log": selector_log, @@ -326,13 +330,13 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg return num_triggers # we have to inform the Selector about the remaining data in this batch. - if len(triggering_indices) == 0: + if len(triggering_idx_list) == 0: remaining_data = batch else: - remaining_data = batch[triggering_indices[-1] + 1 :] + remaining_data = batch[triggering_idx_list[-1] + 1 :] logger.info(f"There are {len(remaining_data)} data points remaining after the trigger.") - if len(remaining_data) > 0 and last_trigger_id is not None: + if len(remaining_data) > 0: # These data points will be included in the next trigger # because we inform the Selector about them, # just like other batches with no trigger at all are included. diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index 6ce80a795..204622446 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -63,6 +63,7 @@ def _parse_trigger_config(self, trigger_config: dict) -> None: self.metrics = get_evidently_metrics(self.evidently_column_mapping_name, trigger_config) def _init_dataloader_info(self) -> None: + assert self.pipeline_id is not None assert self.pipeline_config is not None assert self.modyn_config is not None @@ -113,8 +114,10 @@ def _init_dataloader_info(self) -> None: ) def _init_model_downloader(self) -> None: + assert self.pipeline_id is not None assert self.pipeline_config is not None assert self.modyn_config is not None + assert self.base_dir is not None self.model_downloader = ModelDownloader( self.modyn_config, @@ -180,6 +183,7 @@ def detect_drift(self, idx_start: int, idx_end: int) -> bool: ) current_keys, _, _ = zip(*self.data_cache[idx_start:idx_end]) # type: ignore + current_keys: list[int] = list(current_keys) current_dataloader = prepare_trigger_dataloader_fixed_keys( self.previous_trigger_id + 1, self.dataloader_info, diff --git a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py index 5bd3b453b..7f6898058 100644 --- a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py +++ b/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py @@ -103,4 +103,5 @@ def download(self, model_id: int) -> None: identifier=self.pipeline_id, base_directory=self.base_dir, ) + assert trained_model_path is not None self._load_state(trained_model_path) From 81263987256ce66b54e07f56a71ea2320279d503 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 01:48:41 +0200 Subject: [PATCH 23/32] Fix mypy --- modyn/supervisor/internal/triggers/datadrifttrigger.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index 204622446..8ed6cfcc3 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -183,13 +183,12 @@ def detect_drift(self, idx_start: int, idx_end: int) -> bool: ) current_keys, _, _ = 
zip(*self.data_cache[idx_start:idx_end]) # type: ignore - current_keys: list[int] = list(current_keys) current_dataloader = prepare_trigger_dataloader_fixed_keys( self.previous_trigger_id + 1, self.dataloader_info, current_keys, sample_size=self.sample_size, - ) + ) # type: ignore # Download previous model as embedding encoder # TODO(417) Support custom model as embedding encoder From d8d4c5a87e3582a260267ed79e511ba425b0a6b0 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 02:57:30 +0200 Subject: [PATCH 24/32] Fix unittest --- .../internal/pipeline_executor/pipeline_executor.py | 8 -------- modyn/supervisor/internal/triggers/datadrifttrigger.py | 2 +- .../internal/pipeline_executor/test_pipeline_executor.py | 7 ++++++- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index fcd23b883..05d4b3292 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -189,13 +189,6 @@ def _handle_new_data_batch(self, batch: list[tuple[int, int, int]]) -> bool: {"batch_size": len(batch), "time": self._sw.stop("trigger_inform"), "num_triggers": num_triggers} ) - if num_triggers == 0: - self._sw.start("selector_inform", overwrite=True) - selector_log = self.grpc.inform_selector(self.pipeline_id, batch) - self.pipeline_log["supervisor"]["selector_informs"].append( - {"total_selector_time": self._sw.stop(), "selector_log": selector_log} - ) - return num_triggers > 0 def _run_training(self, trigger_id: int) -> None: @@ -353,7 +346,6 @@ def _handle_triggers_within_batch( else: self.remaining_data_range = None - self._persist_pipeline_log() return num_triggers def _init_evaluation_writer(self, name: str, trigger_id: int) -> LogResultWriter: diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index 8ed6cfcc3..872abd66e 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -188,7 +188,7 @@ def detect_drift(self, idx_start: int, idx_end: int) -> bool: self.dataloader_info, current_keys, sample_size=self.sample_size, - ) # type: ignore + ) # type: ignore # Download previous model as embedding encoder # TODO(417) Support custom model as embedding encoder diff --git a/modyn/tests/supervisor/internal/pipeline_executor/test_pipeline_executor.py b/modyn/tests/supervisor/internal/pipeline_executor/test_pipeline_executor.py index 3da959e25..5a102d616 100644 --- a/modyn/tests/supervisor/internal/pipeline_executor/test_pipeline_executor.py +++ b/modyn/tests/supervisor/internal/pipeline_executor/test_pipeline_executor.py @@ -301,7 +301,12 @@ def test__handle_new_data_batch_no_triggers(test_inform_selector: MagicMock): pe.pipeline_id = 42 batch = [(10, 1), (11, 2)] - with patch.object(pe.trigger, "inform", return_value=[]) as inform_mock: + with patch.object(pe.trigger, "inform") as inform_mock: + + def fake(self): + yield from [] + + inform_mock.side_effect = fake assert not pe._handle_new_data_batch(batch) inform_mock.assert_called_once_with(batch) From 5e1e98494a5099c870ee3efe6e98b9a3d1b43de9 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 09:11:20 +0200 Subject: [PATCH 25/32] Fix mypy --- modyn/supervisor/internal/triggers/datadrifttrigger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index 872abd66e..2fde80038 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -186,9 +186,9 @@ def detect_drift(self, idx_start: int, idx_end: int) -> bool: current_dataloader = prepare_trigger_dataloader_fixed_keys( self.previous_trigger_id + 1, self.dataloader_info, - current_keys, + current_keys, # type: ignore sample_size=self.sample_size, - ) # type: ignore + ) # Download previous model as embedding encoder # TODO(417) Support custom model as embedding encoder From 55797c1c12e22c1c2eca7f27e2f4eadfd041b4b3 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 10:12:21 +0200 Subject: [PATCH 26/32] Fix linting --- modyn/supervisor/internal/triggers/datadrifttrigger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index 2fde80038..a1531640b 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -186,7 +186,7 @@ def detect_drift(self, idx_start: int, idx_end: int) -> bool: current_dataloader = prepare_trigger_dataloader_fixed_keys( self.previous_trigger_id + 1, self.dataloader_info, - current_keys, # type: ignore + current_keys, # type: ignore sample_size=self.sample_size, ) From 16d07808567b8d40963c1089ce914c4e81fc50f9 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 14:31:55 +0200 Subject: [PATCH 27/32] Add embedding encoder utils. Change model downloader to embedding encoder downloader. --- .../internal/triggers/datadrifttrigger.py | 24 ++++---- .../__init__.py | 3 +- .../embedding_encoder.py | 49 +++++++++++++++ .../embedding_encoder_downloader.py} | 60 +++++-------------- modyn/supervisor/internal/triggers/utils.py | 24 ++++---- .../triggers/test_datadrifttrigger.py | 9 ++- 6 files changed, 96 insertions(+), 73 deletions(-) rename modyn/supervisor/internal/triggers/{model_downloader => embedding_encoder_utils}/__init__.py (62%) create mode 100644 modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py rename modyn/supervisor/internal/triggers/{model_downloader/model_downloader.py => embedding_encoder_utils/embedding_encoder_downloader.py} (61%) diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py index a1531640b..a6b375dd8 100644 --- a/modyn/supervisor/internal/triggers/datadrifttrigger.py +++ b/modyn/supervisor/internal/triggers/datadrifttrigger.py @@ -5,7 +5,7 @@ import pandas as pd from evidently import ColumnMapping from evidently.report import Report -from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader +from modyn.supervisor.internal.triggers.embedding_encoder_utils import EmbeddingEncoder, EmbeddingEncoderDownloader # pylint: disable-next=no-name-in-module from modyn.supervisor.internal.triggers.trigger import Trigger @@ -36,7 +36,8 @@ def __init__(self, trigger_config: dict): self.model_updated: bool = False self.dataloader_info: Optional[DataLoaderInfo] = None - self.model_downloader: Optional[ModelDownloader] = None + self.encoder_downloader: Optional[EmbeddingEncoderDownloader] = None + self.embedding_encoder: Optional[EmbeddingEncoder] = None self.detection_interval: int = 1000 self.sample_size: Optional[int] = None @@ -113,16 
+114,15 @@ def _init_dataloader_info(self) -> None: tokenizer=tokenizer, ) - def _init_model_downloader(self) -> None: + def _init_encoder_downloader(self) -> None: assert self.pipeline_id is not None assert self.pipeline_config is not None assert self.modyn_config is not None assert self.base_dir is not None - self.model_downloader = ModelDownloader( + self.encoder_downloader = EmbeddingEncoderDownloader( self.modyn_config, self.pipeline_id, - self.pipeline_config["training"]["device"], self.base_dir, f"{self.modyn_config['model_storage']['hostname']}:{self.modyn_config['model_storage']['port']}", ) @@ -134,7 +134,7 @@ def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: di self.base_dir = base_dir self._init_dataloader_info() - self._init_model_downloader() + self._init_encoder_downloader() def run_detection(self, reference_embeddings_df: pd.DataFrame, current_embeddings_df: pd.DataFrame) -> bool: assert self.dataloader_info is not None @@ -173,7 +173,8 @@ def detect_drift(self, idx_start: int, idx_end: int) -> bool: assert self.previous_data_points is not None and self.previous_data_points > 0 assert self.previous_model_id is not None assert self.dataloader_info is not None - assert self.model_downloader is not None + assert self.encoder_downloader is not None + assert self.pipeline_config is not None reference_dataloader = prepare_trigger_dataloader_by_trigger( self.previous_trigger_id, @@ -193,12 +194,15 @@ def detect_drift(self, idx_start: int, idx_end: int) -> bool: # Download previous model as embedding encoder # TODO(417) Support custom model as embedding encoder if self.model_updated: - self.model_downloader.download(self.previous_model_id) + self.embedding_encoder = self.encoder_downloader.setup_encoder( + self.previous_model_id, self.pipeline_config["training"]["device"] + ) self.model_updated = False # Compute embeddings - reference_embeddings = get_embeddings(self.model_downloader, reference_dataloader) - current_embeddings = get_embeddings(self.model_downloader, current_dataloader) + assert self.embedding_encoder is not None + reference_embeddings = get_embeddings(self.embedding_encoder, reference_dataloader) + current_embeddings = get_embeddings(self.embedding_encoder, current_dataloader) reference_embeddings_df = convert_tensor_to_df(reference_embeddings, "col_") current_embeddings_df = convert_tensor_to_df(current_embeddings, "col_") diff --git a/modyn/supervisor/internal/triggers/model_downloader/__init__.py b/modyn/supervisor/internal/triggers/embedding_encoder_utils/__init__.py similarity index 62% rename from modyn/supervisor/internal/triggers/model_downloader/__init__.py rename to modyn/supervisor/internal/triggers/embedding_encoder_utils/__init__.py index 6e319ec2a..982eaa1ca 100644 --- a/modyn/supervisor/internal/triggers/model_downloader/__init__.py +++ b/modyn/supervisor/internal/triggers/embedding_encoder_utils/__init__.py @@ -4,7 +4,8 @@ import os -from .model_downloader import ModelDownloader # noqa: F401 +from .embedding_encoder import EmbeddingEncoder # noqa: F401 +from .embedding_encoder_downloader import EmbeddingEncoderDownloader # noqa: F401 files = os.listdir(os.path.dirname(__file__)) files.remove("__init__.py") diff --git a/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py new file mode 100644 index 000000000..dfcd5014c --- /dev/null +++ 
b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py @@ -0,0 +1,49 @@ +import io +import json +import logging +import pathlib + +import torch +from modyn.models.coreset_methods_support import CoresetSupportingModule +from modyn.utils import dynamic_module_import + +logger = logging.getLogger(__name__) + + +class EmbeddingEncoder: + def __init__( + self, + model_id: int, + model_class_name: str, + model_config: str, + device: str, + amp: bool, + ): + self.model_id = model_id + self.device = device + self.device_type = "cuda" if "cuda" in self.device else "cpu" + self.amp = amp + + self.model_class_name = model_class_name + model_module = dynamic_module_import("modyn.models") + self.model_handler = getattr(model_module, model_class_name) + assert self.model_handler is not None + + self.model_configuration_dict = json.loads(model_config) + + self._model = self.model_handler(self.model_configuration_dict, device, amp) + assert self._model is not None + assert isinstance(self._model.model, CoresetSupportingModule) + + def _load_state(self, path: pathlib.Path) -> None: + assert path.exists(), "Cannot load state from non-existing file" + + logger.info(f"Loading model state from {path}") + with open(path, "rb") as state_file: + checkpoint = torch.load(io.BytesIO(state_file.read()), map_location=torch.device("cpu")) + + assert "model" in checkpoint + self._model.model.load_state_dict(checkpoint["model"]) + + # delete trained model from disk + path.unlink() diff --git a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py similarity index 61% rename from modyn/supervisor/internal/triggers/model_downloader/model_downloader.py rename to modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py index 7f6898058..4ad9efdc0 100644 --- a/modyn/supervisor/internal/triggers/model_downloader/model_downloader.py +++ b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py @@ -1,11 +1,8 @@ -import io -import json import logging import pathlib from typing import Optional import grpc -import torch from modyn.common.ftp import download_trained_model from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models import TrainedModel @@ -13,37 +10,26 @@ # pylint: disable-next=no-name-in-module from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import FetchModelRequest, FetchModelResponse from modyn.model_storage.internal.grpc.generated.model_storage_pb2_grpc import ModelStorageStub -from modyn.models.coreset_methods_support import CoresetSupportingModule -from modyn.utils import dynamic_module_import +from modyn.supervisor.internal.triggers.embedding_encoder_utils import EmbeddingEncoder from modyn.utils.utils import grpc_connection_established logger = logging.getLogger(__name__) -# TODO(433) Unify similar code in trainer_server and evaluator. 
Create common.utils.ModelDownloader -class ModelDownloader: +class EmbeddingEncoderDownloader: def __init__( self, modyn_config: dict, pipeline_id: int, - device: str, base_dir: pathlib.Path, model_storage_address: str, ): self.modyn_config = modyn_config self.pipeline_id = pipeline_id - self._device = device - self._device_type = "cuda" if "cuda" in self._device else "cpu" self.base_dir = base_dir assert self.base_dir.exists(), f"Temporary Directory {self.base_dir} should have been created." - self._model_storage_stub = self.connect_to_model_storage(model_storage_address) - self.model_configuration_dict: Optional[dict] = None - self.model_handler = None - self._amp: Optional[bool] = None - self._model = None - @staticmethod def connect_to_model_storage(model_storage_address: str) -> ModelStorageStub: model_storage_channel = grpc.insecure_channel(model_storage_address) @@ -54,41 +40,18 @@ def connect_to_model_storage(model_storage_address: str) -> ModelStorageStub: ) return ModelStorageStub(model_storage_channel) - def _load_state(self, path: pathlib.Path) -> None: - assert path.exists(), "Cannot load state from non-existing file" - assert self._model is not None - - logger.info(f"Loading model state from {path}") - with open(path, "rb") as state_file: - checkpoint = torch.load(io.BytesIO(state_file.read()), map_location=torch.device("cpu")) - - assert "model" in checkpoint - self._model.model.load_state_dict(checkpoint["model"]) - - # delete trained model from disk - path.unlink() - - def configure(self, model_id: int) -> None: + def configure(self, model_id: int, device: str) -> Optional[EmbeddingEncoder]: with MetadataDatabaseConnection(self.modyn_config) as database: trained_model: Optional[TrainedModel] = database.session.get(TrainedModel, model_id) if not trained_model: logger.error(f"Trained model {model_id} does not exist!") - return + return None model_class_name, model_config, amp = database.get_model_configuration(trained_model.pipeline_id) - self._amp = amp - model_module = dynamic_module_import("modyn.models") - self.model_handler = getattr(model_module, model_class_name) - assert self.model_handler is not None - - self.model_configuration_dict = json.loads(model_config) - self._model = self.model_handler(self.model_configuration_dict, self._device, amp) - - assert self._model is not None - assert isinstance(self._model.model, CoresetSupportingModule) - def download(self, model_id: int) -> None: - self.configure(model_id) + embedding_encoder = EmbeddingEncoder(model_id, model_class_name, model_config, device, amp) + return embedding_encoder + def download(self, model_id: int) -> pathlib.Path: fetch_request = FetchModelRequest(model_id=model_id, load_metadata=False) fetch_resp: FetchModelResponse = self._model_storage_stub.FetchModel(fetch_request) @@ -104,4 +67,11 @@ def download(self, model_id: int) -> None: base_directory=self.base_dir, ) assert trained_model_path is not None - self._load_state(trained_model_path) + return trained_model_path + + def setup_encoder(self, model_id: int, device: str) -> EmbeddingEncoder: + embedding_encoder = self.configure(model_id, device) + assert embedding_encoder is not None + trained_model_path = self.download(model_id) + embedding_encoder._load_state(trained_model_path) + return embedding_encoder diff --git a/modyn/supervisor/internal/triggers/utils.py b/modyn/supervisor/internal/triggers/utils.py index c3fb9fcfa..9b087fc70 100644 --- a/modyn/supervisor/internal/triggers/utils.py +++ b/modyn/supervisor/internal/triggers/utils.py @@ -6,7 
+6,7 @@ import torch from evidently.metrics import EmbeddingsDriftMetric from evidently.metrics.data_drift import embedding_drift_methods -from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader +from modyn.supervisor.internal.triggers.embedding_encoder_utils import EmbeddingEncoder from modyn.supervisor.internal.triggers.trigger_datasets import DataLoaderInfo, FixedKeysDataset, OnlineTriggerDataset from torch.utils.data import DataLoader @@ -98,38 +98,38 @@ def prepare_trigger_dataloader_fixed_keys( return DataLoader(train_set, batch_size=dataloader_info.batch_size, num_workers=dataloader_info.num_dataloaders) -def get_embeddings(model_downloader: ModelDownloader, dataloader: DataLoader) -> torch.Tensor: +def get_embeddings(embedding_encoder: EmbeddingEncoder, dataloader: DataLoader) -> torch.Tensor: """ - input: model_downloader with downloaded model + input: embedding_encoder with downloaded model output: embeddings Tensor """ - assert model_downloader._model is not None + assert embedding_encoder._model is not None all_embeddings: Optional[torch.Tensor] = None - model_downloader._model.model.eval() - model_downloader._model.model.embedding_recorder.start_recording() + embedding_encoder._model.model.eval() + embedding_encoder._model.model.embedding_recorder.start_recording() with torch.no_grad(): for batch in dataloader: data: Union[torch.Tensor, dict] if isinstance(batch[1], torch.Tensor): - data = batch[1].to(model_downloader._device) + data = batch[1].to(embedding_encoder.device) elif isinstance(batch[1], dict): data: dict[str, torch.Tensor] = {} # type: ignore[no-redef] for name, tensor in batch[1].items(): - data[name] = tensor.to(model_downloader._device) + data[name] = tensor.to(embedding_encoder.device) else: raise ValueError(f"data type {type(batch[1])} not supported") - with torch.autocast(model_downloader._device_type, enabled=model_downloader._amp): - model_downloader._model.model(data) - embeddings = model_downloader._model.model.embedding_recorder.embedding + with torch.autocast(embedding_encoder.device_type, enabled=embedding_encoder.amp): + embedding_encoder._model.model(data) + embeddings = embedding_encoder._model.model.embedding_recorder.embedding if all_embeddings is None: all_embeddings = embeddings else: all_embeddings = torch.cat((all_embeddings, embeddings), 0) - model_downloader._model.model.embedding_recorder.end_recording() + embedding_encoder._model.model.embedding_recorder.end_recording() return all_embeddings diff --git a/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py b/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py index 2a986bd14..6ab793f29 100644 --- a/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py +++ b/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py @@ -6,7 +6,7 @@ import pytest from modyn.supervisor.internal.triggers import DataDriftTrigger -from modyn.supervisor.internal.triggers.model_downloader import ModelDownloader +from modyn.supervisor.internal.triggers.embedding_encoder_utils import EmbeddingEncoderDownloader from modyn.supervisor.internal.triggers.trigger_datasets import DataLoaderInfo BASEDIR: pathlib.Path = pathlib.Path(os.path.realpath(__file__)).parent / "test_eval_dir" @@ -74,11 +74,10 @@ def noop(self) -> None: pass -def noop_model_downloader_constructor_mock( +def noop_embedding_encoder_downloader_constructor_mock( self, modyn_config: dict, pipeline_id: int, - device: str, base_dir: pathlib.Path, model_storage_address: str, ) -> None: @@ 
-120,7 +119,7 @@ def test_init_fails_if_invalid() -> None: DataDriftTrigger({"sample_size": 0}) -@patch.object(ModelDownloader, "__init__", noop_model_downloader_constructor_mock) +@patch.object(EmbeddingEncoderDownloader, "__init__", noop_embedding_encoder_downloader_constructor_mock) @patch.object(DataLoaderInfo, "__init__", noop_dataloader_info_constructor_mock) def test_init_trigger() -> None: trigger = DataDriftTrigger(get_minimal_trigger_config()) @@ -134,7 +133,7 @@ def test_init_trigger() -> None: assert trigger.modyn_config == modyn_config assert trigger.base_dir == BASEDIR assert isinstance(trigger.dataloader_info, DataLoaderInfo) - assert isinstance(trigger.model_downloader, ModelDownloader) + assert isinstance(trigger.encoder_downloader, EmbeddingEncoderDownloader) def test_inform_previous_trigger_and_data_points() -> None: From f25f77fce29a14da2317d20046c0bd80f85e3821 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 14:38:55 +0200 Subject: [PATCH 28/32] Add docstrings. --- .../triggers/embedding_encoder_utils/embedding_encoder.py | 4 ++++ .../embedding_encoder_utils/embedding_encoder_downloader.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py index dfcd5014c..d7e2eab7d 100644 --- a/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py +++ b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py @@ -11,6 +11,9 @@ class EmbeddingEncoder: + """The EmbeddingEncoder stores a model and its metadata. + DataDriftTrigger uses EmbeddingEncoder to run the model. + """ def __init__( self, model_id: int, @@ -33,6 +36,7 @@ def __init__( self._model = self.model_handler(self.model_configuration_dict, device, amp) assert self._model is not None + # The model must be able to record embeddings. assert isinstance(self._model.model, CoresetSupportingModule) def _load_state(self, path: pathlib.Path) -> None: diff --git a/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py index 4ad9efdc0..a1d363d98 100644 --- a/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py +++ b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py @@ -17,6 +17,10 @@ class EmbeddingEncoderDownloader: + """The embedding encoder downloader provides a simple interface setup_encoder() to the DataDriftTrigger. + Given a model_id and a device, it creates an EmbeddingEncoder, + downloads model parameters and loads model state. 
+ """ def __init__( self, modyn_config: dict, From 3dee498d2e9e72df11b22112aaf7291bd638b72d Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 14:42:54 +0200 Subject: [PATCH 29/32] Fix linting --- .../triggers/embedding_encoder_utils/embedding_encoder.py | 1 + .../embedding_encoder_utils/embedding_encoder_downloader.py | 1 + 2 files changed, 2 insertions(+) diff --git a/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py index d7e2eab7d..45b53c675 100644 --- a/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py +++ b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder.py @@ -14,6 +14,7 @@ class EmbeddingEncoder: """The EmbeddingEncoder stores a model and its metadata. DataDriftTrigger uses EmbeddingEncoder to run the model. """ + def __init__( self, model_id: int, diff --git a/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py index a1d363d98..cde0c9167 100644 --- a/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py +++ b/modyn/supervisor/internal/triggers/embedding_encoder_utils/embedding_encoder_downloader.py @@ -21,6 +21,7 @@ class EmbeddingEncoderDownloader: Given a model_id and a device, it creates an EmbeddingEncoder, downloads model parameters and loads model state. """ + def __init__( self, modyn_config: dict, From cfe7a0576c0301036eadf62de1ca166c307c3c1a Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 15:59:19 +0200 Subject: [PATCH 30/32] make init_trigger non-abstract --- modyn/supervisor/internal/triggers/amounttrigger.py | 2 -- modyn/supervisor/internal/triggers/timetrigger.py | 2 -- modyn/supervisor/internal/triggers/trigger.py | 3 ++- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/modyn/supervisor/internal/triggers/amounttrigger.py b/modyn/supervisor/internal/triggers/amounttrigger.py index fcd7994a3..972be0d87 100644 --- a/modyn/supervisor/internal/triggers/amounttrigger.py +++ b/modyn/supervisor/internal/triggers/amounttrigger.py @@ -17,8 +17,6 @@ def __init__(self, trigger_config: dict): super().__init__(trigger_config) - def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path) -> None: - pass def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: assert self.remaining_data_points < self.data_points_for_trigger, "Inconsistent remaining datapoints" diff --git a/modyn/supervisor/internal/triggers/timetrigger.py b/modyn/supervisor/internal/triggers/timetrigger.py index f92e3e545..422854fc3 100644 --- a/modyn/supervisor/internal/triggers/timetrigger.py +++ b/modyn/supervisor/internal/triggers/timetrigger.py @@ -26,8 +26,6 @@ def __init__(self, trigger_config: dict): super().__init__(trigger_config) - def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path) -> None: - pass def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: if self.next_trigger_at is None: diff --git a/modyn/supervisor/internal/triggers/trigger.py b/modyn/supervisor/internal/triggers/trigger.py index bccf300da..097e37d1b 100644 --- a/modyn/supervisor/internal/triggers/trigger.py +++ b/modyn/supervisor/internal/triggers/trigger.py @@ -7,11 +7,12 @@ class Trigger(ABC): def 
__init__(self, trigger_config: dict) -> None: assert trigger_config is not None, "trigger_config cannot be None." - @abstractmethod + # pylint: disable=unnecessary-pass def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path) -> None: """The supervisor initializes the concrete Trigger with Trigger-type-specific configurations base_dir: the base directory to store Trigger outputs. A location at the supervisor. """ + pass @abstractmethod def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: From 03cc675568b1b5668053233429c823b187b3edbb Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 16:16:24 +0200 Subject: [PATCH 31/32] Fix linting --- modyn/supervisor/internal/triggers/amounttrigger.py | 2 -- modyn/supervisor/internal/triggers/timetrigger.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/modyn/supervisor/internal/triggers/amounttrigger.py b/modyn/supervisor/internal/triggers/amounttrigger.py index 972be0d87..76ea28b14 100644 --- a/modyn/supervisor/internal/triggers/amounttrigger.py +++ b/modyn/supervisor/internal/triggers/amounttrigger.py @@ -1,4 +1,3 @@ -import pathlib from typing import Generator from modyn.supervisor.internal.triggers.trigger import Trigger @@ -17,7 +16,6 @@ def __init__(self, trigger_config: dict): super().__init__(trigger_config) - def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: assert self.remaining_data_points < self.data_points_for_trigger, "Inconsistent remaining datapoints" diff --git a/modyn/supervisor/internal/triggers/timetrigger.py b/modyn/supervisor/internal/triggers/timetrigger.py index 422854fc3..ee74f03d9 100644 --- a/modyn/supervisor/internal/triggers/timetrigger.py +++ b/modyn/supervisor/internal/triggers/timetrigger.py @@ -1,4 +1,3 @@ -import pathlib from typing import Generator, Optional from modyn.supervisor.internal.triggers.trigger import Trigger @@ -26,7 +25,6 @@ def __init__(self, trigger_config: dict): super().__init__(trigger_config) - def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]: if self.next_trigger_at is None: if len(new_data) > 0: From 4e9f072618550518c5a05dbf0c2bde03887e7074 Mon Sep 17 00:00:00 2001 From: Jingyi Zhu Date: Tue, 14 May 2024 21:03:06 +0200 Subject: [PATCH 32/32] Address change requests: remove local var num_triggers in handle_triggers_within_batch. 
Add back persist_pipeline_log() --- .../internal/pipeline_executor/pipeline_executor.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index 05d4b3292..3f7d59cc0 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -280,13 +280,11 @@ def _handle_triggers_within_batch( previous_trigger_idx = 0 logger.info("Handling triggers within batch.") self._update_pipeline_stage_and_enqueue_msg(PipelineStage.HANDLE_TRIGGERS_WITHIN_BATCH, MsgType.GENERAL) - num_triggers = 0 triggering_idx_list = [] for i, triggering_idx in enumerate(triggering_indices): triggering_idx_list.append(triggering_idx) - num_triggers += 1 self._update_pipeline_stage_and_enqueue_msg(PipelineStage.INFORM_SELECTOR_AND_TRIGGER, MsgType.GENERAL) triggering_data = batch[previous_trigger_idx : triggering_idx + 1] previous_trigger_idx = triggering_idx + 1 @@ -317,10 +315,11 @@ def _handle_triggers_within_batch( ) else: logger.info(f"Skipping training on empty trigger {trigger_id}]") + self._persist_pipeline_log() self.num_triggers = self.num_triggers + 1 if self.maximum_triggers is not None and self.num_triggers >= self.maximum_triggers: - return num_triggers + return len(triggering_idx_list) # we have to inform the Selector about the remaining data in this batch. if len(triggering_idx_list) == 0: @@ -346,7 +345,7 @@ def _handle_triggers_within_batch( else: self.remaining_data_range = None - return num_triggers + return len(triggering_idx_list) def _init_evaluation_writer(self, name: str, trigger_id: int) -> LogResultWriter: return self.supervisor_supported_eval_result_writers[name](self.pipeline_id, trigger_id, self.eval_directory)
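
Supplementary sketches for the series. Patches 24 and 30-31 settle the Trigger.inform contract as a generator of triggering indices rather than a returned list, so the pipeline executor can react to each trigger as soon as it is detected instead of waiting for the whole batch to be scanned. A minimal, self-contained sketch of that contract, using a hypothetical toy trigger rather than any of Modyn's triggers:

    from typing import Generator


    class EveryNthTrigger:
        """Toy trigger for illustration only: fires on every n-th data point."""

        def __init__(self, n: int) -> None:
            self.n = n
            self.seen = 0

        def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]:
            for idx, _ in enumerate(new_data):
                self.seen += 1
                if self.seen % self.n == 0:
                    yield idx  # lazily emit the batch index that fired


    trigger = EveryNthTrigger(2)
    batch = [(1, 10, 0), (2, 11, 0), (3, 12, 0), (4, 13, 0)]
    for triggering_idx in trigger.inform(batch):
        print(f"trigger at batch index {triggering_idx}")  # prints 1, then 3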
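
The unittest adjustment in patch 24 follows from that interface change: a method expected to return a generator is better mocked with a generator function as side_effect than with return_value=[], because the caller then receives a real generator object. A standalone sketch of the pattern, with illustrative names rather than the actual test fixtures:

    from unittest.mock import MagicMock


    def no_triggers(_batch):
        # side_effect receives the mocked call's arguments; a generator
        # function returns a generator object, which the mock passes through
        yield from []


    inform_mock = MagicMock(side_effect=no_triggers)
    result = inform_mock([(10, 1), (11, 2)])
    assert list(result) == []  # the generator yields nothing
    inform_mock.assert_called_once_with([(10, 1), (11, 2)])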
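
Patch 27's get_embeddings depends on the wrapped model recording its intermediate embeddings (hence the assertion that the model is a CoresetSupportingModule). A self-contained toy stand-in for that recording mechanism; RecordingModel and its attributes are assumptions made for illustration, not Modyn APIs:

    import torch
    from torch import nn


    class RecordingModel(nn.Module):
        """Toy model that can capture the activations feeding its head."""

        def __init__(self) -> None:
            super().__init__()
            self.encoder = nn.Linear(4, 8)  # produces the "embedding"
            self.head = nn.Linear(8, 2)     # task head on top of it
            self.recording = False
            self.embedding = None

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            emb = self.encoder(x)
            if self.recording:
                self.embedding = emb  # keep the last embedding batch around
            return self.head(emb)


    model = RecordingModel()
    model.eval()
    model.recording = True
    collected = []
    with torch.no_grad():
        for batch in [torch.randn(3, 4), torch.randn(3, 4)]:
            model(batch)
            collected.append(model.embedding)
    embeddings = torch.cat(collected, 0)  # shape (6, 8), ready for drift analysis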
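
The drift decision itself is delegated to Evidently, as the imports in datadrifttrigger.py and utils.py show. A hedged sketch of a minimal embeddings-drift check against that library's Report API; the drift method, its defaults, and the random data are placeholder assumptions, not the pipeline's actual configuration:

    import numpy as np
    import pandas as pd
    from evidently import ColumnMapping
    from evidently.metrics import EmbeddingsDriftMetric
    from evidently.metrics.data_drift import embedding_drift_methods
    from evidently.report import Report

    cols = [f"col_{i}" for i in range(8)]
    reference_df = pd.DataFrame(np.random.randn(64, 8), columns=cols)
    current_df = pd.DataFrame(np.random.randn(64, 8) + 2.0, columns=cols)  # shifted

    mapping = ColumnMapping(embeddings={"embedding": cols})
    report = Report(
        metrics=[EmbeddingsDriftMetric("embedding", drift_method=embedding_drift_methods.model())]
    )
    report.run(reference_data=reference_df, current_data=current_df, column_mapping=mapping)
    drift_detected = report.as_dict()["metrics"][0]["result"]["drift_detected"]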