Add DataDriftTrigger: support one Evidently metric #409

Merged
36 commits
7293f7a
Update Trigger interface. Add DataDriftTrigger initialization and inf…
jenny011 Apr 27, 2024
b02214c
Update Trigger init_trigger() explanation.
jenny011 Apr 27, 2024
a17b1cb
Add Trigger dataset and dataloader utils
jenny011 Apr 27, 2024
0627a26
Modify pipeline_executor handle new data logic for DataDriftTrigger
jenny011 Apr 27, 2024
90d89db
Update interface to inform trigger id, data points and model id
jenny011 Apr 27, 2024
272a645
Add model downloader and embedding computation in DataDriftTrigger
jenny011 Apr 27, 2024
07507e2
Add DataDriftTrigger to triggers __init__
jenny011 Apr 27, 2024
8e99990
Fix trigger config parser and model downloader in DataDriftTrigger
jenny011 Apr 28, 2024
98bed24
Fix linting
jenny011 Apr 28, 2024
7c1b0bd
Add evidently config parser and data drift detection
jenny011 Apr 30, 2024
f0de5cd
Merge branch 'main' into JingyiZhu/feature/#366-data-drift-trigger-wi…
jenny011 Apr 30, 2024
7ffee8a
Add example pipeline configs using DataDriftTrigger
jenny011 Apr 30, 2024
d4d46d1
Fix linting
jenny011 Apr 30, 2024
d0bb0c9
Fix mypy typing
jenny011 Apr 30, 2024
ca0e1c0
Add evidently to known-third-party for github checks
jenny011 Apr 30, 2024
038bfc3
Revert "Add evidently to known-third-party for github checks"
jenny011 Apr 30, 2024
0c87fc4
Add evidently to environment
jenny011 Apr 30, 2024
f094d23
Minor fixes to datadrifttrigger. Add datadrifttrigger unit tests
jenny011 May 13, 2024
9d59f84
Fix linting
jenny011 May 13, 2024
a6dcf68
Address comments. Add docstrings
jenny011 May 13, 2024
71c9f7d
Merge branch 'main' into JingyiZhu/feature/#366-data-drift-trigger-wi…
jenny011 May 13, 2024
1fb4042
Fix merge bugs
jenny011 May 13, 2024
d4ce734
Fix pipeline_executor
jenny011 May 13, 2024
f3532f4
Fix mypy and unittest
jenny011 May 13, 2024
8126398
Fix mypy
jenny011 May 13, 2024
d8d4c5a
Fix unittest
jenny011 May 14, 2024
5e1e984
Fix mypy
jenny011 May 14, 2024
55797c1
Fix linting
jenny011 May 14, 2024
16d0780
Add embedding encoder utils. Change model downloader to embedding enc…
jenny011 May 14, 2024
f25f77f
Add docstrings.
jenny011 May 14, 2024
3dee498
Fix linting
jenny011 May 14, 2024
cfe7a05
make init_trigger non-abstract
jenny011 May 14, 2024
03cc675
Fix linting
jenny011 May 14, 2024
e86de77
Merge branch 'main' into JingyiZhu/feature/#366-data-drift-trigger-wi…
jenny011 May 14, 2024
4b509fa
Merge branch 'main' into JingyiZhu/feature/#366-data-drift-trigger-wi…
jenny011 May 14, 2024
4e9f072
Address change requests: remove local var num_triggers in handle_trig…
jenny011 May 14, 2024
@@ -0,0 +1,71 @@
pipeline:
  name: ArXiv dataset Test Pipeline
  description: Example pipeline
  version: 1.0.0
model:
  id: ArticleNet
  config:
    num_classes: 172
model_storage:
  full_model_strategy:
    name: "PyTorchFullModel"
training:
  gpus: 1
  device: "cuda:0"
  dataloader_workers: 2
  use_previous_model: True
  initial_model: random
  batch_size: 96
  optimizers:
    - name: "default"
      algorithm: "SGD"
      source: "PyTorch"
      param_groups:
        - module: "model"
          config:
            lr: 0.00002
            momentum: 0.9
            weight_decay: 0.01
  optimization_criterion:
    name: "CrossEntropyLoss"
  checkpointing:
    activated: False
  selection_strategy:
    name: NewDataStrategy
    maximum_keys_in_memory: 10000
    config:
      storage_backend: "database"
      limit: -1
      reset_after_trigger: True
  seed: 42
  epochs_per_trigger: 1
data:
  dataset_id: arxiv_train
  bytes_parser_function: |
    def bytes_parser_function(data: bytes) -> str:
      return str(data, "utf8")
  tokenizer: DistilBertTokenizerTransform

trigger:
  id: DataDriftTrigger
  trigger_config:
    data_points_for_detection: 100000
    sample_size: 5000

evaluation:
  device: "cuda:0"
  result_writers: ["json"]
  datasets:
    - dataset_id: arxiv_test
      bytes_parser_function: |
        def bytes_parser_function(data: bytes) -> str:
          return str(data, "utf8")
      tokenizer: DistilBertTokenizerTransform
      batch_size: 96
      dataloader_workers: 2
      metrics:
        - name: "Accuracy"
          evaluation_transformer_function: |
            import torch
            def evaluation_transformer_function(model_output: torch.Tensor) -> torch.Tensor:
              return torch.argmax(model_output, dim=-1)
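For readers new to the DataDriftTrigger, the sketch below illustrates one plausible reading of the two fields in trigger_config above: the trigger accumulates incoming (key, timestamp, label) tuples, and once data_points_for_detection points have been seen it runs a drift check, optionally on a random subsample of sample_size points to bound the cost. The class and its _detect_drift helper are hypothetical and inferred only from the field names; they are not the implementation added in this PR.

import random
from typing import Generator, Optional


class DriftTriggerSketch:
    # Hypothetical interpretation of data_points_for_detection / sample_size.
    def __init__(self, trigger_config: dict) -> None:
        self.data_points_for_detection: int = trigger_config["data_points_for_detection"]
        self.sample_size: Optional[int] = trigger_config.get("sample_size")  # None -> use all points
        self._window: list[tuple[int, int, int]] = []

    def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]:
        for idx, point in enumerate(new_data):
            self._window.append(point)
            if len(self._window) >= self.data_points_for_detection:
                sample = self._window
                if self.sample_size is not None and self.sample_size < len(self._window):
                    sample = random.sample(self._window, self.sample_size)
                if self._detect_drift(sample):
                    yield idx
                self._window = []

    def _detect_drift(self, sample: list[tuple[int, int, int]]) -> bool:
        raise NotImplementedError  # stand-in for embedding computation plus an Evidently drift check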
@@ -0,0 +1,73 @@
pipeline:
  name: Huffpost dataset Test Pipeline
  description: Example pipeline
  version: 1.0.0
model:
  id: ArticleNet
  config:
    num_classes: 55
model_storage:
  full_model_strategy:
    name: "PyTorchFullModel"
training:
  gpus: 1
  device: "cuda:0"
  dataloader_workers: 2
  use_previous_model: True
  initial_model: random
  batch_size: 64
  optimizers:
    - name: "default"
      algorithm: "SGD"
      source: "PyTorch"
      param_groups:
        - module: "model"
          config:
            lr: 0.00002
            momentum: 0.9
            weight_decay: 0.01
  optimization_criterion:
    name: "CrossEntropyLoss"
  checkpointing:
    activated: False
  selection_strategy:
    name: NewDataStrategy
    maximum_keys_in_memory: 1000
    config:
      storage_backend: "database"
      limit: -1
      reset_after_trigger: True
  seed: 42
  epochs_per_trigger: 1
data:
  dataset_id: huffpost_train
  bytes_parser_function: |
    def bytes_parser_function(data: bytes) -> str:
      return str(data, "utf8")
  tokenizer: DistilBertTokenizerTransform

trigger:
  id: DataDriftTrigger
  trigger_config:
    data_points_for_detection: 5000
    metric_name: mmd
    metric_config:
      bootstrap: False

evaluation:
  device: "cuda:0"
  result_writers: ["json"]
  datasets:
    - dataset_id: huffpost_test
      bytes_parser_function: |
        def bytes_parser_function(data: bytes) -> str:
          return str(data, "utf8")
      tokenizer: DistilBertTokenizerTransform
      batch_size: 64
      dataloader_workers: 2
      metrics:
        - name: "Accuracy"
          evaluation_transformer_function: |
            import torch
            def evaluation_transformer_function(model_output: torch.Tensor) -> torch.Tensor:
              return torch.argmax(model_output, dim=-1)
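This pipeline selects metric_name: mmd with bootstrap: False, which points at Evidently's MMD-based embedding-drift method. Below is a minimal, self-contained sketch of evaluating that metric with Evidently's report API; the DataFrame construction, column names, and the way metric_config is forwarded are assumptions, not the wiring used by the DataDriftTrigger in this PR, and Evidently's exact signatures may differ between versions.

import numpy as np
import pandas as pd
from evidently import ColumnMapping
from evidently.metrics import EmbeddingsDriftMetric
from evidently.metrics.data_drift.embedding_drift_methods import mmd
from evidently.report import Report

# Stand-in embeddings: one row per sample, one column per embedding dimension.
dim = 16
reference_df = pd.DataFrame(np.random.randn(500, dim), columns=[f"dim_{i}" for i in range(dim)])
current_df = pd.DataFrame(np.random.randn(500, dim) + 0.5, columns=[f"dim_{i}" for i in range(dim)])

column_mapping = ColumnMapping(embeddings={"embedding": reference_df.columns.tolist()})

report = Report(metrics=[EmbeddingsDriftMetric("embedding", drift_method=mmd(bootstrap=False))])
report.run(reference_data=reference_df, current_data=current_df, column_mapping=column_mapping)

result = report.as_dict()["metrics"][0]["result"]
print(result["drift_detected"])  # a True here is what would cause a retraining trigger

The yearbook pipeline below uses metric_name: model with threshold: 0.7, which would map the same way onto Evidently's model-based drift method (a domain classifier separating reference from current embeddings), assuming metric_config is forwarded as keyword arguments.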
@@ -0,0 +1,76 @@
pipeline:
  name: Yearbook Test Pipeline
  description: Example pipeline
  version: 1.0.0
model:
  id: YearbookNet
  config:
    num_input_channels: 1
    num_classes: 2
model_storage:
  full_model_strategy:
    name: "PyTorchFullModel"
training:
  gpus: 1
  device: "cuda:0"
  dataloader_workers: 2
  use_previous_model: True
  initial_model: random
  batch_size: 64
  optimizers:
    - name: "default"
      algorithm: "SGD"
      source: "PyTorch"
      param_groups:
        - module: "model"
          config:
            lr: 0.001
            momentum: 0.9
  optimization_criterion:
    name: "CrossEntropyLoss"
  checkpointing:
    activated: False
  selection_strategy:
    name: NewDataStrategy
    maximum_keys_in_memory: 1000
    config:
      storage_backend: "database"
      limit: -1
      reset_after_trigger: True
  seed: 42
  epochs_per_trigger: 1
data:
  dataset_id: yearbook_train
  transformations: []
  bytes_parser_function: |
    import torch
    import numpy as np
    def bytes_parser_function(data: bytes) -> torch.Tensor:
      return torch.from_numpy(np.frombuffer(data, dtype=np.float32)).reshape(1, 32, 32)

trigger:
  id: DataDriftTrigger
  trigger_config:
    data_points_for_detection: 1000
    metric_name: model
    metric_config:
      threshold: 0.7

evaluation:
  device: "cuda:0"
  result_writers: ["json"]
  datasets:
    - dataset_id: yearbook_test
      bytes_parser_function: |
        import torch
        import numpy as np
        def bytes_parser_function(data: bytes) -> torch.Tensor:
          return torch.from_numpy(np.frombuffer(data, dtype=np.float32)).reshape(1, 32, 32)
      batch_size: 64
      dataloader_workers: 2
      metrics:
        - name: "Accuracy"
          evaluation_transformer_function: |
            import torch
            def evaluation_transformer_function(model_output: torch.Tensor) -> torch.Tensor:
              return torch.argmax(model_output, dim=-1)
1 change: 1 addition & 0 deletions environment.yml
@@ -22,6 +22,7 @@ dependencies:
- grpcio>=1.63
- protobuf==5.26.*
- types-protobuf==5.26.*
- evidently
- jsonschema
- psycopg2
- sqlalchemy>=2.0
2 changes: 0 additions & 2 deletions modyn/config/schema/pipeline.py
@@ -164,7 +164,6 @@ class _BaseSelectionStrategyConfig(BaseModel):


class FreshnessSamplingStrategyConfig(_BaseSelectionStrategyConfig):

unused_data_ratio: float = Field(
0.0,
description=(
@@ -175,7 +174,6 @@ class FreshnessSamplingStrategyConfig(_BaseSelectionStrategyConfig):


class NewDataSelectionStrategyConfig(_BaseSelectionStrategyConfig):

limit_reset: LimitResetStrategy = Field(
description=(
"Strategy to follow for respecting the limit in case of reset. Only used when reset_after_trigger == true."
32 changes: 22 additions & 10 deletions modyn/supervisor/internal/pipeline_executor/pipeline_executor.py
@@ -6,7 +6,7 @@
import sys
import traceback
from time import sleep
from typing import Any, Optional
from typing import Any, Generator, Optional

from modyn.common.benchmark import Stopwatch
from modyn.supervisor.internal.evaluation_result_writer import LogResultWriter
@@ -113,6 +113,9 @@ def _setup_trigger(self) -> None:

trigger_module = dynamic_module_import("modyn.supervisor.internal.triggers")
self.trigger: Trigger = getattr(trigger_module, trigger_id)(trigger_config)
self.trigger.init_trigger(self.pipeline_id, self.pipeline_config, self.modyn_config, self.eval_directory)
if self.previous_model_id is not None:
self.trigger.inform_previous_model(self.previous_model_id)

assert self.trigger is not None, "Error during trigger initialization"
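Judging only from the calls added in this diff (init_trigger and inform_previous_model here, plus inform_previous_trigger_and_data_points and a generator-returning inform further down), the trigger base class now looks roughly like the sketch below. This is a reconstruction for readability with assumed parameter types; see modyn/supervisor/internal/triggers/trigger.py for the actual definitions.

import pathlib
from typing import Generator


class TriggerSketch:
    def __init__(self, trigger_config: dict) -> None:
        self.trigger_config = trigger_config

    def init_trigger(self, pipeline_id: int, pipeline_config: dict, modyn_config: dict, base_dir: pathlib.Path) -> None:
        # Optional setup hook; per the commit history it is non-abstract so simple triggers can ignore it.
        pass

    def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]:
        # Yield indices into new_data at which the pipeline should trigger.
        raise NotImplementedError

    def inform_previous_model(self, previous_model_id: int) -> None:
        # Called after each stored training so a drift trigger can embed data with the latest model.
        pass

    def inform_previous_trigger_and_data_points(self, previous_trigger_id: int, data_points: int) -> None:
        # Called for each trigger that produced samples, so the trigger can track its reference window.
        pass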

@@ -177,15 +180,15 @@ def _handle_new_data(self, new_data: list[tuple[int, int, int]]) -> bool:

def _handle_new_data_batch(self, batch: list[tuple[int, int, int]]) -> bool:
self._sw.start("trigger_inform", overwrite=True)
triggering_indices = self.trigger.inform(batch)
num_triggers = len(triggering_indices)
self.pipeline_log["supervisor"]["num_triggers"] += len(triggering_indices)
triggering_indices: Generator[int, None, None] = self.trigger.inform(batch)
num_triggers = self._handle_triggers_within_batch(batch, triggering_indices)

logger.info(f"There are {num_triggers} triggers in this batch.")
self.pipeline_log["supervisor"]["num_triggers"] += num_triggers
self.pipeline_log["supervisor"]["trigger_batch_times"].append(
{"batch_size": len(batch), "time": self._sw.stop("trigger_inform"), "num_triggers": num_triggers}
)

logger.info(f"There are {num_triggers} triggers in this batch.")
self._handle_triggers_within_batch(batch, triggering_indices)
return num_triggers > 0

def _run_training(self, trigger_id: int) -> None:
@@ -223,6 +226,7 @@ def _run_training(self, trigger_id: int) -> None:
# We store the trained model for evaluation in any case.
self._sw.start("store_trained_model", overwrite=True)
model_id = self.grpc.store_trained_model(self.current_training_id)
self.trigger.inform_previous_model(model_id)
self.pipeline_log["supervisor"]["triggers"][trigger_id]["store_trained_model_time"] = self._sw.stop()

# Only if the pipeline actually wants to continue the training on it, we set previous model.
@@ -270,12 +274,17 @@ def _get_trigger_timespan(

return first_timestamp, last_timestamp

def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], triggering_indices: list[int]) -> None:
def _handle_triggers_within_batch(
self, batch: list[tuple[int, int, int]], triggering_indices: Generator[int, None, None]
) -> int:
previous_trigger_idx = 0
logger.info("Handling triggers within batch.")
self._update_pipeline_stage_and_enqueue_msg(PipelineStage.HANDLE_TRIGGERS_WITHIN_BATCH, MsgType.GENERAL)

triggering_idx_list = []

for i, triggering_idx in enumerate(triggering_indices):
triggering_idx_list.append(triggering_idx)
self._update_pipeline_stage_and_enqueue_msg(PipelineStage.INFORM_SELECTOR_AND_TRIGGER, MsgType.GENERAL)
triggering_data = batch[previous_trigger_idx : triggering_idx + 1]
previous_trigger_idx = triggering_idx + 1
@@ -294,6 +303,7 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg

num_samples_in_trigger = self.grpc.get_number_of_samples(self.pipeline_id, trigger_id)
if num_samples_in_trigger > 0:
self.trigger.inform_previous_trigger_and_data_points(trigger_id, num_samples_in_trigger)
first_timestamp, last_timestamp = self._get_trigger_timespan(i == 0, triggering_data)
self.pipeline_log["supervisor"]["triggers"][trigger_id]["first_timestamp"] = first_timestamp
self.pipeline_log["supervisor"]["triggers"][trigger_id]["last_timestamp"] = last_timestamp
@@ -309,13 +319,13 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg

self.num_triggers = self.num_triggers + 1
if self.maximum_triggers is not None and self.num_triggers >= self.maximum_triggers:
break
return len(triggering_idx_list)

# we have to inform the Selector about the remaining data in this batch.
if len(triggering_indices) == 0:
if len(triggering_idx_list) == 0:
remaining_data = batch
else:
remaining_data = batch[triggering_indices[-1] + 1 :]
remaining_data = batch[triggering_idx_list[-1] + 1 :]

logger.info(f"There are {len(remaining_data)} data points remaining after the trigger.")
if len(remaining_data) > 0:
@@ -335,6 +345,8 @@ def _handle_triggers_within_batch(self, batch: list[tuple[int, int, int]], trigg
else:
self.remaining_data_range = None

return len(triggering_idx_list)

def _init_evaluation_writer(self, name: str, trigger_id: int) -> LogResultWriter:
return self.supervisor_supported_eval_result_writers[name](self.pipeline_id, trigger_id, self.eval_directory)

1 change: 1 addition & 0 deletions modyn/supervisor/internal/triggers/__init__.py
@@ -5,6 +5,7 @@
import os

from .amounttrigger import DataAmountTrigger # noqa: F401
from .datadrifttrigger import DataDriftTrigger # noqa: F401
from .timetrigger import TimeTrigger # noqa: F401
from .trigger import Trigger # noqa: F401

6 changes: 4 additions & 2 deletions modyn/supervisor/internal/triggers/amounttrigger.py
@@ -1,3 +1,5 @@
from typing import Generator

from modyn.supervisor.internal.triggers.trigger import Trigger


@@ -14,12 +16,12 @@ def __init__(self, trigger_config: dict):

super().__init__(trigger_config)

def inform(self, new_data: list[tuple[int, int, int]]) -> list[int]:
def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]:
assert self.remaining_data_points < self.data_points_for_trigger, "Inconsistent remaining datapoints"

first_idx = self.data_points_for_trigger - self.remaining_data_points - 1
triggering_indices = list(range(first_idx, len(new_data), self.data_points_for_trigger))

self.remaining_data_points = (self.remaining_data_points + len(new_data)) % self.data_points_for_trigger

return triggering_indices
yield from triggering_indices
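Switching inform() from returning a list to yielding indices matters most for the DataDriftTrigger: evaluating each candidate index is expensive (embedding computation plus an Evidently report), and a generator lets the executor stop consuming indices as soon as maximum_triggers is reached, so later candidates are never evaluated. A small, self-contained illustration of that effect (expensive_drift_check is a hypothetical stand-in, not code from this PR):

from typing import Generator, Iterable


def expensive_drift_check(window: list[int]) -> bool:
    # Stand-in for embedding computation plus an Evidently drift report.
    return sum(window) % 2 == 0


def lazy_inform(batch: list[int], every: int = 100) -> Generator[int, None, None]:
    # Nothing below runs until the consumer asks for the next index.
    for idx in range(every - 1, len(batch), every):
        if expensive_drift_check(batch[: idx + 1]):
            yield idx


def handle_triggers(batch: list[int], indices: Iterable[int], maximum_triggers: int) -> int:
    fired = 0
    for _ in indices:
        fired += 1
        if fired >= maximum_triggers:
            break  # remaining candidates are never even checked
    return fired


batch = list(range(1000))
print(handle_triggers(batch, lazy_inform(batch), maximum_triggers=2))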