ecmwf · HCookie · Sep 24, 2024 · Sep 24, 2024 · Sep 24, 2024 · Sep 25, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,9 @@ Keep it human-readable, your future self will thank you!
 - Feature: New `Boolean1DMask` class. Enables rollout training for limited area models. [#79](https://github.com/ecmwf/anemoi-training/pulls/79)
 
 ### Fixed
+- Refactored callbacks. [#60](https://github.com/ecmwf/anemoi-training/pulls/60)
+- Refactored rollout. [#87](https://github.com/ecmwf/anemoi-training/pulls/87)
+    - Enable a longer rollout for validation than for training
 - Mlflow-sync to handle creation of new experiments in the remote server [#83] (https://github.com/ecmwf/anemoi-training/pull/83)
 - ci: fix pyshtools install error (#100) https://github.com/ecmwf/anemoi-training/pull/100
 

diff --git a/docs/modules/diagnostics.rst b/docs/modules/diagnostics.rst
@@ -21,18 +21,17 @@ functionality to use both Weights & Biases and Tensorboard.
 
 The callbacks can also be used to evaluate forecasts over longer
 rollouts beyond the forecast time that the model is trained on. The
-number of rollout steps (or forecast iteration steps) is set using
-``config.eval.rollout = *num_of_rollout_steps*``.
+number of rollout steps for verification (or forecast iteration steps)
+is set using ``config.dataloader.validation_rollout =
+*num_of_rollout_steps*``.
 
 Note the user has the option to evaluate the callbacks asynchronously
 (using the following config option
 ``config.diagnostics.plot.asynchronous``, which means that the model
 training doesn't stop whilst the callbacks are being evaluated).
-However, note that callbacks can still be slow, and therefore the
-plotting callbacks can be switched off by setting
-``config.diagnostics.plot.enabled`` to ``False`` or all the callbacks
-can be completely switched off by setting
-``config.diagnostics.eval.enabled`` to ``False``.
+Callbacks are configured in the config file under the
+``config.diagnostics.callbacks`` key, and plotting callbacks under the
+``config.diagnostics.plot.callbacks`` key.
 
 Below is the documentation for the default callbacks provided, but it is
 also possible for users to add callbacks using the same structure:

diff --git a/src/anemoi/training/config/config.yaml b/src/anemoi/training/config/config.yaml
@@ -1,7 +1,7 @@
 defaults:
 - data: zarr
 - dataloader: native_grid
-- diagnostics: eval_rollout
+- diagnostics: evaluation
 - hardware: example
 - graph: multi_scale
 - model: gnn

diff --git a/src/anemoi/training/config/dataloader/native_grid.yaml b/src/anemoi/training/config/dataloader/native_grid.yaml
@@ -45,6 +45,8 @@ training:
   frequency: ${data.frequency}
   drop:  []
 
+validation_rollout: 1 # number of rollout steps to use for validation; must be greater than or equal to the rollout expected by callbacks
+
 validation:
   dataset: ${dataloader.dataset}
   start: 2021

diff --git a/src/anemoi/training/config/diagnostics/callbacks/pretraining.yaml b/src/anemoi/training/config/diagnostics/callbacks/pretraining.yaml
@@ -0,0 +1 @@
+# Add callbacks here
diff --git a/src/anemoi/training/config/diagnostics/callbacks/rollout_eval.yaml b/src/anemoi/training/config/diagnostics/callbacks/rollout_eval.yaml
@@ -0,0 +1,4 @@
+# Add callbacks here
+- _target_: anemoi.training.diagnostics.callbacks.evaluation.RolloutEval
+  rollout: ${dataloader.validation_rollout}
+  frequency: 20
diff --git a/...ning/config/diagnostics/eval_rollout.yaml → ...aining/config/diagnostics/evaluation.yaml b/...ning/config/diagnostics/eval_rollout.yaml → ...aining/config/diagnostics/evaluation.yaml
@@ -1,53 +1,8 @@
 ---
-eval:
-  enabled: False
-  # use this to evaluate the model over longer rollouts, every so many validation batches
-  rollout: 12
-  frequency: 20
-plot:
-  enabled: True
-  asynchronous: True
-  frequency: 750
-  sample_idx: 0
-  per_sample: 6
-  parameters:
-  - z_500
-  - t_850
-  - u_850
-  - v_850
-  - 2t
-  - 10u
-  - 10v
-  - sp
-  - tp
-  - cp
-  #Defining the accumulation levels for precipitation related fields and the colormap
-  accumulation_levels_plot: [0, 0.05, 0.1, 0.25, 0.5, 1, 1.5, 2, 3, 4, 5, 6, 7, 100] # in mm
-  cmap_accumulation: ["#ffffff", "#04e9e7", "#019ff4", "#0300f4", "#02fd02", "#01c501", "#008e00", "#fdf802", "#e5bc00", "#fd9500", "#fd0000", "#d40000", "#bc0000", "#f800fd"]
-  precip_and_related_fields: [tp, cp]
-  # Histogram and Spectrum plots
-  parameters_histogram:
-  - z_500
-  - tp
-  - 2t
-  - 10u
-  - 10v
-  parameters_spectrum:
-  - z_500
-  - tp
-  - 2t
-  - 10u
-  - 10v
-  # group parameters by categories when visualizing contributions to the loss
-  # one-parameter groups are possible to highlight individual parameters
-  parameter_groups:
-    moisture: [tp, cp, tcw]
-    sfc_wind: [10u, 10v]
-  learned_features: False
-  longrollout:
-    enabled: False
-    rollout: [60]
-    frequency: 20 # every X epochs
+defaults:
+  - plot: detailed
+  - callbacks: pretraining
+
 
 debug:
   # this will detect and trace back NaNs / Infs etc. but will slow down training
@@ -57,6 +12,7 @@ debug:
 # remember to also activate the tensorboard logger (below)
 profiler: False
 
+enable_checkpointing: True
 checkpoint:
   every_n_minutes:
     save_frequency: 30 # Approximate, as this is checked at the end of training steps

diff --git a/src/anemoi/training/config/diagnostics/plot/detailed.yaml b/src/anemoi/training/config/diagnostics/plot/detailed.yaml
@@ -0,0 +1,67 @@
+asynchronous: True # Whether to plot asynchronously
+frequency: # Frequency of the plotting
+  batch: 750
+  epoch: 5
+
+# Parameters to plot
+parameters:
+- z_500
+- t_850
+- u_850
+- v_850
+- 2t
+- 10u
+- 10v
+- sp
+- tp
+- cp
+
+# Sample index
+sample_idx: 0
+
+# Precipitation and related fields
+precip_and_related_fields: [tp, cp]
+
+callbacks:
+  # Add plot callbacks here
+  - _target_: anemoi.training.diagnostics.callbacks.plot.GraphNodeTrainableFeaturesPlot
+  - _target_: anemoi.training.diagnostics.callbacks.plot.GraphEdgeTrainableFeaturesPlot
+    epoch_frequency: 5
+  - _target_: anemoi.training.diagnostics.callbacks.plot.PlotLoss
+    # group parameters by categories when visualizing contributions to the loss
+    # one-parameter groups are possible to highlight individual parameters
+    parameter_groups:
+      moisture: [tp, cp, tcw]
+      sfc_wind: [10u, 10v]
+  - _target_: anemoi.training.diagnostics.callbacks.plot.PlotSample
+    sample_idx: ${diagnostics.plot.sample_idx}
+    per_sample : 6
+    parameters: ${diagnostics.plot.parameters}
+    #Defining the accumulation levels for precipitation related fields and the colormap
+    accumulation_levels_plot: [0, 0.05, 0.1, 0.25, 0.5, 1, 1.5, 2, 3, 4, 5, 6, 7, 100] # in mm
+    cmap_accumulation: ["#ffffff", "#04e9e7", "#019ff4", "#0300f4", "#02fd02", "#01c501", "#008e00", "#fdf802", "#e5bc00", "#fd9500", "#fd0000", "#d40000", "#bc0000", "#f800fd"]
+    precip_and_related_fields: ${diagnostics.plot.precip_and_related_fields}
+
+  - _target_: anemoi.training.diagnostics.callbacks.plot.PlotSpectrum
+    # batch_frequency: 100 # Override for batch frequency
+    sample_idx: ${diagnostics.plot.sample_idx}
+    precip_and_related_fields: ${diagnostics.plot.precip_and_related_fields}
+    parameters:
+    - z_500
+    - tp
+    - 2t
+    - 10u
+    - 10v
+  - _target_: anemoi.training.diagnostics.callbacks.plot.PlotHistogram
+    sample_idx: ${diagnostics.plot.sample_idx}
+    precip_and_related_fields: ${diagnostics.plot.precip_and_related_fields}
+    parameters:
+    - z_500
+    - tp
+    - 2t
+    - 10u
+    - 10v
+  - _target_:  anemoi.training.diagnostics.callbacks.plot.LongRolloutPlots
+    rollout:
+      - ${dataloader.validation_rollout}
+    epoch_frequency: 20
diff --git a/src/anemoi/training/config/diagnostics/plot/none.yaml b/src/anemoi/training/config/diagnostics/plot/none.yaml
@@ -0,0 +1 @@
+callbacks: []
diff --git a/src/anemoi/training/config/diagnostics/plot/simple.yaml b/src/anemoi/training/config/diagnostics/plot/simple.yaml
@@ -0,0 +1,40 @@
+asynchronous: True # Whether to plot asynchronously
+frequency: # Frequency of the plotting
+  batch: 750
+  epoch: 10
+
+# Parameters to plot
+parameters:
+- z_500
+- t_850
+- u_850
+- v_850
+- 2t
+- 10u
+- 10v
+- sp
+- tp
+- cp
+
+# Sample index
+sample_idx: 0
+
+# Precipitation and related fields
+precip_and_related_fields: [tp, cp]
+
+callbacks:
+  # Add plot callbacks here
+  - _target_: anemoi.training.diagnostics.callbacks.plot.PlotLoss
+    # group parameters by categories when visualizing contributions to the loss
+    # one-parameter groups are possible to highlight individual parameters
+    parameter_groups:
+      moisture: [tp, cp, tcw]
+      sfc_wind: [10u, 10v]
+  - _target_: anemoi.training.diagnostics.callbacks.plot.PlotSample
+    sample_idx: ${diagnostics.plot.sample_idx}
+    per_sample : 6
+    parameters: ${diagnostics.plot.parameters}
+    #Defining the accumulation levels for precipitation related fields and the colormap
+    accumulation_levels_plot: [0, 0.05, 0.1, 0.25, 0.5, 1, 1.5, 2, 3, 4, 5, 6, 7, 100] # in mm
+    cmap_accumulation: ["#ffffff", "#04e9e7", "#019ff4", "#0300f4", "#02fd02", "#01c501", "#008e00", "#fdf802", "#e5bc00", "#fd9500", "#fd0000", "#d40000", "#bc0000", "#f800fd"]
+    precip_and_related_fields: ${diagnostics.plot.precip_and_related_fields}
diff --git a/src/anemoi/training/data/datamodule.py b/src/anemoi/training/data/datamodule.py
@@ -126,10 +126,8 @@ def ds_train(self) -> NativeGridDataset:
     @cached_property
     def ds_valid(self) -> NativeGridDataset:
         r = self.rollout
-        if self.config.diagnostics.eval.enabled:
-            r = max(r, self.config.diagnostics.eval.rollout)
-        if self.config.diagnostics.plot.get("longrollout") and self.config.diagnostics.plot.longrollout.enabled:
-            r = max(r, max(self.config.diagnostics.plot.longrollout.rollout))
+        r = max(r, self.config.dataloader.get("validation_rollout", 1))
+
         assert self.config.dataloader.training.end < self.config.dataloader.validation.start, (
             f"Training end date {self.config.dataloader.training.end} is not before"
             f"validation start date {self.config.dataloader.validation.start}"