Bind max steps and lr iterations #67

Open
wants to merge 8 commits into develop
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -37,6 +37,7 @@ Keep it human-readable, your future self will thank you!
### Changed

- Updated configuration examples in documentation and corrected links - [#46](https://github.com/ecmwf/anemoi-training/pull/46)
- Modified training configuration to support max_steps and tied lr iterations to max_steps by default - [#67](https://github.com/ecmwf/anemoi-training/pull/67)

## [0.1.0 - Anemoi training - First release](https://github.com/ecmwf/anemoi-training/releases/tag/0.1.0) - 2024-08-16

7 changes: 5 additions & 2 deletions src/anemoi/training/config/training/default.yaml
@@ -46,10 +46,13 @@ rollout:
# maximum rollout to use
max: 1

max_epochs: 200
# Set max_epochs or max_steps. Training stops at the first limit reached.
max_epochs: null
max_steps: 150000

lr:
Contributor
I think having this functionality is great, so thanks @Rilwan-Adewoyin for implementing it!
Just a quick question: what happens if the user passes both max_steps and max_epochs? Will the code then run until max_epochs is reached while the scheduler uses max_steps?
My two cents: it would probably be nice to add some logger info to indicate this!

Member Author
Hey, thanks!
The default PyTorch Lightning behaviour is that the code will run until the first of max_steps or max_epochs is reached, but the scheduler will be aligned to max_steps.

Yep, that sounds like a good idea: add a logger.info message when the user sets both.
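A rough sketch of what such a message could look like (hypothetical, not part of this PR; the helper name is made up, and the config attributes follow the ones shown in default.yaml):

```python
import logging

LOGGER = logging.getLogger(__name__)


def note_if_both_limits_set(config) -> None:
    """Log a note when both max_epochs and max_steps are configured.

    PyTorch Lightning stops at whichever limit is reached first, while the
    LR scheduler in this PR is tied to max_steps.
    """
    max_epochs = config.training.max_epochs
    max_steps = config.training.max_steps
    if max_epochs is not None and max_steps:
        LOGGER.info(
            "Both max_epochs (%s) and max_steps (%s) are set: training stops at the "
            "first limit reached, but the LR scheduler is aligned to max_steps.",
            max_epochs,
            max_steps,
        )
```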

Member
@Rilwan-Adewoyin did you have a chance to implement the logger?

rate: 0.625e-4 #local_lr
iterations: 300000
iterations: ${training.max_steps} # NOTE: if training stops early at max_epochs, the scheduler is still scheduled over max_steps
min: 3e-7 #Not scaled by #GPU

# Changes in per-gpu batch_size should come with a rescaling of the local_lr
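For readers unfamiliar with the `${...}` syntax above: it is an OmegaConf interpolation, so `lr.iterations` now resolves to whatever `training.max_steps` is set to. A minimal standalone sketch of that resolution (illustrative only, not code from the PR; values mirror default.yaml):

```python
from omegaconf import OmegaConf

# Illustrative config mirroring the relevant part of default.yaml.
config = OmegaConf.create(
    {
        "training": {
            "max_epochs": None,
            "max_steps": 150000,
            "lr": {"iterations": "${training.max_steps}"},
        }
    }
)

# The interpolation resolves against the config root.
assert config.training.lr.iterations == 150000
```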
1 change: 1 addition & 0 deletions src/anemoi/training/train/train.py
@@ -323,6 +323,7 @@ def train(self) -> None:
num_nodes=self.config.hardware.num_nodes,
precision=self.config.training.precision,
max_epochs=self.config.training.max_epochs,
max_steps=self.config.training.max_steps or -1,
logger=self.loggers,
log_every_n_steps=self.config.diagnostics.log.interval,
# run a fixed no of batches per epoch (helpful when debugging)
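As a side note on the `or -1` above: PyTorch Lightning's Trainer uses -1 (not None) to mean "no step limit", so a null max_steps from the config has to be mapped to -1 before the Trainer is built. A standalone sketch of the resulting call (not code from the repository; assumes the standard pytorch_lightning package, and the values are illustrative):

```python
import pytorch_lightning as pl

max_epochs = None    # e.g. training.max_epochs from the config
max_steps = 150000   # e.g. training.max_steps from the config

trainer = pl.Trainer(
    max_epochs=max_epochs,      # None leaves the epoch limit to Lightning's default handling
    max_steps=max_steps or -1,  # None/0 is mapped to -1, i.e. no step limit
)
# Training stops at whichever of the two limits is reached first.
```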