Commit: Fix CI

Aphoh committed Dec 18, 2024
1 parent 3ae3014 commit 3e60320
Showing 4 changed files with 13 additions and 11 deletions.
src/levanter/main/train_asr.py (4 changes: 1 addition & 3 deletions)

@@ -88,10 +88,8 @@ def compute_loss(
         example: AudioTextExample,
         *,
         key=None,
-        reduction: Optional[hax.ReductionFunction] = hax.mean,
-        reduction_axis: Optional[hax.AxisSelection] = None,
     ) -> jax.numpy.ndarray | hax.NamedArray:
-        return m.compute_loss(example, key=key, reduction=reduction, reduction_axis=reduction_axis)
+        return m.compute_loss(example, key=key)

     # Using the trainer as a context manager does 3 things:
     # 1. Sets the device mesh
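This moves reduction out of the training loss hook: `compute_loss` no longer accepts `reduction`/`reduction_axis`, and callers are expected to reduce the per-token loss themselves. A minimal sketch of that reduce-at-the-call-site pattern, with invented axis names and values (not code from this commit):

    import jax.numpy as jnp
    import haliax as hax

    # Stand-ins for the (per-token loss, mask) pair the new API hands back.
    Pos = hax.Axis("position", 4)
    per_token_loss = hax.named(jnp.array([1.0, 2.0, 3.0, 4.0]), (Pos,))
    where = hax.named(jnp.array([True, True, True, False]), (Pos,))

    # Mean over unmasked positions: sum the kept losses, divide by their count.
    mean_loss = hax.sum(per_token_loss, where=where) / hax.sum(where)  # (1+2+3)/3 = 2.0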
src/levanter/main/viz_logprobs.py (2 changes: 1 addition & 1 deletion)

@@ -74,7 +74,7 @@ def main(config: VizGpt2Config):
     def compute_log_probs(model: LmHeadModel, example: LmExample):
         model = inference_mode(model, True)
         model = mp.cast_to_compute(model)
-        logprobs = compute_next_token_loss(model, example, reduction=None)
+        logprobs, where, _ = compute_next_token_loss(model, example)
         # roll forward to get the loss for each predicted token
         logprobs = hax.roll(logprobs, 1, Pos)
         return logprobs.rearrange((EvalBatch, Pos)).array
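`compute_next_token_loss` now returns a `(loss, where, extras)` triple; the visualization keeps only the per-token values and discards the mask and extras. A toy illustration of the roll on the following line, with invented values:

    import jax.numpy as jnp
    import haliax as hax

    Pos = hax.Axis("position", 4)
    loss = hax.named(jnp.array([0.1, 0.2, 0.3, 0.4]), (Pos,))
    # Rolling by one aligns each position with the loss of the token it
    # predicts, matching the "roll forward" comment above.
    rolled = hax.roll(loss, 1, Pos)  # [0.4, 0.1, 0.2, 0.3]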
src/levanter/models/asr_model.py (12 changes: 7 additions & 5 deletions)

@@ -11,6 +11,7 @@

 from levanter.models.attention import AttentionMask
 from levanter.models.lm_model import LmConfig
+from levanter.utils.types import Extras


 class AudioTextExample(eqx.Module):
@@ -97,9 +98,7 @@ def compute_loss(
         example: AudioTextExample,
         *,
         key=None,
-        reduction: Optional[hax.ReductionFunction] = hax.mean,
-        reduction_axis: Optional[hax.AxisSelection] = None,
-    ) -> jnp.ndarray | NamedArray:
+    ) -> tuple[jnp.ndarray | NamedArray, NamedArray, Extras]:
         """
         Computes the cross-entropy loss for predicted ASR tokens. If reduction is not None, the loss is reduced
         across the reduction axis (with reduction_axis=None meaning all axes). If reduction is None, the loss is not
@@ -110,10 +109,13 @@ def compute_loss(
         targets = hax.roll(example.tokens, -1, axis=self.Pos.name)
         target_y = hax.nn.one_hot(targets, self.Vocab, dtype=logits.dtype)
         loss = cross_entropy_loss(
-            logits, self.Vocab, target_y, reduction, reduction_axis=reduction_axis, where=example.loss_mask
+            logits,
+            self.Vocab,
+            target_y,
+            reduction=None,
         )

-        return loss
+        return loss, example.loss_mask, {}

     @property
     def vocab_size(self) -> int:
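Net effect in this file: `cross_entropy_loss` runs with `reduction=None`, so the loss stays per-token, and `compute_loss` returns it together with the example's loss mask and an empty `Extras` dict. A self-contained sketch of that contract, assuming `cross_entropy_loss` here is haliax's `hax.nn.cross_entropy_loss` (shapes and values invented):

    import jax.numpy as jnp
    import haliax as hax

    Pos = hax.Axis("position", 3)
    Vocab = hax.Axis("vocab", 5)

    logits = hax.zeros((Pos, Vocab))
    targets = hax.named(jnp.array([1, 2, 3]), (Pos,))
    target_y = hax.nn.one_hot(targets, Vocab, dtype=logits.dtype)
    loss_mask = hax.named(jnp.array([True, True, False]), (Pos,))

    # Unreduced loss: one value per position.
    loss = hax.nn.cross_entropy_loss(logits, Vocab, target_y, reduction=None)
    result = (loss, loss_mask, {})  # the shape of what compute_loss now returns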
tests/test_text.py (6 changes: 4 additions & 2 deletions)

@@ -40,12 +40,14 @@ def test_lm_example_handles_ignore_id():
     lm_head = hax.zeros((Embed, Vocab))
     lm_head = lm_head.at[Vocab, ignore_id].set(-100)

-    ignored_loss = maybe_fused_next_token_loss(
+    ignored_loss, ignored_where = maybe_fused_next_token_loss(
         Pos, Embed, Vocab, logits, lm_head, tokens, loss_mask=ex_ignore.loss_mask
     )
-    no_ignore_loss = maybe_fused_next_token_loss(
+    ignored_loss = hax.sum(ignored_loss, where=ignored_where)
+    no_ignore_loss, no_ignore_where = maybe_fused_next_token_loss(
         Pos, Embed, Vocab, logits, lm_head, tokens, loss_mask=ex_no_ignore.loss_mask
     )
+    no_ignore_loss = hax.sum(no_ignore_loss, where=no_ignore_where)

     assert no_ignore_loss.item() >= ignored_loss.item() + 100 / Pos.size
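Since the reductions are now explicit `hax.sum(..., where=...)` calls, the assertion boils down to masked summation: excluding the ignored token's large loss must lower the total. A toy check of that behavior, with invented values:

    import jax.numpy as jnp
    import haliax as hax

    Pos = hax.Axis("position", 4)
    loss = hax.named(jnp.array([1.0, 100.0, 1.0, 1.0]), (Pos,))
    keep_all = hax.named(jnp.array([True, True, True, True]), (Pos,))
    drop_spike = hax.named(jnp.array([True, False, True, True]), (Pos,))

    full = hax.sum(loss, where=keep_all)      # 103.0
    masked = hax.sum(loss, where=drop_spike)  # 3.0
    assert full.item() - masked.item() == 100.0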
