Add optional gate activation histogram logging during eval #641
base: main
Changes from all commits
b68dec8
3002442
8e8aef3
f5c5230
2ecaa05
5e9b602
f2ed0b8
56a3aea
1ebec3a
@@ -15,6 +15,7 @@
from levanter.data import Dataset, ReplicatedBatchLoader
from levanter.logging import LoadingTimeTrackerIterator
from levanter.models.lm_model import LmExample, LmHeadModel
from levanter.tracker.histograms import NBINS
from levanter.trainer import StepInfo
from levanter.utils.stat_utils import RunningMean
from levanter.utils.tree_utils import inference_mode
@@ -34,6 +35,7 @@ class EvalResult:
    tag_macro_losses: dict[str, float]  # per tag average-per-token loss
    tag_micro_losses: dict[str, float]  # per tag total loss, for "parent" tags
    total_eval_loading_time: float
    extras: dict[str, float]


class DomainTaggedDataset(Dataset[tuple[T, hax.NamedArray]]):
@@ -123,6 +125,17 @@ def eval_callback(step: StepInfo):
        _join_prefix(prefix, "loading_time"): result.total_eval_loading_time,
        _join_prefix(prefix, "total_time"): time_fn(),
    }
    if (gate_hist := result.extras.get("gate_hist", None)) is not None:
        pos_idx = NBINS // 2 + 1
        log_dict[_join_prefix(prefix, "gate_hist/all")] = np.array(gate_hist.sum(axis=0))
        num_gt0 = gate_hist[:, pos_idx:].sum().item()
        total = gate_hist.sum().item()
        log_dict[_join_prefix(prefix, "gate_gt0/all")] = num_gt0 / total
        for i in range(gate_hist.shape[0]):
            log_dict[_join_prefix(prefix, f"gate_hist/layer{i+1}")] = np.array(gate_hist[i])
            num_gt0 = gate_hist[i, pos_idx:].sum().item()
            total = gate_hist[i].sum().item()
            log_dict[_join_prefix(prefix, f"gate_gt0/layer{i+1}")] = num_gt0 / total

    logger.info(f"{prefix} loss: {result.micro_avg_loss:.3f}")
    for tag, loss in result.tag_macro_losses.items():
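For intuition, a small NumPy sketch of the positive-fraction computation above: `gate_hist` is treated as a `(num_layers, NBINS)` count matrix with bins symmetric around zero, and `NBINS // 2 + 1` indexes the first bin above zero, as in the diff. Sizes and values below are made up for illustration.

```python
import numpy as np

NBINS = 64  # assumed; the real value comes from levanter.tracker.histograms
gate_hist = np.random.randint(0, 100, size=(4, NBINS))  # fake (num_layers, NBINS) counts

pos_idx = NBINS // 2 + 1  # first bin above zero, as in the diff
frac_gt0_all = gate_hist[:, pos_idx:].sum() / gate_hist.sum()           # logged as gate_gt0/all
frac_gt0_per_layer = gate_hist[:, pos_idx:].sum(axis=1) / gate_hist.sum(axis=1)  # gate_gt0/layer{i}
print(frac_gt0_all, frac_gt0_per_layer)
```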
@@ -185,12 +198,12 @@ def __init__(

        @hax.named_jit(out_axis_resources=axis_mapping)
        def accum_for_batch(
            m: LmHeadModel, state: tuple[RunningMean, RunningMean], batch: LmExample, tags: hax.NamedArray
            m: LmHeadModel, state: tuple[RunningMean, RunningMean, dict], batch: LmExample, tags: hax.NamedArray
        ):
            m = inference_mode(m, True)
            with hax.axis_mapping(axis_mapping):
                total_mean, mean_per_tag = state
                losses = m.compute_loss(batch, reduction=None, reduction_axis=())
                total_mean, mean_per_tag, total_extras = state
                losses, extras = m.compute_loss(batch, reduction=None, reduction_axis=())
                mask = batch.loss_mask  # [Batch, Token]
                this_tokens = hax.einsum("->", mask)
                this_loss = hax.einsum("->", losses, mask)  # to scalar
@@ -203,23 +216,32 @@ def accum_for_batch(
                safe_mean = hax.where(this_tokens_per_tag, this_loss_per_tag / this_tokens_per_tag, 0.0)
                mean_per_tag = mean_per_tag.add(safe_mean, this_tokens_per_tag)

                return mean, mean_per_tag
                if extras:
                    for key in extras:
                        curr = total_extras.get(key, jnp.zeros_like(extras[key]))
                        total_extras[key] = extras[key] + curr

Review comment: is summing always going to be the right reduction here?

                return mean, mean_per_tag, total_extras

        self.accum_for_batch = accum_for_batch

    def evaluate(self, m: LmHeadModel):
        total_loss = jnp.zeros(())
        mean_losses_per_tag = hax.zeros(self.dataset.Tag, dtype=np.float32)

        state = (RunningMean.zeros_like(total_loss), RunningMean.zeros_like(mean_losses_per_tag))
        state: tuple[RunningMean, RunningMean, dict] = (
            RunningMean.zeros_like(total_loss),
            RunningMean.zeros_like(mean_losses_per_tag),
            {},
        )
        state = hax.shard(state)

        iterator = LoadingTimeTrackerIterator(self.loader)

        for batch, tags in tqdm.tqdm(iterator, "eval"):
            state = self.accum_for_batch(m, state, batch, tags)

        total_loss, losses_per_tag = state
        total_loss, losses_per_tag, extras = state

        micro_avg_loss = total_loss.mean.item()
        tag_avg_loss = losses_per_tag.mean
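A standalone sketch of the reduction applied to `extras` above (helper name hypothetical): each batch's arrays are summed elementwise into the running totals. This is what the review question about summing refers to; histogram counts add cleanly across batches, but a plain sum would be the wrong reduction for statistics such as running means.

```python
import jax.numpy as jnp

def accumulate_extras(total_extras: dict, extras: dict) -> dict:
    """Fold one batch's extras into the running totals by elementwise sum."""
    out = dict(total_extras)
    for key, value in extras.items():
        out[key] = out.get(key, jnp.zeros_like(value)) + value
    return out
```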
@@ -252,4 +274,4 @@ def evaluate(self, m: LmHeadModel):
            tag_micro_loss[tag] = mean_loss_per_tag_cpu[index]
            # no macro loss for the leaf tags

        return EvalResult(micro_avg_loss, macro_avg_loss, tag_macro_loss, tag_micro_loss, iterator.total_time)
        return EvalResult(micro_avg_loss, macro_avg_loss, tag_macro_loss, tag_micro_loss, iterator.total_time, extras)
@@ -30,6 +30,7 @@
from levanter.models.lm_model import LmConfig, LmHeadModel
from levanter.types import BlockFoldable
from levanter.utils.flop_utils import lm_flops_per_token
from levanter.utils.py_utils import cached_classproperty


silence_transformer_nag()
@@ -75,7 +76,6 @@ class GemmaConfig(HFCompatConfig):
    vocab_size: int = 256_000
    num_layers: int = 18
    num_heads: int = 8
    head_dim: int = 256
    num_kv_heads: int = 1
    attn_dropout = 0.0
    norm_eps = 1e-6
@@ -106,10 +106,14 @@ class GemmaConfig(HFCompatConfig):
    Mlp = property(lambda self: Axis(name="mlp", size=self.intermediate_dim))
    HeadSize = property(lambda self: Axis(name="head_size", size=self.hidden_dim // self.num_heads))

    @property
    def head_dim(self) -> int: return self.hidden_dim // self.num_heads

    def __post_init__(self):
        assert (
            self.num_heads % self.num_kv_heads == 0
        ), f"num_heads={self.num_heads} not divisible by num_kv_heads={self.num_kv_heads}."
        assert (self.head_dim * self.num_heads) == self.hidden_dim, "head_dim * num_heads must equal hidden_dim."

    def hf_checkpoint_converter(self) -> HFCheckpointConverter["GemmaConfig"]:  # type: ignore
        return HFCheckpointConverter(
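To illustrate the config change: `head_dim` is no longer a stored field but is derived from `hidden_dim` and `num_heads`, and `__post_init__` now checks consistency. A minimal stand-in, with names and default sizes assumed (`hidden_dim=2048` with `num_heads=8` reproduces the removed `head_dim=256` default):

```python
from dataclasses import dataclass

@dataclass
class TinyGemmaConfig:  # hypothetical stand-in, not levanter's GemmaConfig
    hidden_dim: int = 2048  # assumed default
    num_heads: int = 8
    num_kv_heads: int = 1

    @property
    def head_dim(self) -> int:
        return self.hidden_dim // self.num_heads

    def __post_init__(self):
        assert self.num_heads % self.num_kv_heads == 0
        assert self.head_dim * self.num_heads == self.hidden_dim

print(TinyGemmaConfig().head_dim)  # 256
```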
@@ -129,7 +133,9 @@ def from_hf_config(cls, hf_config: HfConfig):
        if hf_config.hidden_activation:
            activation_function = hf_config.hidden_activation
        else:
            activation_function = hf_config.hidden_act
            # This is the implementation in huggingface
            # https://github.com/huggingface/transformers/blob/12b1620e615592fbf099d4ec44af7b9f2d1b48aa/src/transformers/models/gemma/modeling_gemma.py#L200
            activation_function = "gelu_pytorch_tanh"

Review comment: i swore we already did this

        if activation_function == "gelu_pytorch_tanh":
            activation_function = "gelu_new"
@@ -168,7 +174,7 @@ def to_hf_config(self, vocab_size: int, config_overrides: Optional[Dict] = None)
            num_hidden_layers=self.num_layers,
            num_attention_heads=self.num_heads,
            num_key_value_heads=self.num_kv_heads,
            head_dim=self.hidden_dim // self.num_heads,
            head_dim=self.head_dim,
            hidden_activation=(
                "gelu_pytorch_tanh" if self.activation_function == "gelu_new" else self.activation_function
            ),
@@ -263,9 +269,9 @@ def __call__(self, x: NamedArray, mask: Optional[NamedArray | AttentionMask], *,
        # MLP and skip connection
        residual = x
        x = self.post_attention_layernorm(x)
        mlp_output = self.mlp(x, key=k_mlp)
        mlp_output, extras = self.mlp(x, key=k_mlp)
        output = residual + mlp_output
        return output
        return output, extras


class GemmaTransformer(StateDictSerializationMixin, eqx.Module):
@@ -292,10 +298,10 @@ def init(config: GemmaConfig, *, key) -> "GemmaTransformer":
    @named_call
    def __call__(self, x: NamedArray, attn_mask: Optional[NamedArray | AttentionMask], *, key) -> NamedArray:
        keys = maybe_rng_split(key, self.config.num_layers) if key is not None else None
        x = self.layers.fold(x, mask=attn_mask, key=keys)
        x, extras = self.layers.scan(x, mask=attn_mask, key=keys)
        x = self.norm(x)

        return x
        return x, extras

    def from_state_dict(self, state_dict: StateDict, prefix: Optional[str] = None):
        if isinstance(self.layers, Stacked):
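Why `fold` becomes `scan` here: `fold` threads only the carry (the hidden states) through the stacked layers, while `scan` also stacks each layer's auxiliary output, which is how the per-layer histograms reach the caller. A generic `jax.lax.scan` analogy, not haliax's `Stacked` API:

```python
import jax
import jax.numpy as jnp

def layer(carry, layer_scale):
    """Stand-in for one decoder layer: transforms the carry and emits a per-layer stat."""
    new_carry = carry * layer_scale
    per_layer_hist = jnp.histogram(new_carry, bins=jnp.linspace(0.0, 8.0, 5))[0]
    return new_carry, per_layer_hist  # (carry, stacked per-layer output)

x0 = jnp.ones(8)
scales = jnp.arange(1.0, 4.0)  # three "layers"
final_x, stacked_hists = jax.lax.scan(layer, x0, scales)
print(stacked_hists.shape)  # (3, 4): one histogram per layer, analogous to the stacked gate_hist
```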
@@ -358,9 +364,9 @@ def __call__(
        The attn_mask from training pipeline may be an AttentionMask object instead of NamedArray
        """
        x = self.embeddings.embed(input_ids)
        x = self.transformer(x, attn_mask=attn_mask, key=key)
        x, extras = self.transformer(x, attn_mask=attn_mask, key=key)
        lm_logits = self.embeddings.unembed(x)
        return lm_logits
        return lm_logits, extras

    def resize_vocab(self, new_size: int, key=None) -> "LmHeadModel[GemmaConfig]":
        new_embeddings = self.embeddings.resize_embeddings(new_size, key=key)
@@ -30,6 +30,8 @@
from levanter.models.lm_model import LmConfig, LmHeadModel
from levanter.types import BlockFoldable
from levanter.utils.flop_utils import lm_flops_per_token
from levanter.utils.py_utils import cached_classproperty
from levanter.tracker.histograms import get_bins, sharded_histogram


silence_transformer_nag()
@@ -78,6 +80,7 @@ class LlamaConfig(HFCompatConfig):
    use_bias: bool = False
    use_layer_norm_weight: bool = True
    rope_scaling: Optional[dict] = None
    measure_act_stats: bool = True

    reference_checkpoint: str = "meta-llama/Llama-2-7b-hf"
    tokenizer: Optional[str] = None
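Since `measure_act_stats` is an ordinary config field that defaults to on in this PR, it can be toggled when constructing the config; a hypothetical usage sketch (import path assumed):

```python
from levanter.models.llama import LlamaConfig  # import path assumed

# Disable gate-activation histogram collection for this run.
config = LlamaConfig(measure_act_stats=False)
```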
@@ -181,10 +184,17 @@ class LlamaMlp(eqx.Module, StateDictSerializationMixin):
    up_proj: hnn.Linear  # projection from Embed to Mlp
    down_proj: hnn.Linear  # projection from Mlp to Embed
    act: Callable = eqx.static_field()
    measure_act_stats: bool = False

    @staticmethod
    def init(
        Embed: Axis, Mlp: Axis, activation_fn: Union[str, Callable], *, key, use_bias: bool = False
        Embed: Axis,
        Mlp: Axis,
        activation_fn: Union[str, Callable],
        *,
        key,
        use_bias: bool = False,
        measure_act_stats=False,
    ) -> "LlamaMlp":
        k_fc, k_up_proj, k_down_proj = jrandom.split(key, 3)
        gate_proj = hnn.Linear.init(Out=Mlp, In=Embed, key=k_fc, use_bias=use_bias, out_first=True)
@@ -193,16 +203,20 @@ def init(
        if isinstance(activation_fn, str):
            activation_fn = ACT2FN[activation_fn]
        act = activation_fn  # type: ignore
        return LlamaMlp(gate_proj, up_proj, down_proj, act)
        get_bins()  # initialize bins

Review comment: rm?

        return LlamaMlp(gate_proj, up_proj, down_proj, act, measure_act_stats)

    @named_call
    def __call__(self, x: NamedArray, *, key=None) -> NamedArray:
        k_gate, k_up, k_down = maybe_rng_split(key, 3)
        hidden_states = self.gate_proj(x, key=k_gate)
        extras = {}
        if self.measure_act_stats:
            extras["gate_hist"] = sharded_histogram(hidden_states.array, bins=get_bins())
        hidden_states = self.act(hidden_states)
        hidden_states = hidden_states * self.up_proj(x, key=k_up)
        outputs = self.down_proj(hidden_states, key=k_down)
        return outputs
        return outputs, extras

    def from_state_dict(self, state_dict: StateDict, prefix: Optional[str] = None):
        # unflatten the linear layers of HF state_dict to match the shape of LlamaMlp
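For orientation, a stand-in for what `extras["gate_hist"]` holds per layer: a fixed-bin histogram of the gate pre-activations. The sketch below uses `jnp.histogram` with assumed bin edges; the PR's `sharded_histogram` and `get_bins` helpers handle the actual bin layout and sharding.

```python
import jax.numpy as jnp

def gate_histogram(gate_acts: jnp.ndarray, nbins: int = 64) -> jnp.ndarray:
    """Stand-in for sharded_histogram(...): count gate activations into fixed bins."""
    edges = jnp.linspace(-10.0, 10.0, nbins + 1)  # assumed layout, symmetric around zero
    counts, _ = jnp.histogram(gate_acts.reshape(-1), bins=edges)
    return counts  # (nbins,) counts over every element in the batch
```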
@@ -402,6 +416,7 @@ def init(config: LlamaConfig, *, key) -> "LlamaDecoderLayer":
            config.activation_function,
            key=k_mlp,
            use_bias=config.use_bias,
            measure_act_stats=config.measure_act_stats,
        )
        ln_1 = config.mk_LayerNorm(config.Embed)
        ln_2 = config.mk_LayerNorm(config.Embed)
@@ -420,9 +435,9 @@ def __call__(self, x: NamedArray, mask: Optional[NamedArray | AttentionMask], *,
        # MLP and skip connection
        residual = x
        x = self.post_attention_layernorm(x)
        mlp_output = self.mlp(x, key=k_mlp)
        mlp_output, extras = self.mlp(x, key=k_mlp)
        output = residual + mlp_output
        return output
        return output, extras


class LlamaTransformer(StateDictSerializationMixin, eqx.Module):
@@ -449,10 +464,10 @@ def init(config: LlamaConfig, *, key) -> "LlamaTransformer":
    @named_call
    def __call__(self, x: NamedArray, attn_mask: Optional[NamedArray | AttentionMask], *, key) -> NamedArray:
        keys = maybe_rng_split(key, self.config.num_layers) if key is not None else None
        x = self.layers.fold(x, mask=attn_mask, key=keys)
        x, extras = self.layers.scan(x, mask=attn_mask, key=keys)
        x = self.norm(x)

        return x
        return x, extras

    def from_state_dict(self, state_dict: StateDict, prefix: Optional[str] = None):
        if isinstance(self.layers, Stacked):
@@ -544,9 +559,9 @@ def __call__(
        """
        k_t, k_head = maybe_rng_split(key, 2)
        x = self.embeddings.embed(input_ids)
        x = self.transformer(x, attn_mask=attn_mask, key=k_t)
        x, extras = self.transformer(x, attn_mask=attn_mask, key=k_t)
        lm_logits = self.lm_head(x, key=k_head)
        return lm_logits
        return lm_logits, extras

    def resize_vocab(self, new_size: int, key=None) -> "LmHeadModel[LlamaConfig]":
        new_Vocab = self.Vocab.resize(new_size)
Review comment: so i think i'm gonna have a strong preference for