stanford-crfm · Helw150 · Oct 30, 2024 · Oct 30, 2024 · Oct 30, 2024 · Oct 30, 2024
diff --git a/config/diva_flash.yaml b/config/diva_flash.yaml
@@ -0,0 +1,38 @@
+data:
+  cache_dir: "gs://diva-flash/processed/llama"
+  tokenizer: "meta-llama/Llama-3.2-1B-Instruct"
+  processor: "openai/whisper-large-v3"
+  configs:
+    cv:
+      id: mozilla-foundation/common_voice_17_0
+      cache_dir: "gs://diva-flash/processed/llama/cv"
+      name: "en"
+      text_key: "sentence"
+      train_split: "train"
+      validation_split: "validation"
+  train_weights:
+    cv: 1.0  # `cv` is the only dataset defined under `configs`
+model:
+  type: diva
+  reference_encoder: "openai/whisper-large-v3"  # same checkpoint as data.processor
+  reference_decoder: "meta-llama/Llama-3.2-1B-Instruct"  # same checkpoint as data.tokenizer
+use_hf_model_config: true
+trainer:
+  steps_per_eval: 500
+  mp: p=f32,c=bf16
+  model_axis_size: 1
+  per_device_parallelism: -1
+  train_batch_size: 512
+  num_train_steps: 4300
+  checkpointer:
+    base_path: gs://diva-flash/cv-checkpoints
+    save_interval: 60m
+optimizer:
+  # learning_rate: 5E-5  (disabled; the active value is on the next line)
+  learning_rate: 5e-4
+  weight_decay: 0.1
+  weight_decay_modules: None  # NOTE(review): YAML reads bare `None` as the string "None", not null - confirm intent
+  default_weight_decay_mask: False
+  warmup: 0.01
+hf_save_path: gs://diva-flash/librispeech-hf-checkpoints  # NOTE(review): says librispeech, data is Common Voice - confirm
+diva_training: true  # TrainASRConfig.diva_training; defaults to False
diff --git a/src/levanter/data/audio.py b/src/levanter/data/audio.py
@@ -81,6 +81,10 @@ def __init__(
         padding=True,
     ):
         self.feature_extractor: SequenceFeatureExtractor = processor.feature_extractor
+        if tokenizer.pad_token_id is None:  # tokenizer ships without a pad token
+            override_token = list(tokenizer.added_tokens_decoder.items())[-1]  # last (id, token) pair
+            tokenizer.pad_token_id = override_token[0]
+            tokenizer.pad_token = str(override_token[1])  # was `pad_tokn`, which only set a stray attribute
         self.bt = BatchTokenizer(
             tokenizer,
             enforce_bos=enforce_bos,
@@ -272,6 +276,7 @@ class ProcessedAudioCache(AsyncDataset[AudioTextDict]):
     def __init__(self, cache: TreeCache[AudioTextDict]):
         super().__init__()
         self.cache = cache
+        self._cached_len: Optional[int] = None
 
     async def async_len(self) -> int:
         return await self.cache.async_len()
@@ -285,6 +290,15 @@ def is_finite(self) -> bool:
     async def current_len(self) -> Optional[int]:
         return await self.cache.current_len()
 
+    async def wait_until_len_at_least(self, length: int) -> int:
+        """Like the parent method, but answer from the cached length when that is already large enough."""
+        known = self._cached_len
+        if known is None or known < length:
+            # computing the length is very slow, so remember the latest answer
+            known = await super().wait_until_len_at_least(length)
+            self._cached_len = known
+        return known
+
     async def get_batch(self, indices: Sequence[int]) -> Sequence[AudioTextDict]:
         return await self.cache.get_batch(indices)
 

diff --git a/src/levanter/main/train_asr.py b/src/levanter/main/train_asr.py
@@ -16,6 +16,7 @@
 from levanter.compat.hf_checkpoints import HFCompatConfig, ModelWithHfSerializationMixin, save_hf_checkpoint_callback
 from levanter.data.audio import AudioIODatasetConfig, AudioMixtureDatasetConfig, AudioTextDataset
 from levanter.models.asr_model import ASRConfig, AudioTextExample
+from levanter.models.diva import DivaASRModel, diva_connector_only
 from levanter.models.whisper import WhisperConfig
 from levanter.optim import AdamConfig, OptimizerConfig
 from levanter.trainer import Trainer, TrainerConfig
@@ -45,6 +46,7 @@ class TrainASRConfig:
     hf_save_path: Optional[str] = None
     hf_upload: Optional[str] = None
     hf_save_steps: int = 10000
+    diva_training: bool = False
 
 
 def main(config: TrainASRConfig):
@@ -122,7 +124,7 @@ def compute_loss(
         train_dataset = AudioTextDataset(
             config.data.train_set(key=data_key),
             Pos,
-            [config.model.Mels, config.model.MelPos],
+            config.model.AudioPos,
             KeyPos,
             ignore_index=config.data.pad_token_id,
         )
@@ -136,8 +138,16 @@ def compute_loss(
         if vocab_size != Vocab.size:
             logger.info(f"Rounding vocab size from {vocab_size} to {Vocab.size} for partitioning")
 
-        state = trainer.initial_state(training_key, model_init=lambda: config.model.build_asr(Vocab, key=model_key))
+        state = trainer.initial_state(
+            training_key,
+            model_init=lambda: config.model.build_asr(Vocab, key=model_key),
+        )
 
+        if config.diva_training and config.model.asr_model_type == DivaASRModel:
+            state = dataclasses.replace(state, model=None)  # NOTE(review): also runs when step != 0; confirm
+            model = DivaASRModel.init(Vocab, config.model, key=model_key, init_from_submodels=True)
+            model = named_jit(trainer.mp.cast_to_param, parameter_axis_mapping)(model)
+            state = dataclasses.replace(state, model=model, is_trainable=diva_connector_only(model))
         if int(state.step) == 0:
             # TODO: I don't love that we init the model twice, but it's not a big deal i think?
             if config.initialize_from_hf:
@@ -164,7 +174,7 @@ def compute_loss(
             hax_eval_dataset = AudioTextDataset(
                 eval_dataset,
                 Pos,
-                [config.model.Mels, config.model.MelPos],
+                config.model.AudioPos,
                 KeyPos,
                 ignore_index=config.data.pad_token_id,
             )